diff --git a/kokkos/basic/Box.hpp b/kokkos/basic/Box.hpp deleted file mode 100644 index 62046e4..0000000 --- a/kokkos/basic/Box.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _Box_hpp_ -#define _Box_hpp_ - -/** - * a 'Box' is 3 pairs of ints, where each pair specifies a lower - * and upper bound for one of the 3 spatial dimensions. - * - * This struct stores the 3 pairs as a simple array of 6 ints, - * but defines the bracket operator so that it can be referenced - * using 2-dimensional array notation like this: - * int xmin = box[0][0]; int xmax = box[0][1]; - * int ymin = box[1][0]; int ymax = box[1][1]; - * int zmin = box[2][0]; int zmax = box[2][1]; - */ -struct Box { - int ranges[6]; - int* operator[](int xyz) { return &ranges[xyz*2]; } - const int* operator[](int xyz) const { return &ranges[xyz*2]; } -}; - -#endif - diff --git a/kokkos/basic/BoxIterator.hpp b/kokkos/basic/BoxIterator.hpp deleted file mode 100644 index f644119..0000000 --- a/kokkos/basic/BoxIterator.hpp +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef _BoxTraverser_hpp_ -#define _BoxTraverser_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -namespace miniFE { - -/** Class for traversing a 3-dimensional 'box' of indices. 
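//A minimal sketch, with hypothetical bounds, of how the Box bracket operator
//above maps 2-D indexing onto the flat ranges[6] array:
//
//  Box box = { 0,4, 0,2, 0,8 };        // x in [0,4), y in [0,2), z in [0,8)
//  int nx = box[0][1] - box[0][0];     // 4 -- box[0] points at &box.ranges[0]
//  int ny = box[1][1] - box[1][0];     // 2 -- box[1] points at &box.ranges[2]
//  int nz = box[2][1] - box[2][0];     // 8 -- box[2] points at &box.ranges[4]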
- - //One way to traverse a 'box[3][2]' is to use a triply-nested for-loop: - for(int z=box[2][0]; z= box_[0][1]) { - x = box_[0][0]; - ++y; - if (y >= box_[1][1]) { - y = box_[1][0]; - ++z; - if (z >= box_[2][1]) { - z = box_[2][1]; - y = box_[1][1]; - x = box_[0][1]; - } - } - } - return *this; - } - - BoxIterator operator++(int) - { - BoxIterator temp = *this; - ++(*this); - return temp; - } - - bool operator==(const BoxIterator& rhs) const - { - return x == rhs.x && y == rhs.y && z == rhs.z; - } - - bool operator!=(const BoxIterator& rhs) const - { - return !(this->operator==(rhs)); - } - - int x; - int y; - int z; - -private: - BoxIterator(const Box& box, bool at_end = false) - : x(box[0][0]), - y(box[1][0]), - z(box[2][0]), - box_() - { - box_[0][0] = box[0][0]; box_[0][1] = box[0][1]; - box_[1][0] = box[1][0]; box_[1][1] = box[1][1]; - box_[2][0] = box[2][0]; box_[2][1] = box[2][1]; - if (at_end) { - x = box[0][1]; - y = box[1][1]; - z = box[2][1]; - } - } - - Box box_; -};//class BoxTraverser - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/BoxPartition.cpp b/kokkos/basic/BoxPartition.cpp deleted file mode 100644 index 2a4e5a7..0000000 --- a/kokkos/basic/BoxPartition.cpp +++ /dev/null @@ -1,477 +0,0 @@ - -#include -#include - -#include -#include - -/*--------------------------------------------------------------------*/ - -static int box_map_local_entry( const Box& box , - const int ghost , - int local_x , - int local_y , - int local_z ) -{ - const int nx = 2 * ghost + box[0][1] - box[0][0] ; - const int ny = 2 * ghost + box[1][1] - box[1][0] ; - const int nz = 2 * ghost + box[2][1] - box[2][0] ; - int result = -1 ; - - local_x += ghost ; - local_y += ghost ; - local_z += ghost ; - - if ( 0 <= local_x && local_x < nx && - 0 <= local_y && local_y < ny && - 0 <= local_z && local_z < nz ) { - - result = local_z * ny * nx + local_y * nx + local_x ; - } - return result ; -} - -int box_map_local( const Box& box_local, - const int ghost , - const int box_local_map[] , - const int local_x , - const int local_y , - const int local_z ) -{ - int result = box_map_local_entry(box_local,ghost,local_x,local_y,local_z); - - if ( 0 <= result ) { - result = box_local_map[ result ]; - } - - return result ; -} - -/*--------------------------------------------------------------------*/ -/* Recursively split a box into into (up-ip) sub-boxes */ - -void box_partition( int ip , int up , int axis , - const Box& box, - Box* p_box ) -{ - const int np = up - ip ; - if ( 1 == np ) { - p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ; - p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ; - p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ; - } - else { - const int n = box[ axis ][1] - box[ axis ][0] ; - const int np_low = np / 2 ; /* Rounded down */ - const int np_upp = np - np_low ; - - const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np))); - const int n_low = n - n_upp ; - const int next_axis = ( axis + 2 ) % 3 ; - - if ( np_low ) { /* P = [ip,ip+np_low) */ - Box dbox ; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ; - - dbox[ axis ][1] = dbox[ axis ][0] + n_low ; - - box_partition( ip, ip + np_low, next_axis, dbox, p_box ); - } - - if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */ - Box dbox; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = 
box[2][1] ; - - ip += np_low ; - dbox[ axis ][0] += n_low ; - dbox[ axis ][1] = dbox[ axis ][0] + n_upp ; - - box_partition( ip, ip + np_upp, next_axis, dbox, p_box ); - } - } -} - -/*--------------------------------------------------------------------*/ - -static int box_disjoint( const Box& a , const Box& b) -{ - return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] || - a[1][1] <= b[1][0] || b[1][1] <= a[1][0] || - a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ; -} - -static void resize_int( int ** a , int * allocLen , int newLen ) -{ - int k = 32; - while ( k < newLen ) { k <<= 1 ; } - if ( NULL == *a ) - { *a = (int*)malloc( sizeof(int)*(*allocLen = k) ); } - else if ( *allocLen < k ) - { *a = (int*)realloc(*a , sizeof(int)*(*allocLen = k)); } -} - -static void box_partition_maps( - const int np , - const int my_p , - const Box* pbox, - const int ghost , - int ** map_local_id , - int ** map_recv_pc , - int ** map_send_pc , - int ** map_send_id ) -{ - const Box& my_box = pbox[my_p] ; - - const int my_ix = my_box[0][0] ; - const int my_iy = my_box[1][0] ; - const int my_iz = my_box[2][0] ; - const int my_nx = my_box[0][1] - my_box[0][0] ; - const int my_ny = my_box[1][1] - my_box[1][0] ; - const int my_nz = my_box[2][1] - my_box[2][0] ; - - const int my_use_nx = 2 * ghost + my_nx ; - const int my_use_ny = 2 * ghost + my_ny ; - const int my_use_nz = 2 * ghost + my_nz ; - - const int id_length = my_use_nx * my_use_ny * my_use_nz ; - - int * local_id = (int *) malloc( id_length * sizeof(int) ); - int * recv_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - int * send_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - - int * send_id = NULL ; - int send_id_size = 0 ; - - int iLocal , iSend ; - int i ; - - Box my_use_box; - - my_use_box[0][0] = my_box[0][0] - ghost ; - my_use_box[0][1] = my_box[0][1] + ghost ; - my_use_box[1][0] = my_box[1][0] - ghost ; - my_use_box[1][1] = my_box[1][1] + ghost ; - my_use_box[2][0] = my_box[2][0] - ghost ; - my_use_box[2][1] = my_box[2][1] + ghost ; - - for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; } - - iSend = 0 ; - iLocal = 0 ; - - /* The vector space is partitioned by processors */ - - for ( i = 0 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - recv_pc[i] = iLocal ; - send_pc[i] = iSend ; - - if ( ! 
box_disjoint( my_use_box , pbox[ip] ) ) { - const int p_ix = pbox[ip][0][0] ; - const int p_iy = pbox[ip][1][0] ; - const int p_iz = pbox[ip][2][0] ; - const int p_ex = pbox[ip][0][1] ; - const int p_ey = pbox[ip][1][1] ; - const int p_ez = pbox[ip][2][1] ; - - int local_x , local_y , local_z ; - - /* Run the span of global cells that my processor uses */ - - for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) { - for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) { - for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) { - - const int global_z = local_z + my_iz ; - const int global_y = local_y + my_iy ; - const int global_x = local_x + my_ix ; - - const int entry = - box_map_local_entry(my_box,ghost,local_x,local_y,local_z); - - if ( entry < 0 ) { abort(); } - - if ( p_iz <= global_z && global_z < p_ez && - p_iy <= global_y && global_y < p_ey && - p_ix <= global_x && global_x < p_ex ) { - - /* This ordinal is owned by processor 'ip' */ - - local_id[ entry ] = iLocal++ ; - -#if defined(DEBUG_PRINT) -if ( my_p != ip ) { - fprintf(stdout," (%d,%d,%d) : P%d recv at local %d from P%d\n", - global_x,global_y,global_z,my_p,local_id[entry],ip); - fflush(stdout); -} -#endif - } - - /* If in my ownership and used by the other processor */ - if ( my_p != ip && - /* In my ownership: */ - ( 0 <= local_z && local_z < my_nz && - 0 <= local_y && local_y < my_ny && - 0 <= local_x && local_x < my_nx ) && - /* In other processors usage: */ - ( p_iz - ghost <= global_z && global_z < p_ez + ghost && - p_iy - ghost <= global_y && global_y < p_ey + ghost && - p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) { - - resize_int( & send_id , & send_id_size , (iSend + 1) ); - send_id[ iSend ] = local_id[ entry ] ; - ++iSend ; - -#if defined(DEBUG_PRINT) -{ - fprintf(stdout," (%d,%d,%d) : P%d send at local %d to P%d\n", - global_x,global_y,global_z,my_p,local_id[entry],ip); - fflush(stdout); -} -#endif - } - } - } - } - } - } - recv_pc[np] = iLocal ; - send_pc[np] = iSend ; - - *map_local_id = local_id ; - *map_recv_pc = recv_pc ; - *map_send_pc = send_pc ; - *map_send_id = send_id ; -} - -void box_partition_rcb( const int np , - const int my_p , - const Box& root_box, - const int ghost , - Box** pbox, - int ** map_local_id , - int ** map_recv_pc , - int ** map_send_pc , - int ** map_send_id ) -{ - *pbox = new Box[ np ]; - - box_partition( 0 , np , 2 , root_box , *pbox ); - - box_partition_maps( np , my_p , *pbox , ghost , - map_local_id , map_recv_pc , - map_send_pc , map_send_id ); -} - -/*--------------------------------------------------------------------*/ - -#ifdef UNIT_TEST - -static int box_contain( const Box& a , const Box& b ) -{ - return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] && - a[1][0] <= b[1][0] && b[1][1] <= a[1][1] && - a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ; -} - -static void box_print( FILE * fp , const Box& a ) -{ - fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }", - a[0][0] , a[0][1] , - a[1][0] , a[1][1] , - a[2][0] , a[2][1] ); -} - -static void test_box( const Box& box , const int np ) -{ - const int ncell_box = box[0][1] * box[1][1] * box[2][1] ; - int ncell_total = 0 ; - int ncell_min = ncell_box ; - int ncell_max = 0 ; - std::vector pbox(np); - int i , j ; - - box_partition( 0 , np , 2 , box , &pbox[0] ); - - for ( i = 0 ; i < np ; ++i ) { - const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) * - ( pbox[i][1][1] - pbox[i][1][0] ) * - ( pbox[i][2][1] - pbox[i][2][0] ); - - if ( ! 
box_contain( box , pbox[i] ) ) { - fprintf(stdout," OUT OF BOUNDS pbox[%d/%d] = ",i,np); - box_print(stdout,pbox[i]); - fprintf(stdout,"\n"); - abort(); - } - - for ( j = i + 1 ; j < np ; ++j ) { - if ( ! box_disjoint( pbox[i] , pbox[j] ) ) { - fprintf(stdout," NOT DISJOINT pbox[%d/%d] = ",i,np); - box_print(stdout, pbox[i]); - fprintf(stdout,"\n"); - fprintf(stdout," pbox[%d/%d] = ",j,np); - box_print(stdout, pbox[j]); - fprintf(stdout,"\n"); - abort(); - } - } - ncell_total += ncell ; - - if ( ncell_max < ncell ) { ncell_max = ncell ; } - if ( ncell < ncell_min ) { ncell_min = ncell ; } - } - - if ( ncell_total != ncell_box ) { - fprintf(stdout," WRONG CELL COUNT NP = %d\n",np); - abort(); - } - fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n", - np,ncell_box,ncell_box/np,ncell_min,ncell_max); -} - -/*--------------------------------------------------------------------*/ - -static void test_maps( const Box& root_box , const int np ) -{ - const int ghost = 1 ; - const int nx_global = root_box[0][1] - root_box[0][0] ; - const int ny_global = root_box[1][1] - root_box[1][0] ; - int ieq , i , j ; - std::vector pbox(np); - int **local_values ; - int **map_local_id ; - int **map_recv_pc ; - int **map_send_pc ; - int **map_send_id ; - - box_partition( 0 , np , 2 , root_box , &pbox[0] ); - - local_values = (int **) malloc( sizeof(int*) * np ); - map_local_id = (int **) malloc( sizeof(int*) * np ); - map_recv_pc = (int **) malloc( sizeof(int*) * np ); - map_send_pc = (int **) malloc( sizeof(int*) * np ); - map_send_id = (int **) malloc( sizeof(int*) * np ); - - /* Set each local value to the global equation number */ - - for ( ieq = i = 0 ; i < np ; ++i ) { - const Box& mybox = pbox[i] ; - const int nx = mybox[0][1] - mybox[0][0] ; - const int ny = mybox[1][1] - mybox[1][0] ; - const int nz = mybox[2][1] - mybox[2][0] ; - int ix , iy , iz ; - - /* Generate the partition maps for this rank */ - box_partition_maps( np , i , &pbox[0] , ghost , - & map_local_id[i] , & map_recv_pc[i] , - & map_send_pc[i] , & map_send_id[i] ); - - local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] ); - - for ( iz = -ghost ; iz < nz + ghost ; ++iz ) { - for ( iy = -ghost ; iy < ny + ghost ; ++iy ) { - for ( ix = -ghost ; ix < nx + ghost ; ++ix ) { - const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz); - - if ( 0 <= ieq ) { - const int ix_global = ix + mybox[0][0] ; - const int iy_global = iy + mybox[1][0] ; - const int iz_global = iz + mybox[2][0] ; - - if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] && - root_box[1][0] <= iy_global && iy_global < root_box[1][1] && - root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) { - - local_values[i][ ieq ] = ix_global + - iy_global * nx_global + - iz_global * nx_global * ny_global ; - } - else { - local_values[i][ ieq ] = -1 ; - } - } - } - } - } - } - - /* Pair-wise compare the local values */ - /* i == receiving processor rank */ - /* ip == sending processor rank */ - /* j == receiving processor data entry for message from 'ip' */ - /* jp == sending processor data entry for message to 'i' */ - - for ( i = 0 ; i < np ; ++i ) { - for ( j = 1 ; j < np ; ++j ) { - const int ip = ( i + j ) % np ; - const int jp = ( i + np - ip ) % np ; - const int nrecv = map_recv_pc[i] [j+1] - map_recv_pc[i] [j] ; - const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ; - int k ; - if ( nrecv != nsend ) { - fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip); - fprintf(stderr,"P%d send %d to P%d\n",ip,nsend,i); - 
abort(); - } - for ( k = 0 ; k < nrecv ; ++k ) { - const int irecv = map_recv_pc[i][j] + k ; - const int isend = map_send_pc[ip][jp] + k ; - const int val_irecv = local_values[i][irecv] ; - const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ; - if ( val_irecv != val_isend ) { - fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip); - fprintf(stderr,"P%d send[%d] = %d , to P%d\n",ip,k,val_isend,i); - abort(); - } - } - } - } - - for ( i = 0 ; i < np ; ++i ) { - free( map_local_id[i] ); - free( map_recv_pc[i] ); - free( map_send_pc[i] ); - free( map_send_id[i] ); - free( local_values[i] ); - } - free( map_send_id ); - free( map_send_pc ); - free( map_recv_pc ); - free( map_local_id ); - free( local_values ); -} - -/*--------------------------------------------------------------------*/ - -int main( int argc , char * argv[] ) -{ - int np_max = 256 ; - Box box = { 0 , 64 , 0 , 64 , 0 , 64 }; - int np = 0 ; - - switch( argc ) { - case 3: - sscanf(argv[1],"%d",&np); - sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ); - if ( 0 < np ) { test_box( box , np ); } - if ( 0 < np ) { test_maps( box , np ); } - break ; - default: - for ( np = 1 ; np <= np_max ; ++np ) { - test_box( box , np ); - test_maps( box , np ); - } - break ; - } - return 0 ; -} - -#endif - - diff --git a/kokkos/basic/BoxPartition.hpp b/kokkos/basic/BoxPartition.hpp deleted file mode 100644 index 4359a16..0000000 --- a/kokkos/basic/BoxPartition.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef _BoxPartition_hpp_ -#define _BoxPartition_hpp_ - -#include - -/** \brief Recursively split a box into (up-ip) sub-boxes - */ -void box_partition( int ip , int up , int axis , - const Box& box , - Box* p_box ); - -/** \brief Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box. - * - * Use recursive coordinate bisection to partition a box - * into np disjoint sub-boxes. Allocate (via malloc) and - * populate the sub-boxes, mapping the local (x,y,z) to - * a local ordinal, and mappings for the send-recv messages - * to update the ghost cells. 
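 *
 * A small worked example of the recursive bisection done by box_partition
 * (declared above), assuming a 4x4x4 root box, np = 3, starting on the z axis:
 *
 *   Box root = { 0,4, 0,4, 0,4 };
 *   Box sub[3];
 *   box_partition( 0, 3, 2, root, sub );
 *   // sub[0]: x [0,4), y [0,4), z [0,2)   -- 1 rank gets n_low = 2 z-planes
 *   // sub[1]: x [0,4), y [0,2), z [2,4)   -- remaining 2 ranks split along y
 *   // sub[2]: x [0,4), y [2,4), z [2,4)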
- * - * usage: - * - * my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ; - * my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ; - * my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ; - * - * for ( x = -ghost ; x < my_nx + ghost ; ++x ) { - * for ( y = -ghost ; y < my_ny + ghost ; ++y ) { - * for ( z = -ghost ; z < my_nz + ghost ; ++z ) { - * const int x_global = x + pbox[my_p][0][0] ; - * const int y_global = y + pbox[my_p][1][0] ; - * const int z_global = z + pbox[my_p][2][0] ; - * - * const int local_ordinal = - * box_map_local( pbox[my_p], ghost, map_local_id, x, y, z ); - * - * if ( 0 <= local_ordinal ) { - * } - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int recv_processor = ( my_p + i ) % np ; - * const int recv_ordinal_begin = map_recv_pc[i]; - * const int recv_ordinal_end = map_recv_pc[i+1]; - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int send_processor = ( my_p + i ) % np ; - * const int send_map_begin = map_send_pc[i]; - * const int send_map_end = map_send_pc[i+1]; - * for ( j = send_map_begin ; j < send_map_end ; ++j ) { - * send_ordinal = map_send_id[j] ; - * } - * } - */ -void box_partition_rcb( - const int np /**< [in] Number of partitions */ , - const int my_p /**< [in] My partition rank */ , - const Box& root_box /**< [in] 3D Box to partition */ , - const int ghost /**< [in] Ghost cell boundary */ , - Box* pbox /**< [out] Partition's 3D boxes */ , - int ** map_local_id /**< [out] Map local cells */ , - int ** map_recv_pc /**< [out] Receive spans per processor */ , - int ** map_send_pc /**< [out] Send prefix counts per processor */ , - int ** map_send_id /**< [out] Send message ordinals */ ); - -/* \brief Map a local (x,y,z) to a local ordinal. - */ -int box_map_local( const Box& box_local , - const int ghost , - const int map_local_id[] , - const int local_x , - const int local_y , - const int local_z ); - -#endif - diff --git a/kokkos/basic/CSRMatrix.hpp b/kokkos/basic/CSRMatrix.hpp deleted file mode 100644 index 9cfeaee..0000000 --- a/kokkos/basic/CSRMatrix.hpp +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef _CSRMatrix_hpp_ -#define _CSRMatrix_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -struct -CSRMatrix { - CSRMatrix(ComputeNode& comp_node) - : has_local_indices(false), - rows(), row_offsets(), row_offsets_external(), - packed_cols(), packed_coefs(), - num_cols(0), - compute_node(comp_node) -#ifdef HAVE_MPI - ,external_index(), external_local_index(), elements_to_send(), - neighbors(), recv_length(), send_length(), send_buffer(), request() -#endif - { - } - - ~CSRMatrix() - {} - - typedef Scalar ScalarType; - typedef LocalOrdinal LocalOrdinalType; - typedef GlobalOrdinal GlobalOrdinalType; - typedef ComputeNode ComputeNodeType; - - bool has_local_indices; - std::vector rows; - std::vector row_offsets; - std::vector row_offsets_external; - std::vector packed_cols; - std::vector packed_coefs; - LocalOrdinal num_cols; - ComputeNode& compute_node; - -#ifdef HAVE_MPI - std::vector external_index; - std::vector external_local_index; - std::vector elements_to_send; - std::vector neighbors; - std::vector recv_length; - std::vector send_length; - std::vector send_buffer; - std::vector request; -#endif - - size_t num_nonzeros() const - { - return row_offsets[row_offsets.size()-1]; - } - - void reserve_space(unsigned nrows, unsigned ncols_per_row) - { - rows.resize(nrows); - row_offsets.resize(nrows+1); - packed_cols.reserve(nrows * ncols_per_row); - packed_coefs.reserve(nrows * ncols_per_row); - } - - void get_row_pointers(GlobalOrdinalType row, size_t& row_length, - GlobalOrdinalType*& cols, - ScalarType*& coefs) - { - ptrdiff_t local_row = -1; - //first see if we can get the local-row index using fast direct lookup: - if (rows.size() >= 1) { - ptrdiff_t idx = row - rows[0]; - if (idx < rows.size() && rows[idx] == row) { - local_row = idx; - } - } - - //if we didn't get the local-row index using direct lookup, try a - //more expensive binary-search: - if (local_row == -1) { - typename std::vector::iterator row_iter = - std::lower_bound(rows.begin(), rows.end(), row); - - //if we still haven't found row, it's not local so jump out: - if (row_iter == rows.end() || *row_iter != row) { - row_length = 0; - return; - } - - local_row = row_iter - rows.begin(); - } - - LocalOrdinalType offset = row_offsets[local_row]; - row_length = row_offsets[local_row+1] - offset; - cols = &packed_cols[offset]; - coefs = &packed_coefs[offset]; - } -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/ComputeNodeType.hpp b/kokkos/basic/ComputeNodeType.hpp deleted file mode 100644 index e59f3eb..0000000 --- a/kokkos/basic/ComputeNodeType.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef _ComputeNodeType_hpp_ -#define _ComputeNodeType_hpp_ - -#if defined(MINIFE_HAVE_TBB) - -#include -#include -typedef TBBNode ComputeNodeType; - -#elif defined(MINIFE_HAVE_TPI) - -#include -#include -typedef TPINode ComputeNodeType; - -#elif defined(MINIFE_HAVE_CUDA) - -#include -typedef CUDANode ComputeNodeType; - -#else - -#include -typedef SerialComputeNode ComputeNodeType; - -#endif - -#endif - diff --git a/kokkos/basic/DotOp.hpp b/kokkos/basic/DotOp.hpp deleted file mode 100644 index 6471949..0000000 --- a/kokkos/basic/DotOp.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef DOTOP_HPP_ -#define DOTOP_HPP_ - -template -struct DotOp { - typedef Scalar ReductionType; - - const Scalar* x; - const Scalar* y; - - size_t n; - - ReductionType result; - - inline DotOp() { - result = identity(); - } - - static 
inline KERNEL_PREFIX ReductionType identity() - { - return 0.0; - } - - inline KERNEL_PREFIX ReductionType reduce(ReductionType u, ReductionType v) const - { - return u+v; - } - - inline KERNEL_PREFIX Scalar generate(int i) const - { - return x[i]*y[i]; - } -}; - -#endif diff --git a/kokkos/basic/ELLMatrix.hpp b/kokkos/basic/ELLMatrix.hpp deleted file mode 100644 index 97b662f..0000000 --- a/kokkos/basic/ELLMatrix.hpp +++ /dev/null @@ -1,144 +0,0 @@ -#ifndef _ELLMatrix_hpp_ -#define _ELLMatrix_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -struct -ELLMatrix { - ELLMatrix(ComputeNode& comp_node) - : has_local_indices(false), - rows(), - cols(), coefs(), - num_cols(0), - num_cols_per_row(0), - compute_node(comp_node) -#ifdef HAVE_MPI - ,external_index(), external_local_index(), elements_to_send(), - neighbors(), recv_length(), send_length(), send_buffer(), request() -#endif - { - } - - ~ELLMatrix() - {} - - typedef Scalar ScalarType; - typedef LocalOrdinal LocalOrdinalType; - typedef GlobalOrdinal GlobalOrdinalType; - typedef ComputeNode ComputeNodeType; - - bool has_local_indices; - std::vector rows; - std::vector cols; - std::vector coefs; - LocalOrdinal num_cols; - LocalOrdinal num_cols_per_row; - ComputeNode& compute_node; - -#ifdef HAVE_MPI - std::vector external_index; - std::vector external_local_index; - std::vector elements_to_send; - std::vector neighbors; - std::vector recv_length; - std::vector send_length; - std::vector send_buffer; - std::vector request; -#endif - - size_t num_nonzeros() const - { - return rows.size()*num_cols_per_row; - } - - void reserve_space(unsigned nrows, unsigned ncols_per_row) - { - rows.resize(nrows); - cols.resize(nrows * ncols_per_row); - coefs.resize(nrows * ncols_per_row); - num_cols_per_row = ncols_per_row; - } - - void get_row_pointers(GlobalOrdinalType row, size_t& row_length, - GlobalOrdinalType*& cols_ptr, - ScalarType*& coefs_ptr) - { - ptrdiff_t local_row = -1; - //first see if we can get the local-row index using fast direct lookup: - if (rows.size() >= 1) { - ptrdiff_t idx = row - rows[0]; - if (idx < rows.size() && rows[idx] == row) { - local_row = idx; - } - } - - //if we didn't get the local-row index using direct lookup, try a - //more expensive 
binary-search: - if (local_row == -1) { - typename std::vector::iterator row_iter = - std::lower_bound(rows.begin(), rows.end(), row); - - //if we still haven't found row, it's not local so jump out: - if (row_iter == rows.end() || *row_iter != row) { - row_length = 0; - return; - } - - local_row = row_iter - rows.begin(); - } - - cols_ptr = &cols[local_row*num_cols_per_row]; - coefs_ptr = &coefs[local_row*num_cols_per_row]; - - int idx = num_cols_per_row-1; - while(idx>=0) { - if (cols_ptr[idx] != 0) break; - --idx; - } - row_length = idx+1; - } -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/FEComputeElem.hpp b/kokkos/basic/FEComputeElem.hpp deleted file mode 100644 index 03aa8a2..0000000 --- a/kokkos/basic/FEComputeElem.hpp +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef FECOMPUTEELEM_HPP_ -#define FECOMPUTEELEM_HPP_ - -#include - -#ifndef KERNEL_PREFIX -#define KERNEL_PREFIX -#endif - -template -struct FEComputeElem { - Scalar* elem_node_coords; - Scalar* elem_diffusion_matrix; - Scalar* elem_source_vector; - -inline KERNEL_PREFIX void operator()(int i) -{ - unsigned nnodes = miniFE::Hex8::numNodesPerElem; - unsigned dim = miniFE::Hex8::spatialDim; - Scalar* coords = elem_node_coords+i*nnodes*dim; - Scalar* diffusionMat = elem_diffusion_matrix+i*nnodes*nnodes; - Scalar* sourceVec = elem_source_vector+i*nnodes; - - miniFE::Hex8::diffusionMatrix(coords, diffusionMat); - miniFE::Hex8::sourceVector(coords, sourceVec); -} -}; - -#endif diff --git a/kokkos/basic/FusedMatvecDotOp.hpp b/kokkos/basic/FusedMatvecDotOp.hpp deleted file mode 100644 index e4b59e4..0000000 --- a/kokkos/basic/FusedMatvecDotOp.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef FUSEDMATVECDOTOP_HPP_ -#define FUSEDMATVECDOTOP_HPP_ - -#ifndef KERNEL_PREFIX -#define KERNEL_PREFIX -#endif - -template -struct FusedMatvecDotOp { - - typedef typename VectorType::GlobalOrdinalType GlobalOrdinalType; - typedef typename VectorType::LocalOrdinalType LocalOrdinalType; - typedef typename VectorType::ScalarType ScalarType; - typedef ScalarType ReductionType; - - size_t n; - - const LocalOrdinalType* Arowoffsets; - const GlobalOrdinalType* Acols; - const ScalarType* Acoefs; - - const ScalarType* x; - ScalarType* y; - ScalarType beta; - - ReductionType result; - - inline FusedMatvecDotOp() { - result = identity(); - } - - static inline KERNEL_PREFIX ReductionType identity() - { - return 0.0; - } - - inline KERNEL_PREFIX ReductionType reduce(ReductionType u, ReductionType v) const - { - return u+v; - } - - inline KERNEL_PREFIX ScalarType generate(int row) - { - //we count on the caller (ComputeNode) to pass in 'row' - //in range 0..n-1 - - ScalarType sum = beta*y[row]; - - for(LocalOrdinalType i=Arowoffsets[row]; i -#include - -template -struct GetNodesCoords { - const miniFE::simple_mesh_description* mesh; - GlobalOrdinal* elemIDs; - GlobalOrdinal* node_ordinals; - Scalar* elem_node_coords; - -inline void operator()(int i) -{ - unsigned nnodes = miniFE::Hex8::numNodesPerElem; - GlobalOrdinal elemID = elemIDs[i]; - GlobalOrdinal* node_ords = node_ordinals+i*nnodes; - Scalar* node_coords = elem_node_coords+i*nnodes*miniFE::Hex8::spatialDim; - get_elem_nodes_and_coords(*mesh, elemID, node_ords, node_coords); -} -}; - -#endif diff --git a/kokkos/basic/Hex8_box_utils.hpp b/kokkos/basic/Hex8_box_utils.hpp deleted file mode 100644 index c1662ec..0000000 --- a/kokkos/basic/Hex8_box_utils.hpp +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef _Hex8_box_utils_hpp_ -#define _Hex8_box_utils_hpp_ - -//@HEADER -// 
************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#include -#include -#include -#include - -namespace miniFE { - - -template -void get_hex8_node_ids(int nx, int ny, - GlobalOrdinal node0, - GlobalOrdinal* elem_node_ids) -{ -//Given box dimensions nx and ny, and a starting node -//(local-node-0 for a hex8), compute the other nodes -//of the hex8 using the exodus ordering convention. - elem_node_ids[0] = node0; - elem_node_ids[1] = node0 + 1; - elem_node_ids[2] = node0 + nx + 1; - elem_node_ids[3] = node0 + nx; - elem_node_ids[4] = node0 + nx*ny; - elem_node_ids[5] = node0 + 1 + nx*ny; - elem_node_ids[6] = node0 + nx + nx*ny + 1; - elem_node_ids[7] = node0 + nx + nx*ny; -} - -template -void get_hex8_node_coords_3d(Scalar x, Scalar y, Scalar z, - Scalar hx, Scalar hy, Scalar hz, - Scalar* elem_node_coords) -{ - //Input: x,y,z are the coordinates of local-node 0 for a Hex8. - //'hx', 'hy', 'hz' are the lengths of the sides of the element - //in each direction. - - elem_node_coords[0] = x; - elem_node_coords[1] = y; - elem_node_coords[2] = z; - - elem_node_coords[3] = x + hx; - elem_node_coords[4] = y; - elem_node_coords[5] = z; - - elem_node_coords[6] = x + hx; - elem_node_coords[7] = y + hy; - elem_node_coords[8] = z; - - elem_node_coords[9] = x; - elem_node_coords[10] = y + hy; - elem_node_coords[11] = z; - - elem_node_coords[12] = x; - elem_node_coords[13] = y; - elem_node_coords[14] = z + hz; - - elem_node_coords[15] = x + hx; - elem_node_coords[16] = y; - elem_node_coords[17] = z + hz; - - elem_node_coords[18] = x + hx; - elem_node_coords[19] = y + hy; - elem_node_coords[20] = z + hz; - - elem_node_coords[21] = x; - elem_node_coords[22] = y + hy; - elem_node_coords[23] = z + hz; -} - -template -void -get_elem_nodes_and_coords(const simple_mesh_description& mesh, - GlobalOrdinal elemID, - GlobalOrdinal* node_ords, Scalar* node_coords) -{ - int global_nodes_x = mesh.global_box[0][1]+1; - int global_nodes_y = mesh.global_box[1][1]+1; - int global_nodes_z = mesh.global_box[2][1]+1; - - if (elemID < 0) { - //I don't think this can happen, but check for the sake of paranoia... 
- throw std::runtime_error("get_elem_nodes_and_coords ERROR, negative elemID"); - } - - int elem_int_x, elem_int_y, elem_int_z; - get_int_coords(elemID, global_nodes_x-1, global_nodes_y-1, global_nodes_z-1, - elem_int_x, elem_int_y, elem_int_z); - GlobalOrdinal nodeID = get_id(global_nodes_x, global_nodes_y, global_nodes_z, elem_int_x, elem_int_y, elem_int_z); - -#ifdef MINIFE_DEBUG - std::cout<<"\nelemID: "<(nodeID, global_nodes_x,global_nodes_y,global_nodes_z, - ix,iy,iz); - Scalar hx = 1.0/global_elems_x; - Scalar hy = 1.0/global_elems_y; - Scalar hz = 1.0/global_elems_z; - get_hex8_node_coords_3d(ix, iy, iz, hx, hy, hz, node_coords); -#ifdef MINIFE_DEBUG - int offset = 0; - for(int i=0; i -void -get_elem_nodes_and_coords(const simple_mesh_description& mesh, - GlobalOrdinal elemID, - ElemData& elem_data) -{ - get_elem_nodes_and_coords(mesh, elemID, elem_data.elem_node_ids, elem_data.elem_node_coords); -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/Lock.hpp b/kokkos/basic/Lock.hpp deleted file mode 100644 index 16be86f..0000000 --- a/kokkos/basic/Lock.hpp +++ /dev/null @@ -1,103 +0,0 @@ -#ifndef _Lock_hpp_ -#define _Lock_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#ifdef MINIFE_HAVE_TBB - -#include -#include - -namespace miniFE { - -static tbb::atomic miniFE_num_matrix_conflicts; -static tbb::atomic miniFE_num_vector_conflicts; - -//We have two lock classes, LockM and LockV. The only reason for -//this is so that they can separately track the number of conflicts -//for matrix accesses versus vector accesses (by incrementing the -//above counters). -//The LockingMatrix class uses LockM, LockingVector uses LockV. 
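//A usage sketch: constructing a LockM in a scope spins until the row's counter
//reaches 1, and the destructor releases it on scope exit. The names
//row_locks/local_row are illustrative, mirroring LockingMatrix::sum_in below:
//
//  {
//    LockM<unsigned> lock(row_locks[local_row]);
//    sum_into_row(row, row_len, col_indices, values, A);
//  } //~LockM decrements the tbb::atomic counter, releasing the row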
- -template -class LockM { -public: - // Constructors/destructors - LockM(tbb::atomic& row) - : locked_row_(row) - { - if (++locked_row_ != 1) { - unsigned counter = 0; - while(locked_row_ != 1) { - ++counter; - } - ++miniFE_num_matrix_conflicts; - } - } - ~LockM() - { --locked_row_; } - -private: - tbb::atomic& locked_row_; - LockM(const LockM&); - LockM& operator=(const LockM&); -}; - -template -class LockV { -public: - // Constructors/destructors - LockV(tbb::atomic& row) - : locked_row_(row) - { - if (++locked_row_ != 1) { - unsigned counter = 0; - while(locked_row_ != 1) { - ++counter; - } - ++miniFE_num_vector_conflicts; - } - } - ~LockV() - { --locked_row_; } - -private: - tbb::atomic& locked_row_; - LockV(const LockV&); - LockV& operator=(const LockV&); -}; - -}//namespace miniFE - -#else -#error "ERROR, this file shouldn't be compiled if MINIFE_HAVE_TBB isn't defined." -#endif - -#endif - diff --git a/kokkos/basic/LockingMatrix.hpp b/kokkos/basic/LockingMatrix.hpp deleted file mode 100644 index c278274..0000000 --- a/kokkos/basic/LockingMatrix.hpp +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef _LockingMatrix_hpp_ -#define _LockingMatrix_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#include - -namespace miniFE { - -template -class LockingMatrix { -public: - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - LockingMatrix(MatrixType& A) : A_(A), myFirstRow_(0), myLastRow_(0), numMyRows_(0), row_locks_() - { - if (A_.rows.size() > 0) { - myFirstRow_ = A_.rows[0]; - myLastRow_ = A_.rows[A_.rows.size()-1]; - } - numMyRows_ = myLastRow_-myFirstRow_+1; - row_locks_.resize(numMyRows_); - } - - void sum_in(GlobalOrdinal row, size_t row_len, const GlobalOrdinal* col_indices, const Scalar* values) - { - int local_row = row - myFirstRow_; - if (local_row >= 0 && local_row < numMyRows_) { - LockM lock(row_locks_[local_row]); - sum_into_row(row, row_len, col_indices, values, A_); - } - } - -private: - MatrixType& A_; - GlobalOrdinal myFirstRow_; - GlobalOrdinal myLastRow_; - size_t numMyRows_; - std::vector > row_locks_; -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/LockingVector.hpp b/kokkos/basic/LockingVector.hpp deleted file mode 100644 index 60f7598..0000000 --- a/kokkos/basic/LockingVector.hpp +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef _LockingVector_hpp_ -#define _LockingVector_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#include - -namespace miniFE { - -template -class LockingVector { -public: - typedef typename VectorType::GlobalOrdinalType GlobalOrdinal; - typedef typename VectorType::ScalarType Scalar; - - LockingVector(VectorType& x) : x_(x), myFirstRow_(0), myLastRow_(0), numMyRows_(0), row_locks_() - { - if (x_.local_size > 0) { - myFirstRow_ = x_.startIndex; - myLastRow_ = myFirstRow_ + x_.local_size - 1; - } - numMyRows_ = myLastRow_-myFirstRow_+1; - row_locks_.resize(numMyRows_); - } - - void sum_in(size_t num_indices, const GlobalOrdinal* indices, const Scalar* values) - { - for(int i=0; i= 0 && local_row < numMyRows_) { - LockV lock(row_locks_[local_row]); - sum_into_vector(1, &row, &values[i], x_); - } - } - } - -private: - VectorType& x_; - GlobalOrdinal myFirstRow_; - GlobalOrdinal myLastRow_; - size_t numMyRows_; - std::vector > row_locks_; -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/MatrixCopyOp.hpp b/kokkos/basic/MatrixCopyOp.hpp deleted file mode 100644 index f6c300a..0000000 --- a/kokkos/basic/MatrixCopyOp.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _MatrixCopyOp_hpp_ -#define _MatrixCopyOp_hpp_ - -template -struct MatrixCopyOp { - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType; - typedef typename MatrixType::LocalOrdinalType LocalOrdinalType; - typedef typename MatrixType::ScalarType ScalarType; - - const GlobalOrdinalType* src_rows; - const LocalOrdinalType* src_rowoffsets; - const GlobalOrdinalType* src_cols; - const ScalarType* src_coefs; - - GlobalOrdinalType* dest_rows; - LocalOrdinalType* dest_rowoffsets; - GlobalOrdinalType* dest_cols; - ScalarType* dest_coefs; - int n; - - inline void operator()(int i) - { - dest_rows[i] = src_rows[i]; - dest_rowoffsets[i] = src_rowoffsets[i]; - for(int j=src_rowoffsets[i]; j -#include -#include - -#include -#include - -#include - -template -void sort_if_needed(GlobalOrdinal* list, - GlobalOrdinal list_len) -{ - bool need_to_sort = false; - for(GlobalOrdinal i=list_len-1; i>=1; --i) { - if (list[i] < list[i-1]) { - need_to_sort = true; - break; - } - } - - if (need_to_sort) { - std::sort(list,list+list_len); - } -} - -template -struct MatrixInitOp { -}; - -template<> -struct MatrixInitOp > { - MatrixInitOp(const std::vector& rows_vec, - const std::vector& row_offsets_vec, - const std::vector& row_coords_vec, - int global_nx, int global_ny, int global_nz, - MINIFE_GLOBAL_ORDINAL global_n_rows, - const miniFE::simple_mesh_description& input_mesh, - miniFE::CSRMatrix& matrix) - : rows(&rows_vec[0]), - row_offsets(&row_offsets_vec[0]), - row_coords(&row_coords_vec[0]), - global_nodes_x(global_nx), - global_nodes_y(global_ny), - global_nodes_z(global_nz), - global_nrows(global_n_rows), - mesh(&input_mesh), - dest_rows(&matrix.rows[0]), - dest_rowoffsets(&matrix.row_offsets[0]), - dest_cols(&matrix.packed_cols[0]), - dest_coefs(&matrix.packed_coefs[0]), - n(matrix.rows.size()) - { - matrix.packed_cols.resize(row_offsets_vec[n]); - matrix.packed_coefs.resize(row_offsets_vec[n]); - dest_rowoffsets[n] = row_offsets_vec[n]; - } - - typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType; - typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType; - typedef MINIFE_SCALAR ScalarType; - - const GlobalOrdinalType* rows; - const LocalOrdinalType* row_offsets; - const int* row_coords; - - int global_nodes_x; - int global_nodes_y; - int global_nodes_z; - - GlobalOrdinalType global_nrows; - - 
GlobalOrdinalType* dest_rows; - LocalOrdinalType* dest_rowoffsets; - GlobalOrdinalType* dest_cols; - ScalarType* dest_coefs; - int n; - - const miniFE::simple_mesh_description* mesh; - - inline void operator()(int i) - { - dest_rows[i] = rows[i]; - int offset = row_offsets[i]; - dest_rowoffsets[i] = offset; - int ix = row_coords[i*3]; - int iy = row_coords[i*3+1]; - int iz = row_coords[i*3+2]; - GlobalOrdinalType nnz = 0; - for(int sz=-1; sz<=1; ++sz) - for(int sy=-1; sy<=1; ++sy) - for(int sx=-1; sx<=1; ++sx) { - GlobalOrdinalType col_id = - miniFE::get_id(global_nodes_x, global_nodes_y, global_nodes_z, - ix+sx, iy+sy, iz+sz); - if (col_id >= 0 && col_id < global_nrows) { - GlobalOrdinalType col = mesh->map_id_to_row(col_id); - dest_cols[offset+nnz] = col; - dest_coefs[offset+nnz] = 0; - ++nnz; - } - } - - sort_if_needed(&dest_cols[offset], nnz); - } -}; - -template<> -struct MatrixInitOp > { - MatrixInitOp(const std::vector& rows_vec, - const std::vector& /*row_offsets_vec*/, - const std::vector& row_coords_vec, - int global_nx, int global_ny, int global_nz, - MINIFE_GLOBAL_ORDINAL global_n_rows, - const miniFE::simple_mesh_description& input_mesh, - miniFE::ELLMatrix& matrix) - : rows(&rows_vec[0]), - row_coords(&row_coords_vec[0]), - global_nodes_x(global_nx), - global_nodes_y(global_ny), - global_nodes_z(global_nz), - global_nrows(global_n_rows), - mesh(&input_mesh), - dest_rows(&matrix.rows[0]), - dest_cols(&matrix.cols[0]), - dest_coefs(&matrix.coefs[0]), - n(matrix.rows.size()), - ncols_per_row(matrix.num_cols_per_row) - { - } - - typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType; - typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType; - typedef MINIFE_SCALAR ScalarType; - - const GlobalOrdinalType* rows; - const int* row_coords; - - int global_nodes_x; - int global_nodes_y; - int global_nodes_z; - - GlobalOrdinalType global_nrows; - - GlobalOrdinalType* dest_rows; - GlobalOrdinalType* dest_cols; - ScalarType* dest_coefs; - int n; - int ncols_per_row; - - const miniFE::simple_mesh_description* mesh; - - inline void operator()(int i) - { - dest_rows[i] = rows[i]; - int offset = i*ncols_per_row; - int ix = row_coords[i*3]; - int iy = row_coords[i*3+1]; - int iz = row_coords[i*3+2]; - GlobalOrdinalType nnz = 0; - for(int sz=-1; sz<=1; ++sz) - for(int sy=-1; sy<=1; ++sy) - for(int sx=-1; sx<=1; ++sx) { - GlobalOrdinalType col_id = - miniFE::get_id(global_nodes_x, global_nodes_y, global_nodes_z, - ix+sx, iy+sy, iz+sz); - if (col_id >= 0 && col_id < global_nrows) { - GlobalOrdinalType col = mesh->map_id_to_row(col_id); - dest_cols[offset+nnz] = col; - dest_coefs[offset+nnz] = 0; - ++nnz; - } - } - - sort_if_needed(&dest_cols[offset], nnz); - } -}; - -#endif - diff --git a/kokkos/basic/MatvecOp.hpp b/kokkos/basic/MatvecOp.hpp deleted file mode 100644 index 9c5c8e4..0000000 --- a/kokkos/basic/MatvecOp.hpp +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef _MatvecOp_hpp_ -#define _MatvecOp_hpp_ - -#ifndef KERNEL_PREFIX -#define KERNEL_PREFIX -#endif - -#include -#include -#include - -template -struct MatvecOp { -}; - -template<> -struct MatvecOp > { - MatvecOp(miniFE::CSRMatrix& A) - : n(A.rows.size()), - Arowoffsets(&A.row_offsets[0]), - Acols(&A.packed_cols[0]), - Acoefs(&A.packed_coefs[0]) - { - } - - size_t n; - - typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType; - typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType; - typedef MINIFE_SCALAR ScalarType; - - const LocalOrdinalType* Arowoffsets; - const GlobalOrdinalType* Acols; - const ScalarType* Acoefs; - - const ScalarType* x; - ScalarType* y; - ScalarType beta; 
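  //For reference, the CSR arrays consumed here, shown for a small
  //hypothetical 3x3 matrix (a sketch, not data from the benchmark):
  //
  //   [ 2 -1  0 ]        Arowoffsets = {0, 2, 5, 7}
  //   [-1  2 -1 ]  ==>   Acols       = {0,1,  0,1,2,  1,2}
  //   [ 0 -1  2 ]        Acoefs      = {2,-1, -1,2,-1, -1,2}
  //
  //operator() below reads the columns and coefficients for 'row' from the
  //half-open range [Arowoffsets[row], Arowoffsets[row+1]).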
- - inline KERNEL_PREFIX void operator()(int row) - { - //we count on the caller (ComputeNode) to pass in 'row' - //in range 0..n-1 - - ScalarType sum = beta*y[row]; - - for(LocalOrdinalType i=Arowoffsets[row]; i -struct MatvecOp > { - MatvecOp(miniFE::ELLMatrix& A) - : n(A.rows.size()), - Acols(&A.cols[0]), - Acoefs(&A.coefs[0]), - ncols_per_row(A.num_cols_per_row) - { - } - - size_t n; - - typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinalType; - typedef MINIFE_LOCAL_ORDINAL LocalOrdinalType; - typedef MINIFE_SCALAR ScalarType; - - const GlobalOrdinalType* Acols; - const ScalarType* Acoefs; - int ncols_per_row; - - const ScalarType* x; - ScalarType* y; - ScalarType beta; - - inline KERNEL_PREFIX void operator()(int row) - { - //we count on the caller (ComputeNode) to pass in 'row' - //in range 0..n-1 - - ScalarType sum = beta*y[row]; - - for(LocalOrdinalType i=0; i -struct MemInitOp { - Scalar* ptr; - size_t n; - inline void operator()(size_t i) - { - ptr[i] = 0; - } -}; - -#endif diff --git a/kokkos/basic/NoOpMemoryModel.hpp b/kokkos/basic/NoOpMemoryModel.hpp deleted file mode 100644 index 92d1eb1..0000000 --- a/kokkos/basic/NoOpMemoryModel.hpp +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _NoOpMemoryModel_hpp_ -#define _NoOpMemoryModel_hpp_ - -class NoOpMemoryModel { - public: - NoOpMemoryModel(){} - virtual ~NoOpMemoryModel(){} - - template - T* get_buffer(const T* host_ptr, size_t buf_size) - { return const_cast(host_ptr); } - - template - void destroy_buffer(T*& device_ptr) - { } - - template - void copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr) - { } - - template - void copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr) - { } -}; - -#endif - diff --git a/kokkos/basic/SerialComputeNode.hpp b/kokkos/basic/SerialComputeNode.hpp deleted file mode 100644 index 1f45ed8..0000000 --- a/kokkos/basic/SerialComputeNode.hpp +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef SERIALCOMPUTENODE_HPP_ -#define SERIALCOMPUTENODE_HPP_ - -#include - -class SerialComputeNode : public NoOpMemoryModel { - public: - template - void parallel_for(unsigned int length, WDP wd) { - for(int i=0; i - void parallel_reduce(unsigned int length, WDP &wd) { - wd.result = wd.identity(); - for(int i=0; i -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef MINIFE_HAVE_TBB -#include -#endif - -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -void init_matrix(MatrixType& M, - const std::vector& rows, - const std::vector& row_offsets, - const std::vector& row_coords, - int global_nodes_x, - int global_nodes_y, - int global_nodes_z, - typename MatrixType::GlobalOrdinalType global_nrows, - const simple_mesh_description& mesh) -{ - MatrixInitOp mat_init(rows, row_offsets, row_coords, - global_nodes_x, global_nodes_y, global_nodes_z, - global_nrows, mesh, M); - -#ifdef MINIFE_HAVE_CUDA -//if on cuda, don't do this with parallel_for... 
- for(size_t i=0; i -void sort_with_companions(ptrdiff_t len, T* array, U* companions) -{ - ptrdiff_t i, j, index; - U companion; - - for (i=1; i < len; i++) { - index = array[i]; - companion = companions[i]; - j = i; - while ((j > 0) && (array[j-1] > index)) - { - array[j] = array[j-1]; - companions[j] = companions[j-1]; - j = j - 1; - } - array[j] = index; - companions[j] = companion; - } -} - -template -void write_matrix(const std::string& filename, - MatrixType& mat) -{ - typedef typename MatrixType::LocalOrdinalType LocalOrdinalType; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType; - typedef typename MatrixType::ScalarType ScalarType; - - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - std::ostringstream osstr; - osstr << filename << "." << numprocs << "." << myproc; - std::string full_name = osstr.str(); - std::ofstream ofs(full_name.c_str()); - - size_t nrows = mat.rows.size(); - size_t nnz = mat.num_nonzeros(); - - for(int p=0; p -void -sum_into_row(int row_len, - GlobalOrdinal* row_indices, - Scalar* row_coefs, - int num_inputs, - const GlobalOrdinal* input_indices, - const Scalar* input_coefs) -{ - for(size_t i=0; i -void -sum_into_row(typename MatrixType::GlobalOrdinalType row, - size_t num_indices, - const typename MatrixType::GlobalOrdinalType* col_inds, - const typename MatrixType::ScalarType* coefs, - MatrixType& mat) -{ - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - size_t row_len = 0; - GlobalOrdinal* mat_row_cols = NULL; - Scalar* mat_row_coefs = NULL; - - mat.get_row_pointers(row, row_len, mat_row_cols, mat_row_coefs); - if (row_len == 0) return; - - sum_into_row(row_len, mat_row_cols, mat_row_coefs, num_indices, col_inds, coefs); -} - -template -void -sum_in_symm_elem_matrix(size_t num, - const typename MatrixType::GlobalOrdinalType* indices, - const typename MatrixType::ScalarType* coefs, - MatrixType& mat) -{ - typedef typename MatrixType::ScalarType Scalar; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - -//indices is length num (which should be nodes-per-elem) -//coefs is the upper triangle of the element diffusion matrix -//which should be length num*(num+1)/2 -//std::cout< -void -sum_in_elem_matrix(size_t num, - const typename MatrixType::GlobalOrdinalType* indices, - const typename MatrixType::ScalarType* coefs, - MatrixType& mat) -{ - size_t offset = 0; - - for(size_t i=0; i -void -sum_into_global_linear_system(ElemData& elem_data, - MatrixType& A, VectorType& b) -{ - sum_in_symm_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids, - elem_data.elem_diffusion_matrix, A); - sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids, - elem_data.elem_source_vector, b); -} - -#ifdef MINIFE_HAVE_TBB -template -void -sum_in_elem_matrix(size_t num, - const typename MatrixType::GlobalOrdinalType* indices, - const typename MatrixType::ScalarType* coefs, - LockingMatrix& mat) -{ - size_t offset = 0; - - for(size_t i=0; i -void -sum_into_global_linear_system(ElemData& elem_data, - LockingMatrix& A, LockingVector& b) -{ - sum_in_elem_matrix(elem_data.nodes_per_elem, elem_data.elem_node_ids, - elem_data.elem_diffusion_matrix, A); - sum_into_vector(elem_data.nodes_per_elem, elem_data.elem_node_ids, - elem_data.elem_source_vector, b); -} -#endif - -template -void -add_to_diagonal(typename MatrixType::ScalarType value, MatrixType& mat) -{ - for(size_t i=0; i 
-double -parallel_memory_overhead_MB(const MatrixType& A) -{ - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - double mem_MB = 0; - -#ifdef HAVE_MPI - double invMB = 1.0/(1024*1024); - mem_MB = invMB*A.external_index.size()*sizeof(GlobalOrdinal); - mem_MB += invMB*A.external_local_index.size()*sizeof(GlobalOrdinal); - mem_MB += invMB*A.elements_to_send.size()*sizeof(GlobalOrdinal); - mem_MB += invMB*A.neighbors.size()*sizeof(int); - mem_MB += invMB*A.recv_length.size()*sizeof(LocalOrdinal); - mem_MB += invMB*A.send_length.size()*sizeof(LocalOrdinal); - - double tmp = mem_MB; - MPI_Allreduce(&tmp, &mem_MB, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); -#endif - - return mem_MB; -} - -template -void rearrange_matrix_local_external(MatrixType& A) -{ - //This function will rearrange A so that local entries are contiguous at the front - //of A's memory, and external entries are contiguous at the back of A's memory. - // - //A.row_offsets will describe where the local entries occur, and - //A.row_offsets_external will describe where the external entries occur. - - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - size_t nrows = A.rows.size(); - std::vector tmp_row_offsets(nrows*2); - std::vector tmp_row_offsets_external(nrows*2); - - LocalOrdinal num_local_nz = 0; - LocalOrdinal num_extern_nz = 0; - - //First sort within each row of A, so that local entries come - //before external entries within each row. - //tmp_row_offsets describe the locations of the local entries, and - //tmp_row_offsets_external describe the locations of the external entries. - // - for(size_t i=0; i ext_cols(num_extern_nz); - std::vector ext_coefs(num_extern_nz); - std::vector ext_offsets(nrows+1); - LocalOrdinal offset = 0; - for(size_t i=0; i -void -zero_row_and_put_1_on_diagonal(MatrixType& A, typename MatrixType::GlobalOrdinalType row) -{ - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - size_t row_len = 0; - GlobalOrdinal* cols = NULL; - Scalar* coefs = NULL; - A.get_row_pointers(row, row_len, cols, coefs); - - for(size_t i=0; i -void -impose_dirichlet(typename MatrixType::ScalarType prescribed_value, - MatrixType& A, - VectorType& b, - int global_nx, - int global_ny, - int global_nz, - const std::set& bc_rows) -{ - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - GlobalOrdinal first_local_row = A.rows.size()>0 ? A.rows[0] : 0; - GlobalOrdinal last_local_row = A.rows.size()>0 ? 
A.rows[A.rows.size()-1] : -1; - - typename std::set::const_iterator - bc_iter = bc_rows.begin(), bc_end = bc_rows.end(); - for(; bc_iter!=bc_end; ++bc_iter) { - GlobalOrdinal row = *bc_iter; - if (row >= first_local_row && row <= last_local_row) { - size_t local_row = row - first_local_row; - b.coefs[local_row] = prescribed_value; - zero_row_and_put_1_on_diagonal(A, row); - } - } - - for(size_t i=0; i -typename TypeTraits::magnitude_type -matvec_and_dot(MatrixType& A, - VectorType& x, - VectorType& y) -{ - timer_type t0 = mytimer(); - exchange_externals(A, x); - exchtime += mytimer()-t0; - - typedef typename TypeTraits::magnitude_type magnitude; - typedef typename MatrixType::ScalarType ScalarType; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType; - typedef typename MatrixType::LocalOrdinalType LocalOrdinalType; - typedef typename MatrixType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& comp_node = A.compute_node; - - FusedMatvecDotOp mvdotop; - - mvdotop.n = A.rows.size(); - mvdotop.Arowoffsets = comp_node.get_buffer(&A.row_offsets[0], A.row_offsets.size()); - mvdotop.Acols = comp_node.get_buffer(&A.packed_cols[0], A.packed_cols.size()); - mvdotop.Acoefs = comp_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size()); - mvdotop.x = comp_node.get_buffer(&x.coefs[0], x.coefs.size()); - mvdotop.y = comp_node.get_buffer(&y.coefs[0], y.coefs.size()); - mvdotop.beta = 0; - - comp_node.parallel_reduce(mvdotop.n, mvdotop); - -#ifdef HAVE_MPI - magnitude local_dot = mvdotop.result, global_dot = 0; - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD); - return global_dot; -#else - return mvdotop.result; -#endif -} - -//------------------------------------------------------------------------ -//Compute matrix vector product y = A*x where: -// -// A - input matrix -// x - input vector -// y - result vector -// -template -struct matvec_std { -void operator()(MatrixType& A, - VectorType& x, - VectorType& y) -{ - exchange_externals(A, x); - - typedef typename MatrixType::ScalarType ScalarType; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType; - typedef typename MatrixType::LocalOrdinalType LocalOrdinalType; - typedef typename MatrixType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& comp_node = A.compute_node; - - MatvecOp mvop(A); - - mvop.x = comp_node.get_buffer(&x.coefs[0], x.coefs.size()); - mvop.y = comp_node.get_buffer(&y.coefs[0], y.coefs.size()); - mvop.beta = 0; - - comp_node.parallel_for(mvop.n, mvop); -} -}; - -template -void matvec(MatrixType& A, VectorType& x, VectorType& y) -{ - matvec_std mv; - mv(A, x, y); -} - -template -struct matvec_overlap { -void operator()(MatrixType& A, - VectorType& x, - VectorType& y) -{ -#ifdef HAVE_MPI - begin_exchange_externals(A, x); -#endif - - typedef typename MatrixType::ScalarType ScalarType; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinalType; - typedef typename MatrixType::LocalOrdinalType LocalOrdinalType; - typedef typename MatrixType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& comp_node = A.compute_node; - - MatvecOp mvop(A); - - mvop.x = comp_node.get_buffer(&x.coefs[0], x.coefs.size()); - mvop.y = comp_node.get_buffer(&y.coefs[0], y.coefs.size()); - mvop.beta = 0; - - comp_node.parallel_for(mvop.n, mvop); - -#ifdef HAVE_MPI - finish_exchange_externals(A.neighbors.size()); - - mvop.Arowoffsets = comp_node.get_buffer(&A.row_offsets_external[0], A.row_offsets_external.size()); - mvop.beta = 1; 
- - comp_node.parallel_for(A.rows.size(), mvop); -#endif -} -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/SumInLinSys.hpp b/kokkos/basic/SumInLinSys.hpp deleted file mode 100644 index d5f6471..0000000 --- a/kokkos/basic/SumInLinSys.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _SUMINLINSYS_HPP_ -#define _SUMINLINSYS_HPP_ - -#include -#include -#include - -template -struct SumInLinSys { - GlobalOrdinal* node_ordinals; - Scalar* elem_diffusion_matrix; - Scalar* elem_source_vector; - miniFE::LockingMatrix* A; - miniFE::LockingVector* b; - -inline void operator()(int i) -{ - size_t nnodes = miniFE::Hex8::numNodesPerElem; - GlobalOrdinal* node_ords = node_ordinals+i*nnodes; - Scalar* diffusionMat = elem_diffusion_matrix+i*nnodes*nnodes; - Scalar* sourceVec = elem_source_vector+i*nnodes; - for(size_t ii=0; iisum_in(row, nnodes, node_ords, - &(diffusionMat[ii*nnodes])); - b->sum_in(1, &row, &(sourceVec[ii])); - } -} - -}; - -#endif diff --git a/kokkos/basic/TBBNode.cpp b/kokkos/basic/TBBNode.cpp deleted file mode 100644 index 20078fd..0000000 --- a/kokkos/basic/TBBNode.cpp +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef MINIFE_HAVE_TBB - -#include "TBBNode.hpp" - -tbb::task_scheduler_init TBBNode::tsi_(tbb::task_scheduler_init::deferred); - -#endif - diff --git a/kokkos/basic/TBBNode.hpp b/kokkos/basic/TBBNode.hpp deleted file mode 100644 index 6b1fe89..0000000 --- a/kokkos/basic/TBBNode.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef TBBNODE_HPP_ -#define TBBNODE_HPP_ - -#include -#include -#include -#include -#include - -#include - -#include // debug - -template -struct BlockedRangeWDP { - mutable WDPin wd; - BlockedRangeWDP(WDPin &in) : wd(in) {} - inline void operator()(tbb::blocked_range &rng) const - { - for(int i=rng.begin(); i -struct BlockedRangeWDPReducer { - WDPin wd; - BlockedRangeWDPReducer(WDPin &in) : wd(in) {} - BlockedRangeWDPReducer(BlockedRangeWDPReducer &in, tbb::split) : wd(in.wd) - { - wd.result = wd.identity(); - } - void operator()(tbb::blocked_range &rng) - { - for(int i=rng.begin(); i &other ) { - wd.result = wd.reduce( wd.result, other.wd.result ); - } -}; - -class TBBNode : public NoOpMemoryModel { - public: - - TBBNode(int numThreads=0) { - if (numThreads >= 1) { - tsi_.initialize(numThreads); - } - else { - tsi_.initialize(tbb::task_scheduler_init::automatic); - } - } - - ~TBBNode() {} - - template - void parallel_for(int length, WDP wd) { - BlockedRangeWDP tbb_wd(wd); - tbb::parallel_for(tbb::blocked_range(0,length), tbb_wd, tbb::auto_partitioner()); - } - - template - void parallel_reduce(int length, WDP &wd) { - BlockedRangeWDPReducer tbb_wd(wd); - tbb::parallel_reduce(tbb::blocked_range(0,length), tbb_wd, tbb::auto_partitioner()); - wd.result = tbb_wd.wd.result; // have to put result from final tbb_wd into orginal wd - } - - private: - static tbb::task_scheduler_init tsi_; -}; - -#endif diff --git a/kokkos/basic/TPINode.hpp b/kokkos/basic/TPINode.hpp deleted file mode 100644 index 66ec84f..0000000 --- a/kokkos/basic/TPINode.hpp +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef TPINODE_HPP_ -#define TPINODE_HPP_ - -#include - -#include - -#include // debug - -inline -void tpi_work_span(TPI_Work* work, int n, - size_t& ibeg, size_t& iend) -{ - const int chunk = ( n + work->count - 1 ) / work->count ; - - iend = chunk * ( work->rank + 1 ); - ibeg = chunk * ( work->rank ); - - if ( n < iend ) { iend = n; } -} - -template -void tpi_execute(TPI_Work * work) -{ - const WDP* const_wdp = static_cast(work->info); - WDP* wdp = const_cast(const_wdp); - size_t n = wdp->n; - size_t 
ibeg = 0, iend = n; - tpi_work_span(work, n, ibeg, iend); - for(size_t i=ibeg; i -void tpi_reduction_work(TPI_Work * work) -{ - const WDP* wdp = static_cast(work->info); - size_t n = wdp->n; - size_t ibeg = 0, iend = n; - tpi_work_span(work, n, ibeg, iend); - - typedef typename WDP::ReductionType ReductionType; - ReductionType tmpres = wdp->result, tmpi; - - for(size_t i=ibeg; igenerate(i); - tmpres = wdp->reduce(tmpres, tmpi); - } - *(static_cast(work->reduce)) = tmpres; -} - -template -void tpi_reduction_join(TPI_Work * work, const void* src) -{ - typedef typename WDP::ReductionType ReductionType; - - const WDP* wdp = static_cast(work->info); - - ReductionType& work_reduce = *(static_cast(work->reduce)); - - work_reduce = wdp->reduce(work_reduce, *(static_cast(src)) ); -} - -template -void tpi_reduction_init(TPI_Work * work) -{ - typedef typename WDP::ReductionType ReductionType; - - const WDP* wdp = static_cast(work->info); - - *(static_cast(work->reduce)) = wdp->identity(); -} - -class TPINode : public NoOpMemoryModel { - public: - - TPINode(int numThreads=0) - : numThreads_(numThreads) - { - if (numThreads >= 1) { - TPI_Init(numThreads); - } - } - - ~TPINode() - { - if (numThreads_ >= 1) { - TPI_Finalize(); - } - } - - template - void parallel_for(int length, WDP & wd ) { - TPI_Run_threads(tpi_execute, &wd, 0 ); - } - - template - void parallel_reduce(int length, WDP & wd ) { - typedef typename WDP::ReductionType ReductionType; - ReductionType result = 0; - TPI_Run_threads_reduce(tpi_reduction_work, &wd, - tpi_reduction_join, - tpi_reduction_init, sizeof(result), &result); - wd.result = result; - } - - private: - int numThreads_; -}; - -#endif - diff --git a/kokkos/basic/TypeTraits.hpp b/kokkos/basic/TypeTraits.hpp deleted file mode 100644 index 3ac472c..0000000 --- a/kokkos/basic/TypeTraits.hpp +++ /dev/null @@ -1,137 +0,0 @@ -#ifndef _TypeTraits_hpp_ -#define _TypeTraits_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template struct TypeTraits {}; - -template<> -struct TypeTraits { - typedef float magnitude_type; - - static const char* name() {return "float";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_FLOAT;} -#endif -}; - -template<> -struct TypeTraits { - typedef double magnitude_type; - - static const char* name() {return "double";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_DOUBLE;} -#endif -}; - -template<> -struct TypeTraits { - typedef int magnitude_type; - - static const char* name() {return "int";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_INT;} -#endif -}; - -template<> -struct TypeTraits { - typedef long int magnitude_type; - - static const char* name() {return "long int";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_LONG;} -#endif -}; - -#ifndef MINIFE_NO_LONG_LONG - -template<> -struct TypeTraits { - typedef long long magnitude_type; - - static const char* name() {return "long long";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_LONG_LONG;} -#endif -}; - -#endif - -template<> -struct TypeTraits { - typedef unsigned magnitude_type; - - static const char* name() {return "unsigned";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_UNSIGNED;} -#endif -}; - -template<> -struct TypeTraits > { - typedef float magnitude_type; - - static const char* name() {return "std::complex";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_COMPLEX;} -#endif -}; - -template<> -struct TypeTraits > { - typedef double magnitude_type; - - static const char* name() {return "std::complex";} - -#ifdef HAVE_MPI - static MPI_Datatype mpi_type() {return MPI_DOUBLE_COMPLEX;} -#endif -}; - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/Vector.hpp b/kokkos/basic/Vector.hpp deleted file mode 100644 index 4290ae4..0000000 --- a/kokkos/basic/Vector.hpp +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef _Vector_hpp_ -#define _Vector_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#include - -namespace miniFE { - - -template -struct Vector { - typedef ComputeNode ComputeNodeType; - typedef Scalar ScalarType; - typedef LocalOrdinal LocalOrdinalType; - typedef GlobalOrdinal GlobalOrdinalType; - - Vector(GlobalOrdinal startIdx, LocalOrdinal local_sz, ComputeNode& cn) - : startIndex(startIdx), - local_size(local_sz), - coefs(local_size), - compute_node(cn) - { - MemInitOp mem_init; - mem_init.ptr = &coefs[0]; - mem_init.n = local_size; -#ifdef MINIFE_HAVE_CUDA -//we don't want to run this mem-init kernel on cuda, we want -//to just run it locally on the host. - for(size_t i=0; i coefs; - ComputeNode& compute_node; -}; - - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/Vector_functions.hpp b/kokkos/basic/Vector_functions.hpp deleted file mode 100644 index f82866e..0000000 --- a/kokkos/basic/Vector_functions.hpp +++ /dev/null @@ -1,249 +0,0 @@ -#ifndef _Vector_functions_hpp_ -#define _Vector_functions_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -#ifdef MINIFE_HAVE_TBB -#include -#endif - -#include -#include -#include -#include - - -namespace miniFE { - - -template -void write_vector(const std::string& filename, - const VectorType& vec) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - std::ostringstream osstr; - osstr << filename << "." << numprocs << "." 
<< myproc; - std::string full_name = osstr.str(); - std::ofstream ofs(full_name.c_str()); - - typedef typename VectorType::ScalarType ScalarType; - - const std::vector& coefs = vec.coefs; - for(int p=0; p -void sum_into_vector(size_t num_indices, - const typename VectorType::GlobalOrdinalType* indices, - const typename VectorType::ScalarType* coefs, - VectorType& vec) -{ - typedef typename VectorType::GlobalOrdinalType GlobalOrdinal; - typedef typename VectorType::ScalarType Scalar; - - GlobalOrdinal first = vec.startIndex; - GlobalOrdinal last = first + vec.local_size - 1; - - std::vector& vec_coefs = vec.coefs; - - for(size_t i=0; i last) continue; - size_t idx = indices[i] - first; - vec_coefs[idx] += coefs[i]; - } -} - -#ifdef MINIFE_HAVE_TBB -template -void sum_into_vector(size_t num_indices, - const typename VectorType::GlobalOrdinalType* indices, - const typename VectorType::ScalarType* coefs, - LockingVector& vec) -{ - vec.sum_in(num_indices, indices, coefs); -} -#endif - -//------------------------------------------------------------ -//Compute the update of a vector with the sum of two scaled vectors where: -// -// w = alpha*x + beta*y -// -// x,y - input vectors -// -// alpha,beta - scalars applied to x and y respectively -// -// w - output vector -// -template -void - waxpby(typename VectorType::ScalarType alpha, const VectorType& x, - typename VectorType::ScalarType beta, const VectorType& y, - VectorType& w) -{ - typedef typename VectorType::ScalarType ScalarType; - typedef typename VectorType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& compute_node = x.compute_node; - - WaxpbyOp waxpbyop; - - waxpbyop.w = compute_node.get_buffer(&w.coefs[0], w.coefs.size()); - waxpbyop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - waxpbyop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - waxpbyop.alpha = alpha; - waxpbyop.beta = beta; - waxpbyop.n = x.local_size; - -#ifdef MINIFE_DEBUG - if (y.local_size < x.local_size || w.local_size < x.local_size) { - std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl; - return; - } -#endif - - compute_node.parallel_for(waxpbyop.n, waxpbyop); -} - -//Like waxpby above, except operates on two sets of arguments. -//In other words, performs two waxpby operations in one loop. 
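//(Fusing the two updates means a single parallel dispatch over the index
//range instead of two, which matters when each parallel_for launch carries
//overhead, e.g. on a GPU compute node. The FusedWaxpbyOp kernel in
//WaxpbyOp.hpp below simply applies both updates at each index i:
//  w[i]  = alpha*x[i]   + beta*y[i];
//  w2[i] = alpha2*x2[i] + beta2*y2[i]; )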
-template -void - fused_waxpby(typename VectorType::ScalarType alpha, const VectorType& x, - typename VectorType::ScalarType beta, const VectorType& y, - VectorType& w, - typename VectorType::ScalarType alpha2, const VectorType& x2, - typename VectorType::ScalarType beta2, const VectorType& y2, - VectorType& w2) -{ - typedef typename VectorType::ScalarType ScalarType; - typedef typename VectorType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& compute_node = x.compute_node; - - FusedWaxpbyOp waxpbyop; - - waxpbyop.w = compute_node.get_buffer(&w.coefs[0], w.coefs.size()); - waxpbyop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - waxpbyop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - waxpbyop.alpha = alpha; - waxpbyop.beta = beta; - waxpbyop.w2 = compute_node.get_buffer(&w2.coefs[0], w2.coefs.size()); - waxpbyop.x2 = compute_node.get_buffer(&x2.coefs[0], x2.coefs.size()); - waxpbyop.y2 = compute_node.get_buffer(&y2.coefs[0], y2.coefs.size()); - waxpbyop.alpha2 = alpha2; - waxpbyop.beta2 = beta2; - waxpbyop.n = x.local_size; - -#ifdef MINIFE_DEBUG - if (y.local_size < x.local_size || w.local_size < x.local_size) { - std::cerr << "miniFE::waxpby ERROR, y and w must be at least as long as x." << std::endl; - return; - } -#endif - - compute_node.parallel_for(waxpbyop.n, waxpbyop); -} - -//----------------------------------------------------------- -//Compute the dot product of two vectors where: -// -// x,y - input vectors -// -// result - return-value -// -template -typename TypeTraits::magnitude_type - dot(const Vector& x, - const Vector& y) -{ - size_t n = x.local_size; - -#ifdef MINIFE_DEBUG - if (y.local_size < n) { - std::cerr << "miniFE::dot ERROR, y must be at least as long as x."<::magnitude_type magnitude; - - typedef typename Vector::ComputeNodeType ComputeNodeType; - - ComputeNodeType& compute_node = x.compute_node; - - DotOp dotop; - dotop.x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - dotop.y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - dotop.n = x.local_size; - - compute_node.parallel_reduce(n, dotop); - -#ifdef HAVE_MPI - magnitude local_dot = dotop.result, global_dot = 0; - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - MPI_Allreduce(&local_dot, &global_dot, 1, mpi_dtype, MPI_SUM, MPI_COMM_WORLD); - return global_dot; -#else - return dotop.result; -#endif -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/WaxpbyOp.hpp b/kokkos/basic/WaxpbyOp.hpp deleted file mode 100644 index 6eaaa6e..0000000 --- a/kokkos/basic/WaxpbyOp.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef WAXPBYOP_HPP_ -#define WAXPBYOP_HPP_ - -#ifndef KERNEL_PREFIX -#define KERNEL_PREFIX -#endif - -template -struct WaxpbyOp { - Scalar* w; - const Scalar* x; - const Scalar* y; - Scalar alpha, beta; - size_t n; - KERNEL_PREFIX void operator()(size_t i) const - { - //here we count on the caller (ComputeNode) to pass in 'i' - //that is in the range 0..n-1 - w[i] = alpha*x[i] + beta*y[i]; - } -}; - -template -struct FusedWaxpbyOp { - Scalar* w; - const Scalar* x; - const Scalar* y; - Scalar alpha, beta; - Scalar* w2; - const Scalar* x2; - const Scalar* y2; - Scalar alpha2, beta2; - size_t n; - KERNEL_PREFIX void operator()(size_t i) const - { - //here we count on the caller (ComputeNode) to pass in 'i' - //that is in the range 0..n-1 - w[i] = alpha*x[i] + beta*y[i]; - w2[i] = alpha2*x2[i] + beta2*y2[i]; - } -}; - -#endif diff --git a/kokkos/basic/analytic_soln.hpp b/kokkos/basic/analytic_soln.hpp deleted file mode 100644 index 8dcdfad..0000000 --- 
a/kokkos/basic/analytic_soln.hpp +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef _analytic_soln_hpp_ -#define _analytic_soln_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#ifndef MINIFE_SCALAR -#define MINIFE_SCALAR double; -#endif - -namespace miniFE { - -typedef MINIFE_SCALAR Scalar; - -// The 'soln' function below computes the analytic solution for -// steady state temperature in a brick-shaped domain (formally called -// a rectangular parallelepiped). The inputs to the function are -// the x,y,z coordinates of the point at which temperature is to be -// computed, and the number of terms p,q in the series expansion. -// -// The equations used for the temperature solution are equations 9 and 10 -// in section 6.2 of Carslaw & Jaeger, "Conduction of Heat in Solids". -// -// The paralellepiped being used is defined by this domain: -// 0 <= x <= 1.0 -// 0 <= y <= 1.0 -// 0 <= z <= 1.0 -// -// With boundary conditions prescribing the temperature to be 1.0 on -// the x==1.0 face, and 0.0 on all other faces. -// -// Thus, in the equations from Carslaw & Jaeger, the following constants -// are used: -// -// a == b == c == 1.0 (the extents of the domain) -// v1 == 0.0 (temperature at x == 0.0) -// v2 == 1.0 (temperature at x == 1.0) -// - -const Scalar PI = 3.141592653589793238462; -const Scalar PI_SQR = PI*PI; -const Scalar term0 = 16.0/(PI_SQR); - -inline Scalar fcn_l(int p, int q) -{ - return std::sqrt((2*p+1)*(2*p+1)*PI_SQR + (2*q+1)*(2*q+1)*PI_SQR); -} - -inline Scalar fcn(int n, Scalar u) -{ - return (2*n+1)*PI*u; -} - -inline Scalar soln(Scalar x, Scalar y, Scalar z, int max_p, int max_q) -{ - Scalar sum = 0; - for(int p=0; p<=max_p; ++p) { - const Scalar p21y = fcn(p, y); - const Scalar sin_py = std::sin(p21y)/(2*p+1); - for(int q=0; q<=max_q; ++q) { - const Scalar q21z = fcn(q, z); - const Scalar sin_qz = std::sin(q21z)/(2*q+1); - - const Scalar l = fcn_l(p, q); - - const Scalar sinh1 = std::sinh(l*x); - const Scalar sinh2 = std::sinh(l); - - const Scalar tmp = (sinh1*sin_py)*(sin_qz/sinh2); - - //if the scalar l gets too big, sinh(l) becomes inf. - //if that happens, tmp is a NaN. - //crude check for NaN: - //if tmp != tmp, tmp is NaN - if (tmp == tmp) { - sum += tmp; - } - else { - //if we got a NaN, break out of this inner loop and go to - //the next iteration of the outer loop. 
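        //(tmp == tmp is the portable pre-C++11 NaN test: IEEE arithmetic
        //guarantees a NaN never compares equal to itself. With C++11 the
        //same guard could be written as !std::isnan(tmp) from <cmath>.)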
- break; - } - } - } - return term0*sum; -} - -}//namespace miniFE - -#endif /* _analytic_soln_hpp_ */ diff --git a/kokkos/basic/assemble_FE_data.hpp b/kokkos/basic/assemble_FE_data.hpp deleted file mode 100644 index f34b14a..0000000 --- a/kokkos/basic/assemble_FE_data.hpp +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _assemble_FE_data_hpp_ -#define _assemble_FE_data_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include - -#ifdef MINIFE_HAVE_TBB -//#include -#include -//#include -#else -#include -#endif - -namespace miniFE { - -template -void -assemble_FE_data(const simple_mesh_description& mesh, - MatrixType& A, - VectorType& b, - Parameters& params) -{ - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - - int global_elems_x = mesh.global_box[0][1]; - int global_elems_y = mesh.global_box[1][1]; - int global_elems_z = mesh.global_box[2][1]; - - Box local_elem_box; - copy_box(mesh.local_box, local_elem_box); - - if (get_num_ids(local_elem_box) < 1) { - return; - } - - // - //We want the element-loop to loop over our (processor-local) domain plus a - //ghost layer, so we can assemble the complete linear-system without doing - //any communication. - // - int ghost = 1; - if (local_elem_box[0][0] > 0) local_elem_box[0][0] -= ghost; - if (local_elem_box[1][0] > 0) local_elem_box[1][0] -= ghost; - if (local_elem_box[2][0] > 0) local_elem_box[2][0] -= ghost; - if (local_elem_box[0][1] < global_elems_x) local_elem_box[0][1] += ghost; - if (local_elem_box[1][1] < global_elems_y) local_elem_box[1][1] += ghost; - if (local_elem_box[2][1] < global_elems_z) local_elem_box[2][1] += ghost; - - perform_element_loop(mesh, local_elem_box, A, b, params); -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/box_utils.hpp b/kokkos/basic/box_utils.hpp deleted file mode 100644 index ee10975..0000000 --- a/kokkos/basic/box_utils.hpp +++ /dev/null @@ -1,199 +0,0 @@ -#ifndef _box_utils_hpp_ -#define _box_utils_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. 
-// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -#include -#include - -namespace miniFE { - -inline void copy_box(const Box& from_box, Box& to_box) -{ - for(int i=0; i<3; ++i) { - to_box[i][0] = from_box[i][0]; - to_box[i][1] = from_box[i][1]; - } -} - -template -void get_int_coords(GlobalOrdinal ID, int nx, int ny, int nz, - int& x, int& y, int& z) -{ - z = ID/(nx*ny); - y = (ID%(nx*ny))/nx; - x = ID%nx; -} - -template -void get_coords(GlobalOrdinal ID, int nx, int ny, int nz, - Scalar& x, Scalar& y, Scalar& z) -{ - const int xdiv = nx>1 ? nx-1 : 1; - const int ydiv = ny>1 ? ny-1 : 1; - const int zdiv = nz>1 ? nz-1 : 1; - -//This code assumes that ID is 0-based. -// -//compute coordinates that lie on (or in) the unit cube. -//that's why we're dividing by nz,ny,nx: - z = (1.0*(ID/(nx*ny)))/zdiv; - y = 1.0*((ID%(nx*ny))/nx)/ydiv; - x = 1.0*(ID%nx)/xdiv; -} - -template -GlobalOrdinal get_num_ids(const Box& box) -{ - int nx = box[0][1] - box[0][0]; - int ny = box[1][1] - box[1][0]; - int nz = box[2][1] - box[2][0]; - GlobalOrdinal tmp = nx*ny; - tmp *= nz; - return tmp; -} - -template -GlobalOrdinal get_id(int nx, int ny, int nz, - int x, int y, int z) -{ - if (x<0 || y<0 || z<0) return -1; - if (x>=nx || y>=ny || z>=nz) return -1; - - //form x + nx*y + nx*ny*z: - - GlobalOrdinal tmp = nx*ny; - tmp *= z; - tmp = x + nx * y + tmp; - return tmp; -} - -template -void get_ids(int nx, int ny, int nz, - const Box& box, - GlobalOrdinal* ids) -{ - unsigned offset = 0; - for(int z=box[2][0]; z(nx, ny, nz, x, y, z); - } - } - } -} - -template -void create_map_id_to_row(int global_nx, int global_ny, int global_nz, - const Box& box, - std::map& id_to_row) -{ - GlobalOrdinal num_my_ids = get_num_ids(box); - GlobalOrdinal my_first_row = 0; - -#ifdef HAVE_MPI - int numprocs = 1, myproc = 0; - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); - - typename std::vector tmp_buffer(numprocs, 0); - tmp_buffer[myproc] = num_my_ids; - typename std::vector global_offsets(numprocs); - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - MPI_Allreduce(&tmp_buffer[0], &global_offsets[0], numprocs, mpi_dtype, - MPI_SUM, MPI_COMM_WORLD); - GlobalOrdinal offset = 0; - for(int i=0; i all_my_ids(num_my_ids); - get_ids(global_nx, global_ny, global_nz, box, &all_my_ids[0]); - - typename std::vector ids; - typename std::vector rows; - - if (all_my_ids.size() > 0) { - ids.push_back(all_my_ids[0]); - rows.push_back(my_first_row); - } - - for(size_t i=1; i lengths(numprocs); - MPI_Allgather(&len, 1, MPI_INT, &lengths[0], 1, MPI_INT, MPI_COMM_WORLD); - - std::vector displs(lengths); - int displ = 
0; - for(int i=0; i global_ids(displ); - typename std::vector global_rows(displ); - - MPI_Allgatherv(&ids[0], len, mpi_dtype, &global_ids[0], - &lengths[0], &displs[0], mpi_dtype, MPI_COMM_WORLD); - MPI_Allgatherv(&rows[0], len, mpi_dtype, &global_rows[0], - &lengths[0], &displs[0], mpi_dtype, MPI_COMM_WORLD); - - ids = global_ids; - rows = global_rows; -#endif - - for(size_t i=0; i -#include - -#include -#include - -#include - -namespace miniFE { - -template -void print_vec(const std::vector& vec, const std::string& name) -{ - for(size_t i=0; i -bool breakdown(typename VectorType::ScalarType inner, - const VectorType& v, - const VectorType& w) -{ - typedef typename VectorType::ScalarType Scalar; - typedef typename TypeTraits::magnitude_type magnitude; - -//This is code that was copied from Aztec, and originally written -//by my hero, Ray Tuminaro. -// -//Assuming that inner = (inner product of v and w), -//v and w are considered orthogonal if -// |inner| < 100 * ||v||_2 * ||w||_2 * epsilon - - magnitude vnorm = std::sqrt(dot(v,v)); - magnitude wnorm = std::sqrt(dot(w,w)); - return std::abs(inner) <= 100*vnorm*wnorm*std::numeric_limits::epsilon(); -} - -template -void -cg_solve(OperatorType& A, - const VectorType& b, - VectorType& x, - Matvec matvec, - typename OperatorType::LocalOrdinalType max_iter, - typename TypeTraits::magnitude_type& tolerance, - typename OperatorType::LocalOrdinalType& num_iters, - typename TypeTraits::magnitude_type& normr, - timer_type* my_cg_times) -{ - typedef typename OperatorType::ScalarType ScalarType; - typedef typename OperatorType::GlobalOrdinalType GlobalOrdinalType; - typedef typename OperatorType::LocalOrdinalType LocalOrdinalType; - typedef typename TypeTraits::magnitude_type magnitude_type; - - timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0, tMATVECDOT = 0; - timer_type total_time = mytimer(); - - int myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (!A.has_local_indices) { - std::cerr << "miniFE::cg_solve ERROR, A.has_local_indices is false, needs to be true. This probably means " - << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::cg_solve." - << std::endl; - return; - } - - size_t nrows = A.rows.size(); - LocalOrdinalType ncols = A.num_cols; - - VectorType r(b.startIndex, nrows, b.compute_node); - VectorType p(0, ncols, b.compute_node); - VectorType Ap(b.startIndex, nrows, b.compute_node); - - normr = 0; - magnitude_type rtrans = 0; - magnitude_type oldrtrans = 0; - - LocalOrdinalType print_freq = max_iter/10; - if (print_freq>50) print_freq = 50; - if (print_freq<1) print_freq = 1; - - ScalarType one = 1.0; - ScalarType zero = 0.0; - - typedef typename VectorType::ComputeNodeType ComputeNodeType; - ComputeNodeType& compute_node = x.compute_node; - - //The following lines that create and initialize buffers are no-ops in many - //cases, but perform actual allocations and copies if an off-cpu device such - //as a GPU is being used by compute_node. 
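  //(Sketch of why these calls are often no-ops, assuming a host-only memory
  //model rather than the library's actual implementation: a host compute
  //node can implement the buffer API roughly as
  //  template<typename T> T*   get_buffer(T* host_ptr, size_t)      { return host_ptr; }
  //  template<typename T> void copy_to_buffer(const T*, size_t, T*) { /* nothing to copy */ }
  //so only a device-backed node such as CUDA pays for allocations and copies.)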
- - //Do any required allocations for buffers that will be needed during CG: - ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size()); - ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size()); - ScalarType* d_Ap = compute_node.get_buffer(&Ap.coefs[0], Ap.coefs.size()); - ScalarType* d_r = compute_node.get_buffer(&r.coefs[0], r.coefs.size()); -#ifdef MINIFE_CSR_MATRIX - LocalOrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size()); - GlobalOrdinalType* d_Acols = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size()); - ScalarType* d_Acoefs = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size()); -#endif -#ifdef MINIFE_ELL_MATRIX - GlobalOrdinalType* d_Acols = compute_node.get_buffer(&A.cols[0], A.cols.size()); - ScalarType* d_Acoefs = compute_node.get_buffer(&A.coefs[0], A.coefs.size()); -#endif - - //Copy data to buffers that need to be initialized from input data: - compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x); - compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b); -#ifdef MINIFE_CSR_MATRIX - compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff); - compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols); - compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs); -#endif -#ifdef MINIFE_ELL_MATRIX - compute_node.copy_to_buffer(&A.cols[0], A.cols.size(), d_Acols); - compute_node.copy_to_buffer(&A.coefs[0], A.coefs.size(), d_Acoefs); -#endif - - TICK(); waxpby(one, x, zero, x, p); TOCK(tWAXPY); - - compute_node.copy_from_buffer(&p.coefs[0], p.coefs.size(), d_p); -// print_vec(p.coefs, "p"); - - TICK(); - matvec(A, p, Ap); - TOCK(tMATVEC); - - TICK(); waxpby(one, b, -one, Ap, r); TOCK(tWAXPY); - -// if (b.coefs.size() == r.coefs.size()) std::cout << "b.size == r.size" << std::endl; -// else std::cout << "b.size != r.size" << std::endl; -// if (b.coefs == r.coefs) std::cout << "b == r" << std::endl; -// else std::cout << "b != r" << std::endl; -// compute_node.copy_from_buffer(&r.coefs[0], r.coefs.size(), d_r); -// print_vec(b.coefs, "b"); -// print_vec(r.coefs, "r"); - - TICK(); rtrans = dot(r, r); TOCK(tDOT); - -//std::cout << "rtrans="<add("Global Nrows",global_nrows); - ydoc.get("Matrix attributes")->add("Global NNZ",global_nnz); - - //compute how much memory the matrix occupies: - //num-bytes = sizeof(GlobalOrdinal)*global_nrows for A.rows - // + sizeof(LocalOrdinal)*global_nrows for A.rows_offsets - // + sizeof(GlobalOrdinal)*global_nnz for A.packed_cols - // + sizeof(Scalar)*global_nnz for A.packed_coefs - - double invGB = 1.0/(1024*1024*1024); - double memGB = invGB*global_nrows*sizeof(GlobalOrdinal); - memGB += invGB*global_nrows*sizeof(LocalOrdinal); - memGB += invGB*global_nnz*sizeof(GlobalOrdinal); - memGB += invGB*global_nnz*sizeof(Scalar); - ydoc.get("Matrix attributes")->add("Global Memory (GB)",memGB); - - ydoc.get("Matrix attributes")->add("Pll Memory Overhead (MB)",mem_overhead_MB); - - ydoc.get("Matrix attributes")->add("Rows per proc MIN",min_nrows); - ydoc.get("Matrix attributes")->add("Rows per proc MAX",max_nrows); - ydoc.get("Matrix attributes")->add("Rows per proc AVG",avg_nrows); - ydoc.get("Matrix attributes")->add("NNZ per proc MIN",min_nnz); - ydoc.get("Matrix attributes")->add("NNZ per proc MAX",max_nnz); - ydoc.get("Matrix attributes")->add("NNZ per proc AVG",avg_nnz); - } - - return global_nnz; -} - 
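//Worked example of the memory estimate above, with hypothetical sizes
//(8-byte GlobalOrdinal and Scalar, 4-byte LocalOrdinal, one million rows,
//27 million nonzeros for a full 27-point stencil):
//  memGB ~= (8+4)*1e6/2^30 + (8+8)*27e6/2^30 ~= 0.011 + 0.402 ~= 0.41 GB
//The same arithmetic as a standalone sketch (illustrative helper, not part
//of the original header):
inline double estimate_matrix_GB(double nrows, double nnz,
                                 int GO_bytes = 8, int LO_bytes = 4,
                                 int scalar_bytes = 8)
{
  const double invGB = 1.0/(1024.0*1024.0*1024.0);
  return invGB*(nrows*(GO_bytes + LO_bytes) + nnz*(GO_bytes + scalar_bytes));
}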
-}//namespace miniFE - -#endif - diff --git a/kokkos/basic/driver.hpp b/kokkos/basic/driver.hpp deleted file mode 100644 index d3966eb..0000000 --- a/kokkos/basic/driver.hpp +++ /dev/null @@ -1,403 +0,0 @@ -#ifndef _driver_hpp_ -#define _driver_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef MINIFE_CSR_MATRIX -#include -#elif defined(MINIFE_ELL_MATRIX) -#include -#else -#include -#endif - -#include - -#include - -#include -#include - -#include - -#include -#include -#include -#include -#if MINIFE_KERNELS != 0 -#include -#endif -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -#define RUN_TIMED_FUNCTION(msg, fn, time_inc, time_total) \ -{ \ - if (myproc==0) { \ - std::cout.width(30); \ - std::cout << msg; \ - std::cout.flush(); \ - } \ - timer_type rtf_t0 = mytimer(); \ - fn; \ - time_inc = mytimer() - rtf_t0; \ - time_total += time_inc; \ - if (myproc==0) { \ - std::cout << time_inc << "s, total time: " << time_total << std::endl; \ - } \ -} - -//This program assembles finite-element matrices into a global matrix and -//vector, then solves the linear-system using Conjugate Gradients. -//Each finite-element is a hexahedron with 8 vertex-nodes. -// -//Notes: -//- In finite-element terms, the box dimensions are in elements, not nodes. -// In other words, a 2x2x2 box describes 8 elements, each of which has 8 nodes, -// so it is a 3x3x3 node domain (27 nodes). -// The assembled linear system will have 1 equation for each finite element node. -// -//- The coordinate origin is at the corner of the global box where x=0, -// y=0, z=0, and the box extends along the positive x-axis, positive y-axis, -// and the positive z-axis. -// -//- Some aspects of matrix-structure generation and finite-element assembly -// are convenient to do using global node identifiers. -// A global identifier for each node is obtained from coordinates plus -// global box dimensions. See the function 'get_id' in box_utils.hpp. -// -//- Each node corresponds to a row in the matrix. The RCB partitioning method -// we use to split the global box among processors results in some -// processors owning non-contiguous blocks of global node identifiers. 
-// Since it is convenient for matrices and vectors to store contiguously- -// numbered blocks of rows, we map global node identifiers to a separate -// space of row numbers such that each processor's nodes correspond to a -// contiguous block of row numbers. -// - -namespace miniFE { - -template -void -driver(const Box& global_box, Box& my_box, ComputeNodeType& compute_node, - Parameters& params, YAML_Doc& ydoc) -{ - int global_nx = global_box[0][1]; - int global_ny = global_box[1][1]; - int global_nz = global_box[2][1]; - - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (params.load_imbalance > 0) { - add_imbalance(global_box, my_box, params.load_imbalance, ydoc); - } - - float largest_imbalance = 0, std_dev = 0; - compute_imbalance(global_box, my_box, largest_imbalance, - std_dev, ydoc, true); - - - //Create a representation of the mesh: - //Note that 'simple_mesh_description' is a virtual or conceptual - //mesh that doesn't actually store mesh data. - - if (myproc==0) { - std::cout.width(30); - std::cout << "creating/filling mesh..."; - std::cout.flush(); - } - - timer_type t_start = mytimer(); - timer_type t0 = mytimer(); - - simple_mesh_description mesh(global_box, my_box); - - timer_type mesh_fill = mytimer() - t0; - timer_type t_total = mytimer() - t_start; - - if (myproc==0) { - std::cout << mesh_fill << "s, total time: " << t_total << std::endl; - } - - //next we will generate the matrix structure. - - //Declare matrix object: - -#ifdef MINIFE_CSR_MATRIX - typedef CSRMatrix MatrixType; -#elif defined(MINIFE_ELL_MATRIX) - typedef ELLMatrix MatrixType; -#else - typedef CSRMatrix MatrixType; -#endif - - MatrixType A(compute_node); - - timer_type gen_structure; - RUN_TIMED_FUNCTION("generating matrix structure...", - generate_matrix_structure(mesh, A), - gen_structure, t_total); - - GlobalOrdinal local_nrows = A.rows.size(); - GlobalOrdinal my_first_row = local_nrows > 0 ? A.rows[0] : -1; - - Vector b(my_first_row, local_nrows,compute_node); - Vector x(my_first_row, local_nrows,compute_node); - - //Assemble finite-element sub-matrices and sub-vectors into the global - //linear system: - - timer_type fe_assembly; - RUN_TIMED_FUNCTION("assembling FE data...", - assemble_FE_data(mesh, A, b, params), - fe_assembly, t_total); - - if (myproc == 0) { - ydoc.add("Matrix structure generation",""); - ydoc.get("Matrix structure generation")->add("Mat-struc-gen Time",gen_structure); - ydoc.add("FE assembly",""); - ydoc.get("FE assembly")->add("FE assembly Time",fe_assembly); - } - -#ifdef MINIFE_DEBUG - write_matrix("A_prebc.mtx", A); - write_vector("b_prebc.vec", b); -#endif - - //Now apply dirichlet boundary-conditions - //(Apply the 0-valued surfaces first, then the 1-valued surface last.) 
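  //Note on the node counts passed below: the box dimensions are in elements,
  //so the node grid is (global_nx+1) x (global_ny+1) x (global_nz+1); e.g. a
  //2x2x2-element box has a 3x3x3 node grid and hence 27 matrix rows.
  //bc_rows_0 (the 0-valued faces) is imposed first and bc_rows_1 (the
  //1-valued x==1 face) last, matching the boundary conditions assumed in
  //analytic_soln.hpp.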
- - timer_type dirbc_time; - RUN_TIMED_FUNCTION("imposing Dirichlet BC...", - impose_dirichlet(0.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_0), dirbc_time, t_total); - RUN_TIMED_FUNCTION("imposing Dirichlet BC...", - impose_dirichlet(1.0, A, b, global_nx+1, global_ny+1, global_nz+1, mesh.bc_rows_1), dirbc_time, t_total); - -#ifdef MINIFE_DEBUG - write_matrix("A.mtx", A); - write_vector("b.vec", b); -#endif - - //Transform global indices to local, set up communication information: - - timer_type make_local_time; - RUN_TIMED_FUNCTION("making matrix indices local...", - make_local_matrix(A), - make_local_time, t_total); - -#ifdef MINIFE_DEBUG - write_matrix("A_local.mtx", A); - write_vector("b_local.vec", b); -#endif - - size_t global_nnz = compute_matrix_stats(A, myproc, numprocs, ydoc); - - //Prepare to perform conjugate gradient solve: - - LocalOrdinal max_iters = 50; - LocalOrdinal num_iters = 0; - typedef typename TypeTraits::magnitude_type magnitude; - magnitude rnorm = 0; - magnitude tol = std::numeric_limits::epsilon(); - - timer_type cg_times[NUM_TIMERS]; - - typedef Vector VectorType; - - t_total = mytimer() - t_start; - - bool matvec_with_comm_overlap = params.mv_overlap_comm_comp==1; - -#if MINIFE_KERNELS != 0 - if (myproc==0) { - std::cout.width(30); - std::cout << "Starting kernel timing loops ..." << std::endl; - } - - max_iters = 500; - x.coefs[0] = 0.9; - if (matvec_with_comm_overlap) { - time_kernels(A, b, x, matvec_overlap(), max_iters, rnorm, cg_times); - } - else { - time_kernels(A, b, x, matvec_std(), max_iters, rnorm, cg_times); - } - num_iters = max_iters; - std::string title("Kernel timings"); -#else - if (myproc==0) { - std::cout << "Starting CG solver ... " << std::endl; - } - - if (matvec_with_comm_overlap) { -#ifdef MINIFE_CSR_MATRIX - rearrange_matrix_local_external(A); - cg_solve(A, b, x, matvec_overlap(), max_iters, tol, - num_iters, rnorm, cg_times); -#else - std::cout << "ERROR, matvec with overlapping comm/comp only works with CSR matrix."<(), max_iters, tol, - num_iters, rnorm, cg_times); - if (myproc == 0) { - std::cout << "Final Resid Norm: " << rnorm << std::endl; - } - -#ifdef MINIFE_DEBUG - if (myproc == 0) { - std::cout << "verifying solution..." << std::endl; - } - verify_solution(mesh, x); -#endif - } - -#ifdef MINIFE_DEBUG - write_vector("x.vec", x); -#endif - std::string title("CG solve"); -#endif - - if (myproc == 0) { - ydoc.get("Global Run Parameters")->add("ScalarType",TypeTraits::name()); - ydoc.get("Global Run Parameters")->add("GlobalOrdinalType",TypeTraits::name()); - ydoc.get("Global Run Parameters")->add("LocalOrdinalType",TypeTraits::name()); - ydoc.add(title,""); - ydoc.get(title)->add("Iterations",num_iters); - ydoc.get(title)->add("Final Resid Norm",rnorm); - - GlobalOrdinal global_nrows = global_nx; - global_nrows *= global_ny*global_nz; - - //flops-per-mv, flops-per-dot, flops-per-waxpy: - double mv_flops = global_nnz*2.0; - double dot_flops = global_nrows*2.0; - double waxpy_flops = global_nrows*3.0; - -#if MINIFE_KERNELS == 0 -//if MINIFE_KERNELS == 0 then we did a CG solve, and in that case -//there were num_iters+1 matvecs, num_iters*2 dots, and num_iters*3+2 waxpys. - mv_flops *= (num_iters+1); - dot_flops *= (2*num_iters); - waxpy_flops *= (3*num_iters+2); -#else -//if MINIFE_KERNELS then we did one of each operation per iteration. 
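//Worked example with hypothetical numbers: for global_nnz = 27e6 a matvec is
//2*27e6 = 5.4e7 flops; with num_iters = 500 kernel-timing iterations that is
//2.7e10 flops, so a measured MATVEC time of 10 seconds would be reported as
//1.e-6 * 2.7e10 / 10 = 2700 Mflops.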
- mv_flops *= num_iters; - dot_flops *= num_iters; - waxpy_flops *= num_iters; -#endif - - double total_flops = mv_flops + dot_flops + waxpy_flops; - - double mv_mflops = -1; - if (cg_times[MATVEC] > 1.e-4) - mv_mflops = 1.e-6 * (mv_flops/cg_times[MATVEC]); - - double dot_mflops = -1; - if (cg_times[DOT] > 1.e-4) - dot_mflops = 1.e-6 * (dot_flops/cg_times[DOT]); - - double waxpy_mflops = -1; - if (cg_times[WAXPY] > 1.e-4) - waxpy_mflops = 1.e-6 * (waxpy_flops/cg_times[WAXPY]); - - double total_mflops = -1; - if (cg_times[TOTAL] > 1.e-4) - total_mflops = 1.e-6 * (total_flops/cg_times[TOTAL]); - - ydoc.get(title)->add("WAXPY Time",cg_times[WAXPY]); - ydoc.get(title)->add("WAXPY Flops",waxpy_flops); - if (waxpy_mflops >= 0) - ydoc.get(title)->add("WAXPY Mflops",waxpy_mflops); - else - ydoc.get(title)->add("WAXPY Mflops","inf"); - - ydoc.get(title)->add("DOT Time",cg_times[DOT]); - ydoc.get(title)->add("DOT Flops",dot_flops); - if (dot_mflops >= 0) - ydoc.get(title)->add("DOT Mflops",dot_mflops); - else - ydoc.get(title)->add("DOT Mflops","inf"); - - ydoc.get(title)->add("MATVEC Time",cg_times[MATVEC]); - ydoc.get(title)->add("MATVEC Flops",mv_flops); - if (mv_mflops >= 0) - ydoc.get(title)->add("MATVEC Mflops",mv_mflops); - else - ydoc.get(title)->add("MATVEC Mflops","inf"); - -#ifdef MINIFE_FUSED - ydoc.get(title)->add("MATVECDOT Time",cg_times[MATVECDOT]); - ydoc.get(title)->add("MATVECDOT Flops",mv_flops); - if (mv_mflops >= 0) - ydoc.get(title)->add("MATVECDOT Mflops",mv_mflops); - else - ydoc.get(title)->add("MATVECDOT Mflops","inf"); -#endif - -#if MINIFE_KERNELS == 0 - ydoc.get(title)->add("Total",""); - ydoc.get(title)->get("Total")->add("Total CG Time",cg_times[TOTAL]); - ydoc.get(title)->get("Total")->add("Total CG Flops",total_flops); - if (total_mflops >= 0) - ydoc.get(title)->get("Total")->add("Total CG Mflops",total_mflops); - else - ydoc.get(title)->get("Total")->add("Total CG Mflops","inf"); - ydoc.get(title)->add("Time per iteration",cg_times[TOTAL]/num_iters); -#endif - } -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/exchange_externals.hpp b/kokkos/basic/exchange_externals.hpp deleted file mode 100644 index 167ba1b..0000000 --- a/kokkos/basic/exchange_externals.hpp +++ /dev/null @@ -1,270 +0,0 @@ -#ifndef _exchange_externals_hpp_ -#define _exchange_externals_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -#include - -#include - -namespace miniFE { - -template -void -exchange_externals(MatrixType& A, - VectorType& x) -{ -#ifdef HAVE_MPI -#ifdef MINIFE_DEBUG - std::ostream& os = outstream(); - os << "entering exchange_externals\n"; -#endif - - int numprocs = 1; - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - - if (numprocs < 2) return; - - typedef typename MatrixType::ScalarType Scalar; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - - // Extract Matrix pieces - - int local_nrow = A.rows.size(); - int num_neighbors = A.neighbors.size(); - const std::vector& recv_length = A.recv_length; - const std::vector& send_length = A.send_length; - const std::vector& neighbors = A.neighbors; - const std::vector& elements_to_send = A.elements_to_send; - - std::vector& send_buffer = A.send_buffer; - - // - // first post receives, these are immediate receives - // Do not wait for result to come, will do that at the - // wait call below. - // - - int MPI_MY_TAG = 99; - - std::vector& request = A.request; - - // - // Externals are at end of locals - // - - std::vector& x_coefs = x.coefs; - Scalar* x_external = &(x_coefs[local_nrow]); - - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - - // Post receives first - for(int i=0; i x.coefs.size()) { - os << "error, out-of-range. x.coefs.size()=="< exch_ext_requests; -#endif - -template -void -begin_exchange_externals(MatrixType& A, - VectorType& x) -{ -#ifdef HAVE_MPI - - int numprocs = 1, myproc = 0; - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); - - if (numprocs < 2) return; - - typedef typename MatrixType::ScalarType Scalar; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - - // Extract Matrix pieces - - int local_nrow = A.rows.size(); - int num_neighbors = A.neighbors.size(); - const std::vector& recv_length = A.recv_length; - const std::vector& send_length = A.send_length; - const std::vector& neighbors = A.neighbors; - const std::vector& elements_to_send = A.elements_to_send; - - std::vector send_buffer(elements_to_send.size(), 0); - - // - // first post receives, these are immediate receives - // Do not wait for result to come, will do that at the - // wait call below. 
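  //(Posting the receives before any sends is the usual MPI pattern here: a
  //pre-posted MPI_Irecv lets each neighbor's data land directly in
  //x_external instead of sitting in MPI's unexpected-message buffers.)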
- // - - int MPI_MY_TAG = 99; - - exch_ext_requests.resize(num_neighbors); - - // - // Externals are at end of locals - // - - std::vector& x_coefs = x.coefs; - Scalar* x_external = &(x_coefs[local_nrow]); - - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - - // Post receives first - for(int i=0; i -#include -#include -#include - -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -int -generate_matrix_structure(const simple_mesh_description& mesh, - MatrixType& A) -{ - int myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - int threw_exc = 0; - try { - - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - - int global_nodes_x = mesh.global_box[0][1]+1; - int global_nodes_y = mesh.global_box[1][1]+1; - int global_nodes_z = mesh.global_box[2][1]+1; - Box box; - copy_box(mesh.local_box, box); - - //num-owned-nodes in each dimension is num-elems+1 - //only if num-elems > 0 in that dimension *and* - //we are at the high end of the global range in that dimension: - if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1]; - if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1]; - if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1]; - - GlobalOrdinal global_nrows = global_nodes_x; - global_nrows *= global_nodes_y*global_nodes_z; - - GlobalOrdinal nrows = get_num_ids(box); - try { - A.reserve_space(nrows, 27); - } - catch(std::exception& exc) { - std::ostringstream osstr; - osstr << "One of A.rows.resize, A.row_offsets.resize, A.packed_cols.reserve or A.packed_coefs.reserve: nrows=" < rows(nrows); - std::vector row_offsets(nrows+1); - std::vector row_coords(nrows*3); - - unsigned roffset = 0; - GlobalOrdinal nnz = 0; - - for(int iz=box[2][0]; iz(global_nodes_x, global_nodes_y, global_nodes_z, - ix, iy, iz); - rows[roffset] = mesh.map_id_to_row(row_id); - row_coords[roffset*3] = ix; - row_coords[roffset*3+1] = iy; - row_coords[roffset*3+2] = iz; - row_offsets[roffset++] = nnz; - - GlobalOrdinal row_begin_offset = nnz; - for(int sz=-1; sz<=1; ++sz) { - for(int sy=-1; sy<=1; ++sy) { - for(int sx=-1; sx<=1; ++sx) { - GlobalOrdinal col_id = - get_id(global_nodes_x, global_nodes_y, global_nodes_z, - ix+sx, iy+sy, iz+sz); - if (col_id >= 0 && col_id < global_nrows) { - ++nnz; - } - } - } - } - } - } - } - row_offsets[roffset] = nnz; - init_matrix(A, rows, row_offsets, row_coords, - global_nodes_x, global_nodes_y, global_nodes_z, global_nrows, mesh); - } - catch(...) { - std::cout << "proc " << myproc << " threw an exception in generate_matrix_structure, probably due to running out of memory." << std::endl; - threw_exc = 1; - } -#ifdef HAVE_MPI - int global_throw = 0; - MPI_Allreduce(&threw_exc, &global_throw, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - threw_exc = global_throw; -#endif - if (threw_exc) { - return 1; - } - - return 0; -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/get_common_files b/kokkos/basic/get_common_files deleted file mode 100755 index dec46a7..0000000 --- a/kokkos/basic/get_common_files +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -dir=../../common - -cp ${dir}/YAML_Doc.cpp . -cp ${dir}/YAML_Doc.hpp . -cp ${dir}/YAML_Element.cpp . -cp ${dir}/YAML_Element.hpp . - -cp ${dir}/generate_info_header . 
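The matrix-structure generation above reserves 27 entries per row because each node couples to at most a 3x3x3 block of neighboring nodes; nodes on the global boundary get fewer columns since get_id returns -1 for out-of-range coordinates. A minimal standalone sketch of that counting logic (illustrative helper name, same bounds convention as get_id):

inline int count_stencil_cols(int nx, int ny, int nz, int x, int y, int z)
{
  //Count how many of the 27 stencil positions around node (x,y,z) fall
  //inside an nx-by-ny-by-nz node grid.
  int count = 0;
  for(int sz=-1; sz<=1; ++sz) {
    for(int sy=-1; sy<=1; ++sy) {
      for(int sx=-1; sx<=1; ++sx) {
        const int cx = x+sx, cy = y+sy, cz = z+sz;
        if (cx >= 0 && cy >= 0 && cz >= 0 && cx < nx && cy < ny && cz < nz) {
          ++count; //mirrors the 'col_id >= 0 && col_id < global_nrows' test
        }
      }
    }
  }
  return count; //interior: 27, face: 18, edge: 12, corner: 8
}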
- diff --git a/kokkos/basic/imbalance.hpp b/kokkos/basic/imbalance.hpp deleted file mode 100644 index f801efc..0000000 --- a/kokkos/basic/imbalance.hpp +++ /dev/null @@ -1,271 +0,0 @@ -#ifndef _imbalance_hpp_ -#define _imbalance_hpp_ - -#include - -#ifdef HAVE_MPI -#include -#endif - -#include -#include -#include - -namespace miniFE { - -const int X = 0; -const int Y = 1; -const int Z = 2; -const int NONE = 3; - -const int LOWER = 0; -const int UPPER = 1; - -template -void -compute_imbalance(const Box& global_box, - const Box& local_box, - float& largest_imbalance, - float& std_dev, - YAML_Doc& doc, - bool record_in_doc) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - GlobalOrdinal local_nrows = get_num_ids(local_box); - GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0; - int min_proc = myproc, max_proc = myproc; - get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc, - max_nrows, max_proc); - - float avg_nrows = global_nrows; - avg_nrows /= numprocs; - - //largest_imbalance will be the difference between the min (or max) - //rows-per-processor and avg_nrows, represented as a percentage: - largest_imbalance = percentage_difference(min_nrows, avg_nrows); - - float tmp = percentage_difference(max_nrows, avg_nrows); - if (tmp > largest_imbalance) largest_imbalance = tmp; - - std_dev = compute_std_dev_as_percentage(local_nrows, avg_nrows); - - if (myproc == 0 && record_in_doc) { - doc.add("Rows-per-proc Load Imbalance",""); - doc.get("Rows-per-proc Load Imbalance")->add("Largest (from avg, %)",largest_imbalance); - doc.get("Rows-per-proc Load Imbalance")->add("Std Dev (%)",std_dev); - } -} - -std::pair -decide_how_to_grow(const Box& global_box, const Box& local_box) -{ - std::pair result(NONE,UPPER); - - if (local_box[Z][UPPER] < global_box[Z][UPPER]) { - result.first = Z; - result.second = UPPER; - return result; - } - if (local_box[Z][LOWER] > global_box[Z][LOWER]) { - result.first = Z; - result.second = LOWER; - return result; - } - if (local_box[Y][UPPER] < global_box[Y][UPPER]) { - result.first = Y; - result.second = UPPER; - return result; - } - if (local_box[Y][LOWER] > global_box[Y][LOWER]) { - result.first = Y; - result.second = LOWER; - return result; - } - if (local_box[X][UPPER] < global_box[X][UPPER]) { - result.first = X; - result.second = UPPER; - return result; - } - if (local_box[X][LOWER] > global_box[X][LOWER]) { - result.first = X; - result.second = LOWER; - return result; - } - return result; -} - -std::pair -decide_how_to_shrink(const Box& global_box, const Box& local_box) -{ - std::pair result(NONE,UPPER); - - if (local_box[Z][UPPER] < global_box[Z][UPPER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) { - result.first = Z; - result.second = UPPER; - return result; - } - if (local_box[Z][LOWER] > global_box[Z][LOWER] && local_box[Z][UPPER]-local_box[Z][LOWER] > 2) { - result.first = Z; - result.second = LOWER; - return result; - } - if (local_box[Y][UPPER] < global_box[Y][UPPER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) { - result.first = Y; - result.second = UPPER; - return result; - } - if (local_box[Y][LOWER] > global_box[Y][LOWER] && local_box[Y][UPPER]-local_box[Y][LOWER] > 2) { - result.first = Y; - result.second = LOWER; - return result; - } - if (local_box[X][UPPER] < global_box[X][UPPER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) { - result.first = X; - result.second = UPPER; - return result; - } - if (local_box[X][LOWER] > 
global_box[X][LOWER] && local_box[X][UPPER]-local_box[X][LOWER] > 2) { - result.first = X; - result.second = LOWER; - return result; - } - return result; -} - -template -void -add_imbalance(const Box& global_box, - Box& local_box, - float imbalance, - YAML_Doc& doc) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs == 1) { - return; - } - - float cur_imbalance = 0, cur_std_dev = 0; - compute_imbalance(global_box, local_box, - cur_imbalance, cur_std_dev, doc, false); - - while (cur_imbalance < imbalance) { - GlobalOrdinal local_nrows = get_num_ids(local_box); - GlobalOrdinal min_nrows = 0, max_nrows = 0, global_nrows = 0; - int min_proc = myproc, max_proc = myproc; - get_global_min_max(local_nrows, global_nrows, min_nrows, min_proc, - max_nrows, max_proc); - - std::pair grow(NONE,UPPER); - int grow_axis_val = -1; - std::pair shrink(NONE,UPPER); - int shrink_axis_val = -1; - - if (myproc == max_proc) { - grow = decide_how_to_grow(global_box, local_box); - if (grow.first != NONE) { - grow_axis_val = local_box[grow.first][grow.second]; - } - } - if (myproc == min_proc) { - shrink = decide_how_to_shrink(global_box, local_box); - if (shrink.first != NONE) { - shrink_axis_val = local_box[shrink.first][shrink.second]; - } - } - - int grow_info[8] = {grow.first, grow.second, - local_box[X][0], local_box[X][1], - local_box[Y][0], local_box[Y][1], - local_box[Z][0], local_box[Z][1]}; - - int shrink_info[8] = {shrink.first, shrink.second, - local_box[X][0], local_box[X][1], - local_box[Y][0], local_box[Y][1], - local_box[Z][0], local_box[Z][1]}; -#ifdef HAVE_MPI - MPI_Bcast(&grow_info[0], 8, MPI_INT, max_proc, MPI_COMM_WORLD); - MPI_Bcast(&shrink_info[0], 8, MPI_INT, min_proc, MPI_COMM_WORLD); -#endif - - int grow_axis = grow_info[0]; - int grow_end = grow_info[1]; - int shrink_axis = shrink_info[0]; - int shrink_end = shrink_info[1]; - int grow_incr = 1; - if (grow_end == LOWER) grow_incr = -1; - int shrink_incr = -1; - if (shrink_end == LOWER) shrink_incr = 1; - if (grow_axis != NONE) grow_axis_val = grow_info[2+grow_axis*2+grow_end]; - if (shrink_axis != NONE) shrink_axis_val = shrink_info[2+shrink_axis*2+shrink_end]; - - if (grow_axis == NONE && shrink_axis == NONE) break; - - bool grow_status = grow_axis==NONE ? false : true; - if (grow_axis != NONE) { - if ((grow_incr == 1 && local_box[grow_axis][0] == grow_axis_val) || - (grow_incr == -1 && local_box[grow_axis][1] == grow_axis_val)) { - if (local_box[grow_axis][1] - local_box[grow_axis][0] < 2) { - grow_status = false; - } - } - } - - bool shrink_status = shrink_axis==NONE ? false : true; - if (shrink_axis != NONE) { - if ((shrink_incr == 1 && local_box[shrink_axis][0] == shrink_axis_val) || - (shrink_incr == -1 && local_box[shrink_axis][1] == shrink_axis_val)) { - if (local_box[shrink_axis][1] - local_box[shrink_axis][0] < 2) { - shrink_status = false; - } - } - } - -#ifdef HAVE_MPI - int statusints[2] = { grow_status ? 0 : 1, shrink_status ? 0 : 1 }; - int globalstatus[2] = { 0, 0 }; - MPI_Allreduce(&statusints, &globalstatus, 2, MPI_INT, MPI_SUM, MPI_COMM_WORLD); - grow_status = globalstatus[0]>0 ? false : true; - shrink_status = globalstatus[1]>0 ? 
false : true; -#endif - - if (grow_status == false && shrink_status == false) break; - - if (grow_status && grow_axis != NONE) { - if (local_box[grow_axis][0] == grow_axis_val) { - local_box[grow_axis][0] += grow_incr; - } - - if (local_box[grow_axis][1] == grow_axis_val) { - local_box[grow_axis][1] += grow_incr; - } - } - - if (shrink_status && shrink_axis != NONE) { - if (local_box[shrink_axis][0] == shrink_axis_val) { - local_box[shrink_axis][0] += shrink_incr; - } - - if (local_box[shrink_axis][1] == shrink_axis_val) { - local_box[shrink_axis][1] += shrink_incr; - } - } - - compute_imbalance(global_box, local_box, - cur_imbalance, cur_std_dev, doc, false); - } -} - -}//namespace miniFE - -#endif - diff --git a/kokkos/basic/main.cpp b/kokkos/basic/main.cpp deleted file mode 100644 index ed3753f..0000000 --- a/kokkos/basic/main.cpp +++ /dev/null @@ -1,247 +0,0 @@ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER -#include -#include -#include -#include - -#include - -#include - -#ifdef HAVE_MPI -#include -#endif - -//-------------------------------------------------------------------- -#include -//-------------------------------------------------------------------- - -#include -#include -#include -#include -#include -#include -#include - -#if MINIFE_INFO != 0 -#include -#else -#include -#endif - -//The following macros should be specified as compile-macros in the -//makefile. They are defaulted here just in case... -#ifndef MINIFE_SCALAR -#define MINIFE_SCALAR double -#endif -#ifndef MINIFE_LOCAL_ORDINAL -#define MINIFE_LOCAL_ORDINAL int -#endif -#ifndef MINIFE_GLOBAL_ORDINAL -#define MINIFE_GLOBAL_ORDINAL int -#endif - -// ************************************************************************ - -void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params); -void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads); -void add_timestring_to_yaml(YAML_Doc& doc); - -inline void print_box(int myproc, const char* name, const Box& box, - const char* name2, const Box& box2) -{ - std::cout << "proc " << myproc << " "< local_boxes(numprocs); - - box_partition(0, numprocs, 2, global_box, &local_boxes[0]); - - Box& my_box = local_boxes[myproc]; - -//print_box(myproc, "global-box", global_box, "local-box", my_box); - - std::ostringstream osstr; - osstr << "miniFE." 
<< params.nx << "x" << params.ny << "x" << params.nz; -#ifdef HAVE_MPI - osstr << ".P"<. - //To run miniFE with float instead of double, or 'long long' instead of int, - //etc., change these template-parameters by changing the macro definitions in - //the makefile or on the make command-line. - - miniFE::driver< MINIFE_SCALAR, MINIFE_LOCAL_ORDINAL, MINIFE_GLOBAL_ORDINAL, - ComputeNodeType>(global_box, my_box, compute_node, params, doc); - - miniFE::timer_type total_time = miniFE::mytimer() - start_time; - - if (myproc == 0) { - doc.add("Total Program Time",total_time); - std::cout << doc.generateYAML() << std::endl; - } - - miniFE::finalize_mpi(); - - return 0; -} - -void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params) -{ - doc.add("Global Run Parameters",""); - doc.get("Global Run Parameters")->add("dimensions",""); - doc.get("Global Run Parameters")->get("dimensions")->add("nx",params.nx); - doc.get("Global Run Parameters")->get("dimensions")->add("ny",params.ny); - doc.get("Global Run Parameters")->get("dimensions")->add("nz",params.nz); - doc.get("Global Run Parameters")->add("load_imbalance", params.load_imbalance); - if (params.mv_overlap_comm_comp == 1) { - std::string val("1 (yes)"); - doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val); - } - else { - std::string val("0 (no)"); - doc.get("Global Run Parameters")->add("mv_overlap_comm_comp", val); - } -} - -void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads) -{ - doc.get("Global Run Parameters")->add("number of processors", numprocs); - std::string threading("none"); - -#ifdef MINIFE_HAVE_TPI - threading = "TPI"; -#endif -#ifdef MINIFE_HAVE_TBB - threading = "TBB"; -#endif -#ifdef MINIFE_HAVE_CUDA - threading = "CUDA"; -#endif - if (threading != "none") { - doc.get("Global Run Parameters")->add("(per proc) numthreads",numthreads); - } - - doc.add("Platform",""); - doc.get("Platform")->add("hostname",MINIFE_HOSTNAME); - doc.get("Platform")->add("kernel name",MINIFE_KERNEL_NAME); - doc.get("Platform")->add("kernel release",MINIFE_KERNEL_RELEASE); - doc.get("Platform")->add("processor",MINIFE_PROCESSOR); - - doc.add("Build",""); - doc.get("Build")->add("CXX",MINIFE_CXX); - doc.get("Build")->add("compiler version",MINIFE_CXX_VERSION); - doc.get("Build")->add("CXXFLAGS",MINIFE_CXXFLAGS); - std::string using_mpi("no"); -#ifdef HAVE_MPI - using_mpi = "yes"; -#endif - doc.get("Build")->add("using MPI",using_mpi); - doc.get("Build")->add("Threading",threading.c_str()); -} - -void add_timestring_to_yaml(YAML_Doc& doc) -{ - std::time_t rawtime; - struct tm * timeinfo; - std::time(&rawtime); - timeinfo = std::localtime(&rawtime); - std::ostringstream osstr; - osstr.fill('0'); - osstr << timeinfo->tm_year+1900 << "-"; - osstr.width(2); osstr << timeinfo->tm_mon+1 << "-"; - osstr.width(2); osstr << timeinfo->tm_mday << ", "; - osstr.width(2); osstr << timeinfo->tm_hour << "-"; - osstr.width(2); osstr << timeinfo->tm_min << "-"; - osstr.width(2); osstr << timeinfo->tm_sec; - std::string timestring = osstr.str(); - doc.add("Run Date/Time",timestring); -} - diff --git a/kokkos/basic/make_local_matrix.hpp b/kokkos/basic/make_local_matrix.hpp deleted file mode 100644 index 99c2cf7..0000000 --- a/kokkos/basic/make_local_matrix.hpp +++ /dev/null @@ -1,440 +0,0 @@ -#ifndef _make_local_matrix_hpp_ -#define _make_local_matrix_hpp_ - -//@HEADER -// ************************************************************************ -// -// MiniFE: Simple Finite Element Assembly and Solve -// Copyright (2006-2013) 
Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// -// ************************************************************************ -//@HEADER - -#include - -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -void -make_local_matrix(MatrixType& A) -{ -#ifdef HAVE_MPI - int numprocs = 1, myproc = 0; - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); - - if (numprocs < 2) { - A.num_cols = A.rows.size(); - A.has_local_indices = true; - return; - } - - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::LocalOrdinalType LocalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - - std::map externals; - LocalOrdinal num_external = 0; - - //Extract Matrix pieces - - size_t local_nrow = A.rows.size(); - GlobalOrdinal start_row = local_nrow>0 ? A.rows[0] : -1; - GlobalOrdinal stop_row = local_nrow>0 ? A.rows[local_nrow-1] : -1; - - // We need to convert the index values for the rows on this processor - // to a local index space. We need to: - // - Determine if each index reaches to a local value or external value - // - If local, subtract start_row from index value to get local index - // - If external, find out if it is already accounted for. - // - If so, then do nothing, - // - otherwise - // - add it to the list of external indices, - // - find out which processor owns the value. - // - Set up communication for sparse MV operation - - /////////////////////////////////////////// - // Scan the indices and transform to local - /////////////////////////////////////////// - - std::vector& external_index = A.external_index; - - for(size_t i=0; i tmp_buffer(numprocs, 0); // Temp buffer space needed below - - // Build list of global index offset - - std::vector global_index_offsets(numprocs, 0); - - tmp_buffer[myproc] = start_row; // This is my start row - - // This call sends the start_row of each ith processor to the ith - // entry of global_index_offsets on all processors. - // Thus, each processor knows the range of indices owned by all - // other processors. - // Note: There might be a better algorithm for doing this, but this - // will work... 
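The comment above describes the one trick in make_local_matrix that is easy to miss: each rank writes its start_row into its own slot of a zero-filled buffer of length numprocs, and the summing MPI_Allreduce that follows turns that into a table of every rank's first owned row, available on every rank. The standalone demo below reproduces the idea; it is an invented illustration (the 100-rows-per-rank sizing and the hard-coded long long ordinal are not from miniFE, which picks the MPI datatype via TypeTraits), and the commented-out MPI_Allgather shows one simpler alternative in the spirit of the "better algorithm" the comment mentions.

    // Assumed demo of the "Allreduce as all-gather" trick, not part of miniFE.
    #include <mpi.h>
    #include <vector>
    #include <cstdio>

    int main(int argc, char** argv)
    {
      MPI_Init(&argc, &argv);
      int numprocs = 1, myproc = 0;
      MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
      MPI_Comm_rank(MPI_COMM_WORLD, &myproc);

      long long start_row = 100LL * myproc;      // pretend each rank owns 100 rows

      std::vector<long long> tmp(numprocs, 0), offsets(numprocs, 0);
      tmp[myproc] = start_row;                   // only my own slot is non-zero
      MPI_Allreduce(tmp.data(), offsets.data(), numprocs,
                    MPI_LONG_LONG, MPI_SUM, MPI_COMM_WORLD);

      // MPI_Allgather(&start_row, 1, MPI_LONG_LONG,
      //               offsets.data(), 1, MPI_LONG_LONG, MPI_COMM_WORLD);
      // would build the same table while sending one value per rank.

      if (myproc == 0)
        for (int p = 0; p < numprocs; ++p)
          std::printf("rank %d starts at global row %lld\n", p, offsets[p]);

      MPI_Finalize();
      return 0;
    }

The Allreduce call in the deleted code, shown next, is exactly this operation with the buffers and MPI datatype taken from the matrix's ordinal type.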
- - MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); - MPI_Allreduce(&tmp_buffer[0], &global_index_offsets[0], numprocs, mpi_dtype, - MPI_SUM, MPI_COMM_WORLD); - - // Go through list of externals and find the processor that owns each - std::vector external_processor(num_external); - - for(LocalOrdinal i=0; i=0; --j) { - if (global_index_offsets[j] <= cur_ind && global_index_offsets[j] >= 0) { - external_processor[i] = j; - break; - } - } - } - - ///////////////////////////////////////////////////////////////////////// - // Sift through the external elements. For each newly encountered external - // point assign it the next index in the sequence. Then look for other - // external elements who are updated by the same node and assign them the next - // set of index numbers in the sequence (ie. elements updated by the same node - // have consecutive indices). - ///////////////////////////////////////////////////////////////////////// - - size_t count = local_nrow; - std::vector& external_local_index = A.external_local_index; - external_local_index.assign(num_external, -1); - - for(LocalOrdinal i=0; i new_external_processor(num_external, 0); - - for(int i=0; i No external elements are updated by - // processor i. - // tmp_neighbors[i] = x ==> (x-1)/numprocs elements are updated from - // processor i. - /// - //////////////////////////////////////////////////////////////////////// - - std::vector tmp_neighbors(numprocs, 0); - - int num_recv_neighbors = 0; - int length = 1; - - for(LocalOrdinal i=0; i recv_list; - recv_list.push_back(new_external_processor[0]); - for(LocalOrdinal i=1; i send_list(num_send_neighbors, 0); - - // - // first post receives, these are immediate receives - // Do not wait for result to come, will do that at the - // wait call below. - // - int MPI_MY_TAG = 99; - - std::vector request(num_send_neighbors); - for(int i=0; i new_external(num_external); - for(LocalOrdinal i=0; i lengths(num_recv_neighbors); - - ++MPI_MY_TAG; - - // First post receives - - for(int i=0; i& neighbors = A.neighbors; - std::vector& recv_length = A.recv_length; - std::vector& send_length = A.send_length; - - neighbors.resize(num_recv_neighbors, 0); - A.request.resize(num_recv_neighbors); - recv_length.resize(num_recv_neighbors, 0); - send_length.resize(num_recv_neighbors, 0); - - LocalOrdinal j = 0; - for(int i=0; i $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros - -uninstall-hook: - rm -f $(includedir)/Makefile.export.threadpool - rm -f $(includedir)/Makefile.export.threadpool.macros - -else - -install-exec-hook: - -uninstall-hook: - -endif - -## ####################################################################### -## Subdirectories to be make'd recursively -## ####################################################################### -#We now build tests and examples through separate make targets, rather than -#during "make". We still need to conditionally include the test and example -#in SUBDIRS, even though BUILD_TESTS and BUILD_EXAMPLES will never be -#defined, so that the tests and examples are included in the distribution -#tarball. - -if SUB_TEST -TEST_SUBDIR=test -endif - -#if SUB_EXAMPLE -#EXAMPLE_SUBDIR=example -#endif - -# #np# - The following make targets must be defined for all packages. -# #np# - If the package does not have tests or examples, replace the -# #np# - corresponding rules with something like: -# #np# - @echo "new_package does not have any tests yet" -if BUILD_TESTS -tests: - @echo "" - @echo "Now building ThreadPool tests." 
- @echo "" - cd $(top_builddir)/test && $(MAKE) - @echo "" - @echo "Finished building ThreadPool tests." - @echo "" -else -tests: - @echo "ThreadPool tests were disabled at configure time" -endif - -examples: - @echo "ThreadPool does not have any examples yet" - -install-examples: - @echo "ThreadPool does not have any examples yet" - -clean-tests: - cd $(top_builddir)/test && $(MAKE) clean - -clean-examples: - @echo "ThreadPool does not have any examples yet" - -everything: - $(MAKE) && $(MAKE) examples && $(MAKE) tests - -clean-everything: - $(MAKE) clean-examples && $(MAKE) clean-tests && $(MAKE) clean - -install-everything: - $(MAKE) install && $(MAKE) install-examples - -SUBDIRS = src $(TEST_SUBDIR) - -## ####################################################################### -## The below targets allow you to use the new -## testharness to run the test suite as make targets -## ####################################################################### - -TRILINOS_HOME_DIR=@abs_top_srcdir@/../.. -TRILINOS_BUILD_DIR=@abs_top_builddir@/../.. -TRILINOS_TEST_CATEGORY=INSTALL - -runtests-serial : - $(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \ - --trilinos-dir=$(TRILINOS_HOME_DIR) \ - --comm=serial \ - --build-dir=$(TRILINOS_BUILD_DIR) \ - --category=$(TRILINOS_TEST_CATEGORY) \ - --output-dir=@abs_top_builddir@/test/runtests-results \ - --verbosity=1 \ - --packages=ThreadPool - -runtests-mpi : - $(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \ - --trilinos-dir=$(TRILINOS_HOME_DIR) \ - --comm=mpi \ - --mpi-go=$(TRILINOS_MPI_GO) \ - --build-dir=$(TRILINOS_BUILD_DIR) \ - --category=$(TRILINOS_TEST_CATEGORY) \ - --output-dir=@abs_top_builddir@/test/runtests-results \ - --verbosity=1 \ - --packages=ThreadPool - -if HAVE_MPI -THREADPOOL_CHECK_COMM=mpi -else -THREADPOOL_CHECK_COMM=serial -endif - diff --git a/kokkos/basic/optional/ThreadPool/Makefile.export.threadpool.in b/kokkos/basic/optional/ThreadPool/Makefile.export.threadpool.in deleted file mode 100644 index 66bfda9..0000000 --- a/kokkos/basic/optional/ThreadPool/Makefile.export.threadpool.in +++ /dev/null @@ -1,9 +0,0 @@ -_THREADPOOL_INCLUDES = -I@abs_top_srcdir@/include -I@abs_top_builddir@/include - -_THREADPOOL_LIBS = @LDFLAGS@ -L@abs_top_builddir@/src -ltpi $(LIBS) - -@USING_GNUMAKE_TRUE@THREADPOOL_INCLUDES = $(shell @PERL_EXE@ @abs_top_srcdir@/config/strip_dup_incl_paths.pl $(_THREADPOOL_INCLUDES)) -@USING_GNUMAKE_TRUE@THREADPOOL_LIBS = $(shell @PERL_EXE@ @abs_top_srcdir@/config/strip_dup_libs.pl $(_THREADPOOL_LIBS)) - -@USING_GNUMAKE_FALSE@THREADPOOL_INCLUDES = $(_THREADPOOL_INCLUDES) -@USING_GNUMAKE_FALSE@THREADPOOL_LIBS = $(_THREADPOOL_LIBS) diff --git a/kokkos/basic/optional/ThreadPool/Makefile.in b/kokkos/basic/optional/ThreadPool/Makefile.in deleted file mode 100644 index 3e4abfd..0000000 --- a/kokkos/basic/optional/ThreadPool/Makefile.in +++ /dev/null @@ -1,777 +0,0 @@ -# Makefile.in generated by automake 1.10 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
- -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -# @HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. -# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -# @HEADER -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -subdir = . 
-DIST_COMMON = $(am__configure_deps) $(srcdir)/Makefile.am \ - $(srcdir)/Makefile.export.threadpool.in $(srcdir)/Makefile.in \ - $(top_srcdir)/configure config/config.guess config/config.sub \ - config/depcomp config/install-sh config/missing -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \ - $(top_srcdir)/config/tac_arg_check_mpi.m4 \ - $(top_srcdir)/config/tac_arg_config_mpi.m4 \ - $(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \ - $(top_srcdir)/config/tac_arg_with_ar.m4 \ - $(top_srcdir)/config/tac_arg_with_flags.m4 \ - $(top_srcdir)/config/tac_arg_with_incdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libs.m4 \ - $(top_srcdir)/config/tac_arg_with_perl.m4 \ - $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \ - configure.lineno config.status.lineno -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/src/ThreadPool_config.h -CONFIG_CLEAN_FILES = Makefile.export.threadpool -SOURCES = -DIST_SOURCES = -RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ - html-recursive info-recursive install-data-recursive \ - install-dvi-recursive install-exec-recursive \ - install-html-recursive install-info-recursive \ - install-pdf-recursive install-ps-recursive install-recursive \ - installcheck-recursive installdirs-recursive pdf-recursive \ - ps-recursive uninstall-recursive -RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ - distclean-recursive maintainer-clean-recursive -ETAGS = etags -CTAGS = ctags -DIST_SUBDIRS = src test -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -distdir = $(PACKAGE)-$(VERSION) -top_distdir = $(distdir) -am__remove_distdir = \ - { test ! -d $(distdir) \ - || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \ - && rm -fr $(distdir); }; } -DIST_ARCHIVES = $(distdir).tar.gz -GZIP_ENV = --best -distuninstallcheck_listfiles = find . -type f -print -distcleancheck_listfiles = find . 
-type f -print -ACLOCAL = @ACLOCAL@ -ALTERNATE_AR = @ALTERNATE_AR@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPPFLAGS = @CPPFLAGS@ -CXX = @CXX@ -CXXCPP = @CXXCPP@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -HAVE_PERL = @HAVE_PERL@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -LDFLAGS = @LDFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MKDIR_P = @MKDIR_P@ -MPI_CC_EXISTS = @MPI_CC_EXISTS@ -MPI_CXX = @MPI_CXX@ -MPI_CXX_EXISTS = @MPI_CXX_EXISTS@ -MPI_F77_EXISTS = @MPI_F77_EXISTS@ -MPI_TEMP_CXX = @MPI_TEMP_CXX@ -OBJEXT = @OBJEXT@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PERL_EXE = @PERL_EXE@ -PTHREAD_CC = @PTHREAD_CC@ -PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_aux_dir = @ac_aux_dir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -AUTOMAKE_OPTIONS = foreign -ACLOCAL_AMFLAGS = -I config - -# -# I believe that by switching to AUX_DIR(../../config) one -# could get rid of these. -# -#np# For a typical package, there is no reason to distribute these files -#np# because users should not have to bootstrap. We distribute them with -#np# new package so that the files can be used in creating the -#np# configure script for other packages. 
-EXTRA_DIST = \ -config/generate-makeoptions.pl \ -config/replace-install-prefix.pl config/string-replace.pl \ -config/strip_dup_incl_paths.pl config/strip_dup_libs.pl \ -config/token-replace.pl - -AUX_DIST = config/install-sh config/missing config/mkinstalldirs -# -# Again, I hope that AUX_DIR(../../config) eliminates these -# config/install-sh config/missing config/mkinstalldirs -MAINTAINERCLEANFILES = Makefile.in aclocal.m4 autom4te.cache/* \ - configure config.status config.log \ - src/common/config-h.in src/common/stamp-h.in \ - $(AUX_DIST) - - -#We now build tests and examples through separate make targets, rather than -#during "make". We still need to conditionally include the test and example -#in SUBDIRS, even though BUILD_TESTS and BUILD_EXAMPLES will never be -#defined, so that the tests and examples are included in the distribution -#tarball. -@SUB_TEST_TRUE@TEST_SUBDIR = test -SUBDIRS = src $(TEST_SUBDIR) -TRILINOS_HOME_DIR = @abs_top_srcdir@/../.. -TRILINOS_BUILD_DIR = @abs_top_builddir@/../.. -TRILINOS_TEST_CATEGORY = INSTALL -@HAVE_MPI_FALSE@THREADPOOL_CHECK_COMM = serial -@HAVE_MPI_TRUE@THREADPOOL_CHECK_COMM = mpi -all: all-recursive - -.SUFFIXES: -am--refresh: - @: -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - echo ' cd $(srcdir) && $(AUTOMAKE) --foreign '; \ - cd $(srcdir) && $(AUTOMAKE) --foreign \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign Makefile'; \ - cd $(top_srcdir) && \ - $(AUTOMAKE) --foreign Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - echo ' $(SHELL) ./config.status'; \ - $(SHELL) ./config.status;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - $(SHELL) ./config.status --recheck - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(srcdir) && $(AUTOCONF) -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS) -Makefile.export.threadpool: $(top_builddir)/config.status $(srcdir)/Makefile.export.threadpool.in - cd $(top_builddir) && $(SHELL) ./config.status $@ - -# This directory's subdirectories are mostly independent; you can cd -# into them and run `make' without going through this Makefile. -# To change the values of `make' variables: instead of editing Makefiles, -# (1) if the variable is set in `config.status', edit `config.status' -# (which will cause the Makefiles to be regenerated when you run `make'); -# (2) otherwise, pass the desired values on the `make' command line. 
-$(RECURSIVE_TARGETS): - @failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - target=`echo $@ | sed s/-recursive//`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - dot_seen=yes; \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done; \ - if test "$$dot_seen" = "no"; then \ - $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ - fi; test -z "$$fail" - -$(RECURSIVE_CLEAN_TARGETS): - @failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - rev=''; for subdir in $$list; do \ - if test "$$subdir" = "."; then :; else \ - rev="$$subdir $$rev"; \ - fi; \ - done; \ - rev="$$rev ."; \ - target=`echo $@ | sed s/-recursive//`; \ - for subdir in $$rev; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done && test -z "$$fail" -tags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ - done -ctags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ - include_option=--etags-include; \ - empty_fix=.; \ - else \ - include_option=--include; \ - empty_fix=; \ - fi; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test ! 
-f $$subdir/TAGS || \ - tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \ - fi; \ - done; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$tags $$unique; \ - fi -ctags: CTAGS -CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - test -z "$(CTAGS_ARGS)$$tags$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$tags $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && cd $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) $$here - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - $(am__remove_distdir) - test -d $(distdir) || mkdir $(distdir) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ - fi; \ - cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ - else \ - test -f $(distdir)/$$file \ - || cp -p $$d/$$file $(distdir)/$$file \ - || exit 1; \ - fi; \ - done - list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ - distdir=`$(am__cd) $(distdir) && pwd`; \ - top_distdir=`$(am__cd) $(top_distdir) && pwd`; \ - (cd $$subdir && \ - $(MAKE) $(AM_MAKEFLAGS) \ - top_distdir="$$top_distdir" \ - distdir="$$distdir/$$subdir" \ - am__remove_distdir=: \ - am__skip_length_check=: \ - distdir) \ - || exit 1; \ - fi; \ - done - -find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \ - ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \ - ! -type d ! -perm -400 -exec chmod a+r {} \; -o \ - ! -type d ! 
-perm -444 -exec $(install_sh) -c -m a+r {} {} \; \ - || chmod -R a+r $(distdir) -dist-gzip: distdir - tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz - $(am__remove_distdir) - -dist-bzip2: distdir - tardir=$(distdir) && $(am__tar) | bzip2 -9 -c >$(distdir).tar.bz2 - $(am__remove_distdir) - -dist-tarZ: distdir - tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z - $(am__remove_distdir) - -dist-shar: distdir - shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz - $(am__remove_distdir) - -dist-zip: distdir - -rm -f $(distdir).zip - zip -rq $(distdir).zip $(distdir) - $(am__remove_distdir) - -dist dist-all: distdir - tardir=$(distdir) && $(am__tar) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz - $(am__remove_distdir) - -# This target untars the dist file and tries a VPATH configuration. Then -# it guarantees that the distribution is self-contained by making another -# tarfile. -distcheck: dist - case '$(DIST_ARCHIVES)' in \ - *.tar.gz*) \ - GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(am__untar) ;;\ - *.tar.bz2*) \ - bunzip2 -c $(distdir).tar.bz2 | $(am__untar) ;;\ - *.tar.Z*) \ - uncompress -c $(distdir).tar.Z | $(am__untar) ;;\ - *.shar.gz*) \ - GZIP=$(GZIP_ENV) gunzip -c $(distdir).shar.gz | unshar ;;\ - *.zip*) \ - unzip $(distdir).zip ;;\ - esac - chmod -R a-w $(distdir); chmod a+w $(distdir) - mkdir $(distdir)/_build - mkdir $(distdir)/_inst - chmod a-w $(distdir) - dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \ - && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \ - && cd $(distdir)/_build \ - && ../configure --srcdir=.. --prefix="$$dc_install_base" \ - $(DISTCHECK_CONFIGURE_FLAGS) \ - && $(MAKE) $(AM_MAKEFLAGS) \ - && $(MAKE) $(AM_MAKEFLAGS) dvi \ - && $(MAKE) $(AM_MAKEFLAGS) check \ - && $(MAKE) $(AM_MAKEFLAGS) install \ - && $(MAKE) $(AM_MAKEFLAGS) installcheck \ - && $(MAKE) $(AM_MAKEFLAGS) uninstall \ - && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \ - distuninstallcheck \ - && chmod -R a-w "$$dc_install_base" \ - && ({ \ - (cd ../.. && umask 077 && mkdir "$$dc_destdir") \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \ - && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \ - distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \ - } || { rm -rf "$$dc_destdir"; exit 1; }) \ - && rm -rf "$$dc_destdir" \ - && $(MAKE) $(AM_MAKEFLAGS) dist \ - && rm -rf $(DIST_ARCHIVES) \ - && $(MAKE) $(AM_MAKEFLAGS) distcleancheck - $(am__remove_distdir) - @(echo "$(distdir) archives ready for distribution: "; \ - list='$(DIST_ARCHIVES)'; for i in $$list; do echo $$i; done) | \ - sed -e 1h -e 1s/./=/g -e 1p -e 1x -e '$$p' -e '$$x' -distuninstallcheck: - @cd $(distuninstallcheck_dir) \ - && test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \ - || { echo "ERROR: files left after uninstall:" ; \ - if test -n "$(DESTDIR)"; then \ - echo " (check DESTDIR support)"; \ - fi ; \ - $(distuninstallcheck_listfiles) ; \ - exit 1; } >&2 -distcleancheck: distclean - @if test '$(srcdir)' = . 
; then \ - echo "ERROR: distcleancheck can only run from a VPATH build" ; \ - exit 1 ; \ - fi - @test `$(distcleancheck_listfiles) | wc -l` -eq 0 \ - || { echo "ERROR: files left in build directory after distclean:" ; \ - $(distcleancheck_listfiles) ; \ - exit 1; } >&2 -check-am: all-am -check: check-recursive -all-am: Makefile all-local -installdirs: installdirs-recursive -installdirs-am: -install: install-recursive -install-exec: install-exec-recursive -install-data: install-data-recursive -uninstall: uninstall-recursive - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-recursive -install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) -clean: clean-recursive - -clean-am: clean-generic mostlyclean-am - -distclean: distclean-recursive - -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -f Makefile -distclean-am: clean-am distclean-generic distclean-tags - -dvi: dvi-recursive - -dvi-am: - -html: html-recursive - -info: info-recursive - -info-am: - -install-data-am: - -install-dvi: install-dvi-recursive - -install-exec-am: - @$(NORMAL_INSTALL) - $(MAKE) $(AM_MAKEFLAGS) install-exec-hook - -install-html: install-html-recursive - -install-info: install-info-recursive - -install-man: - -install-pdf: install-pdf-recursive - -install-ps: install-ps-recursive - -installcheck-am: - -maintainer-clean: maintainer-clean-recursive - -rm -f $(am__CONFIG_DISTCLEAN_FILES) - -rm -rf $(top_srcdir)/autom4te.cache - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-recursive - -mostlyclean-am: mostlyclean-generic - -pdf: pdf-recursive - -pdf-am: - -ps: ps-recursive - -ps-am: - -uninstall-am: - @$(NORMAL_INSTALL) - $(MAKE) $(AM_MAKEFLAGS) uninstall-hook - -.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) install-am \ - install-exec-am install-strip uninstall-am - -.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ - all all-am all-local am--refresh check check-am clean \ - clean-generic ctags ctags-recursive dist dist-all dist-bzip2 \ - dist-gzip dist-shar dist-tarZ dist-zip distcheck distclean \ - distclean-generic distclean-tags distcleancheck distdir \ - distuninstallcheck dvi dvi-am html html-am info info-am \ - install install-am install-data install-data-am install-dvi \ - install-dvi-am install-exec install-exec-am install-exec-hook \ - install-html install-html-am install-info install-info-am \ - install-man install-pdf install-pdf-am install-ps \ - install-ps-am install-strip installcheck installcheck-am \ - installdirs installdirs-am maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ - pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \ - uninstall-hook - - -#The following line helps the test harness recover from build errors. 
- -all-local: - @echo "Trilinos package ThreadPool built successfully" - -@USING_EXPORT_MAKEFILES_TRUE@install-exec-hook: -@USING_EXPORT_MAKEFILES_TRUE@ mkdir -p $(DESTDIR)$(includedir) -@USING_EXPORT_MAKEFILES_TRUE@ cp $(top_builddir)/Makefile.export.threadpool $(DESTDIR)$(includedir)/. -@USING_EXPORT_MAKEFILES_TRUE@ $(PERL_EXE) $(top_srcdir)/config/replace-install-prefix.pl \ -@USING_EXPORT_MAKEFILES_TRUE@ --exec-prefix=$(exec_prefix) \ -@USING_EXPORT_MAKEFILES_TRUE@ --my-export-makefile=Makefile.export.threadpool \ -@USING_EXPORT_MAKEFILES_TRUE@ --my-abs-top-srcdir=@abs_top_srcdir@ \ -@USING_EXPORT_MAKEFILES_TRUE@ --my-abs-incl-dirs=@abs_top_builddir@/src:@abs_top_srcdir@/src \ -@USING_EXPORT_MAKEFILES_TRUE@ --my-abs-lib-dirs=@abs_top_builddir@/src -@USING_EXPORT_MAKEFILES_TRUE@ $(PERL_EXE) $(top_srcdir)/config/generate-makeoptions.pl $(top_builddir)/src/Makefile \ -@USING_EXPORT_MAKEFILES_TRUE@ THREADPOOL > $(DESTDIR)$(includedir)/Makefile.export.threadpool.macros - -@USING_EXPORT_MAKEFILES_TRUE@uninstall-hook: -@USING_EXPORT_MAKEFILES_TRUE@ rm -f $(includedir)/Makefile.export.threadpool -@USING_EXPORT_MAKEFILES_TRUE@ rm -f $(includedir)/Makefile.export.threadpool.macros - -@USING_EXPORT_MAKEFILES_FALSE@install-exec-hook: - -@USING_EXPORT_MAKEFILES_FALSE@uninstall-hook: - -#if SUB_EXAMPLE -#EXAMPLE_SUBDIR=example -#endif - -# #np# - The following make targets must be defined for all packages. -# #np# - If the package does not have tests or examples, replace the -# #np# - corresponding rules with something like: -# #np# - @echo "new_package does not have any tests yet" -@BUILD_TESTS_TRUE@tests: -@BUILD_TESTS_TRUE@ @echo "" -@BUILD_TESTS_TRUE@ @echo "Now building ThreadPool tests." -@BUILD_TESTS_TRUE@ @echo "" -@BUILD_TESTS_TRUE@ cd $(top_builddir)/test && $(MAKE) -@BUILD_TESTS_TRUE@ @echo "" -@BUILD_TESTS_TRUE@ @echo "Finished building ThreadPool tests." -@BUILD_TESTS_TRUE@ @echo "" -@BUILD_TESTS_FALSE@tests: -@BUILD_TESTS_FALSE@ @echo "ThreadPool tests were disabled at configure time" - -examples: - @echo "ThreadPool does not have any examples yet" - -install-examples: - @echo "ThreadPool does not have any examples yet" - -clean-tests: - cd $(top_builddir)/test && $(MAKE) clean - -clean-examples: - @echo "ThreadPool does not have any examples yet" - -everything: - $(MAKE) && $(MAKE) examples && $(MAKE) tests - -clean-everything: - $(MAKE) clean-examples && $(MAKE) clean-tests && $(MAKE) clean - -install-everything: - $(MAKE) install && $(MAKE) install-examples - -runtests-serial : - $(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \ - --trilinos-dir=$(TRILINOS_HOME_DIR) \ - --comm=serial \ - --build-dir=$(TRILINOS_BUILD_DIR) \ - --category=$(TRILINOS_TEST_CATEGORY) \ - --output-dir=@abs_top_builddir@/test/runtests-results \ - --verbosity=1 \ - --packages=ThreadPool - -runtests-mpi : - $(PERL_EXE) $(TRILINOS_HOME_DIR)/commonTools/test/utilities/runtests \ - --trilinos-dir=$(TRILINOS_HOME_DIR) \ - --comm=mpi \ - --mpi-go=$(TRILINOS_MPI_GO) \ - --build-dir=$(TRILINOS_BUILD_DIR) \ - --category=$(TRILINOS_TEST_CATEGORY) \ - --output-dir=@abs_top_builddir@/test/runtests-results \ - --verbosity=1 \ - --packages=ThreadPool -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. 
-.NOEXPORT: diff --git a/kokkos/basic/optional/ThreadPool/ThreadPool_config.h b/kokkos/basic/optional/ThreadPool/ThreadPool_config.h deleted file mode 100644 index b941069..0000000 --- a/kokkos/basic/optional/ThreadPool/ThreadPool_config.h +++ /dev/null @@ -1,3 +0,0 @@ -#ifndef HAVE_PTHREAD -#define HAVE_PTHREAD -#endif diff --git a/kokkos/basic/optional/ThreadPool/aclocal.m4 b/kokkos/basic/optional/ThreadPool/aclocal.m4 deleted file mode 100644 index e1f57a9..0000000 --- a/kokkos/basic/optional/ThreadPool/aclocal.m4 +++ /dev/null @@ -1,932 +0,0 @@ -# generated automatically by aclocal 1.10 -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, -# 2005, 2006 Free Software Foundation, Inc. -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -m4_if(m4_PACKAGE_VERSION, [2.61],, -[m4_fatal([this file was generated for autoconf 2.61. -You have another version of autoconf. If you want to use that, -you should regenerate the build system entirely.], [63])]) - -# Copyright (C) 2002, 2003, 2005, 2006 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_AUTOMAKE_VERSION(VERSION) -# ---------------------------- -# Automake X.Y traces this macro to ensure aclocal.m4 has been -# generated from the m4 files accompanying Automake X.Y. -# (This private macro should not be called outside this file.) -AC_DEFUN([AM_AUTOMAKE_VERSION], -[am__api_version='1.10' -dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to -dnl require some minimum version. Point them to the right macro. -m4_if([$1], [1.10], [], - [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl -]) - -# _AM_AUTOCONF_VERSION(VERSION) -# ----------------------------- -# aclocal traces this macro to find the Autoconf version. -# This is a private macro too. Using m4_define simplifies -# the logic in aclocal, which can simply ignore this definition. -m4_define([_AM_AUTOCONF_VERSION], []) - -# AM_SET_CURRENT_AUTOMAKE_VERSION -# ------------------------------- -# Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced. -# This function is AC_REQUIREd by AC_INIT_AUTOMAKE. -AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION], -[AM_AUTOMAKE_VERSION([1.10])dnl -_AM_AUTOCONF_VERSION(m4_PACKAGE_VERSION)]) - -# AM_AUX_DIR_EXPAND -*- Autoconf -*- - -# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# For projects using AC_CONFIG_AUX_DIR([foo]), Autoconf sets -# $ac_aux_dir to `$srcdir/foo'. In other projects, it is set to -# `$srcdir', `$srcdir/..', or `$srcdir/../..'. -# -# Of course, Automake must honor this variable whenever it calls a -# tool from the auxiliary directory. The problem is that $srcdir (and -# therefore $ac_aux_dir as well) can be either absolute or relative, -# depending on how configure is run. 
This is pretty annoying, since -# it makes $ac_aux_dir quite unusable in subdirectories: in the top -# source directory, any form will work fine, but in subdirectories a -# relative path needs to be adjusted first. -# -# $ac_aux_dir/missing -# fails when called from a subdirectory if $ac_aux_dir is relative -# $top_srcdir/$ac_aux_dir/missing -# fails if $ac_aux_dir is absolute, -# fails when called from a subdirectory in a VPATH build with -# a relative $ac_aux_dir -# -# The reason of the latter failure is that $top_srcdir and $ac_aux_dir -# are both prefixed by $srcdir. In an in-source build this is usually -# harmless because $srcdir is `.', but things will broke when you -# start a VPATH build or use an absolute $srcdir. -# -# So we could use something similar to $top_srcdir/$ac_aux_dir/missing, -# iff we strip the leading $srcdir from $ac_aux_dir. That would be: -# am_aux_dir='\$(top_srcdir)/'`expr "$ac_aux_dir" : "$srcdir//*\(.*\)"` -# and then we would define $MISSING as -# MISSING="\${SHELL} $am_aux_dir/missing" -# This will work as long as MISSING is not called from configure, because -# unfortunately $(top_srcdir) has no meaning in configure. -# However there are other variables, like CC, which are often used in -# configure, and could therefore not use this "fixed" $ac_aux_dir. -# -# Another solution, used here, is to always expand $ac_aux_dir to an -# absolute PATH. The drawback is that using absolute paths prevent a -# configured tree to be moved without reconfiguration. - -AC_DEFUN([AM_AUX_DIR_EXPAND], -[dnl Rely on autoconf to set up CDPATH properly. -AC_PREREQ([2.50])dnl -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` -]) - -# AM_CONDITIONAL -*- Autoconf -*- - -# Copyright (C) 1997, 2000, 2001, 2003, 2004, 2005, 2006 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 8 - -# AM_CONDITIONAL(NAME, SHELL-CONDITION) -# ------------------------------------- -# Define a conditional. -AC_DEFUN([AM_CONDITIONAL], -[AC_PREREQ(2.52)dnl - ifelse([$1], [TRUE], [AC_FATAL([$0: invalid condition: $1])], - [$1], [FALSE], [AC_FATAL([$0: invalid condition: $1])])dnl -AC_SUBST([$1_TRUE])dnl -AC_SUBST([$1_FALSE])dnl -_AM_SUBST_NOTMAKE([$1_TRUE])dnl -_AM_SUBST_NOTMAKE([$1_FALSE])dnl -if $2; then - $1_TRUE= - $1_FALSE='#' -else - $1_TRUE='#' - $1_FALSE= -fi -AC_CONFIG_COMMANDS_PRE( -[if test -z "${$1_TRUE}" && test -z "${$1_FALSE}"; then - AC_MSG_ERROR([[conditional "$1" was never defined. -Usually this means the macro was only invoked conditionally.]]) -fi])]) - -# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 9 - -# There are a few dirty hacks below to avoid letting `AC_PROG_CC' be -# written in clear, in which case automake, when reading aclocal.m4, -# will think it sees a *use*, and therefore will trigger all it's -# C support machinery. Also note that it means that autoscan, seeing -# CC etc. in the Makefile, will ask for an AC_PROG_CC use... - - -# _AM_DEPENDENCIES(NAME) -# ---------------------- -# See how the compiler implements dependency checking. -# NAME is "CC", "CXX", "GCJ", or "OBJC". 
-# We try a few techniques and use that to set a single cache variable. -# -# We don't AC_REQUIRE the corresponding AC_PROG_CC since the latter was -# modified to invoke _AM_DEPENDENCIES(CC); we would have a circular -# dependency, and given that the user is not expected to run this macro, -# just rely on AC_PROG_CC. -AC_DEFUN([_AM_DEPENDENCIES], -[AC_REQUIRE([AM_SET_DEPDIR])dnl -AC_REQUIRE([AM_OUTPUT_DEPENDENCY_COMMANDS])dnl -AC_REQUIRE([AM_MAKE_INCLUDE])dnl -AC_REQUIRE([AM_DEP_TRACK])dnl - -ifelse([$1], CC, [depcc="$CC" am_compiler_list=], - [$1], CXX, [depcc="$CXX" am_compiler_list=], - [$1], OBJC, [depcc="$OBJC" am_compiler_list='gcc3 gcc'], - [$1], UPC, [depcc="$UPC" am_compiler_list=], - [$1], GCJ, [depcc="$GCJ" am_compiler_list='gcc3 gcc'], - [depcc="$$1" am_compiler_list=]) - -AC_CACHE_CHECK([dependency style of $depcc], - [am_cv_$1_dependencies_compiler_type], -[if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_$1_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n ['s/^#*\([a-zA-Z0-9]*\))$/\1/p'] < ./depcomp` - fi - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - case $depmode in - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - none) break ;; - esac - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. 
- if depmode=$depmode \ - source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_$1_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. - rm -rf conftest.dir -else - am_cv_$1_dependencies_compiler_type=none -fi -]) -AC_SUBST([$1DEPMODE], [depmode=$am_cv_$1_dependencies_compiler_type]) -AM_CONDITIONAL([am__fastdep$1], [ - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_$1_dependencies_compiler_type" = gcc3]) -]) - - -# AM_SET_DEPDIR -# ------------- -# Choose a directory name for dependency files. -# This macro is AC_REQUIREd in _AM_DEPENDENCIES -AC_DEFUN([AM_SET_DEPDIR], -[AC_REQUIRE([AM_SET_LEADING_DOT])dnl -AC_SUBST([DEPDIR], ["${am__leading_dot}deps"])dnl -]) - - -# AM_DEP_TRACK -# ------------ -AC_DEFUN([AM_DEP_TRACK], -[AC_ARG_ENABLE(dependency-tracking, -[ --disable-dependency-tracking speeds up one-time build - --enable-dependency-tracking do not reject slow dependency extractors]) -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' -fi -AM_CONDITIONAL([AMDEP], [test "x$enable_dependency_tracking" != xno]) -AC_SUBST([AMDEPBACKSLASH])dnl -_AM_SUBST_NOTMAKE([AMDEPBACKSLASH])dnl -]) - -# Generate code to set up dependency tracking. -*- Autoconf -*- - -# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004, 2005 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -#serial 3 - -# _AM_OUTPUT_DEPENDENCY_COMMANDS -# ------------------------------ -AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS], -[for mf in $CONFIG_FILES; do - # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named `Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - if sed 10q "$mf" | grep '^#.*generated by automake' > /dev/null 2>&1; then - dirpart=`AS_DIRNAME("$mf")` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running `make'. 
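The Makefile lines that the sed commands below pull out look like this in a generated Makefile once config.status has run (foo.Po is illustrative; with a BSD-style make, am__include becomes `.include' and am__quote supplies the surrounding double quotes):

    DEPDIR = .deps
    am__include = include
    am__quote =
    include ./$(DEPDIR)/foo.Po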
- DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # When using ansi2knr, U may be empty or an underscore; expand it - U=`sed -n 's/^U = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`AS_DIRNAME(["$file"])` - AS_MKDIR_P([$dirpart/$fdir]) - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done -done -])# _AM_OUTPUT_DEPENDENCY_COMMANDS - - -# AM_OUTPUT_DEPENDENCY_COMMANDS -# ----------------------------- -# This macro should only be invoked once -- use via AC_REQUIRE. -# -# This code is only required when automatic dependency tracking -# is enabled. FIXME. This creates each `.P' file that we will -# need in order to bootstrap the dependency handling code. -AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS], -[AC_CONFIG_COMMANDS([depfiles], - [test x"$AMDEP_TRUE" != x"" || _AM_OUTPUT_DEPENDENCY_COMMANDS], - [AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir"]) -]) - -# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 8 - -# AM_CONFIG_HEADER is obsolete. It has been replaced by AC_CONFIG_HEADERS. -AU_DEFUN([AM_CONFIG_HEADER], [AC_CONFIG_HEADERS($@)]) - -# Do all the work for Automake. -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, -# 2005, 2006 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 12 - -# This macro actually does too much. Some checks are only needed if -# your package does certain things. But this isn't really a big deal. - -# AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE]) -# AM_INIT_AUTOMAKE([OPTIONS]) -# ----------------------------------------------- -# The call with PACKAGE and VERSION arguments is the old style -# call (pre autoconf-2.50), which is being phased out. PACKAGE -# and VERSION should now be passed to AC_INIT and removed from -# the call to AM_INIT_AUTOMAKE. -# We support both call styles for the transition. After -# the next Automake release, Autoconf can make the AC_INIT -# arguments mandatory, and then we can depend on a new Autoconf -# release and drop the old call support. -AC_DEFUN([AM_INIT_AUTOMAKE], -[AC_PREREQ([2.60])dnl -dnl Autoconf wants to disallow AM_ names. We explicitly allow -dnl the ones we care about. -m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl -AC_REQUIRE([AM_SET_CURRENT_AUTOMAKE_VERSION])dnl -AC_REQUIRE([AC_PROG_INSTALL])dnl -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." 
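The two calling conventions described above look like this in a configure.ac (mypackage and 1.0 are placeholders):

    dnl Old style (being phased out): package and version passed here.
    AM_INIT_AUTOMAKE([mypackage], [1.0])

    dnl New style: package and version belong to AC_INIT; only options here.
    AC_INIT([mypackage], [1.0])
    AM_INIT_AUTOMAKE([foreign])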
- AC_SUBST([am__isrc], [' -I$(srcdir)'])_AM_SUBST_NOTMAKE([am__isrc])dnl - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - AC_MSG_ERROR([source directory already configured; run "make distclean" there first]) - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi -AC_SUBST([CYGPATH_W]) - -# Define the identity of the package. -dnl Distinguish between old-style and new-style calls. -m4_ifval([$2], -[m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl - AC_SUBST([PACKAGE], [$1])dnl - AC_SUBST([VERSION], [$2])], -[_AM_SET_OPTIONS([$1])dnl -dnl Diagnose old-style AC_INIT with new-style AM_AUTOMAKE_INIT. -m4_if(m4_ifdef([AC_PACKAGE_NAME], 1)m4_ifdef([AC_PACKAGE_VERSION], 1), 11,, - [m4_fatal([AC_INIT should be called with package and version arguments])])dnl - AC_SUBST([PACKAGE], ['AC_PACKAGE_TARNAME'])dnl - AC_SUBST([VERSION], ['AC_PACKAGE_VERSION'])])dnl - -_AM_IF_OPTION([no-define],, -[AC_DEFINE_UNQUOTED(PACKAGE, "$PACKAGE", [Name of package]) - AC_DEFINE_UNQUOTED(VERSION, "$VERSION", [Version number of package])])dnl - -# Some tools Automake needs. -AC_REQUIRE([AM_SANITY_CHECK])dnl -AC_REQUIRE([AC_ARG_PROGRAM])dnl -AM_MISSING_PROG(ACLOCAL, aclocal-${am__api_version}) -AM_MISSING_PROG(AUTOCONF, autoconf) -AM_MISSING_PROG(AUTOMAKE, automake-${am__api_version}) -AM_MISSING_PROG(AUTOHEADER, autoheader) -AM_MISSING_PROG(MAKEINFO, makeinfo) -AM_PROG_INSTALL_SH -AM_PROG_INSTALL_STRIP -AC_REQUIRE([AM_PROG_MKDIR_P])dnl -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. -AC_REQUIRE([AC_PROG_AWK])dnl -AC_REQUIRE([AC_PROG_MAKE_SET])dnl -AC_REQUIRE([AM_SET_LEADING_DOT])dnl -_AM_IF_OPTION([tar-ustar], [_AM_PROG_TAR([ustar])], - [_AM_IF_OPTION([tar-pax], [_AM_PROG_TAR([pax])], - [_AM_PROG_TAR([v7])])]) -_AM_IF_OPTION([no-dependencies],, -[AC_PROVIDE_IFELSE([AC_PROG_CC], - [_AM_DEPENDENCIES(CC)], - [define([AC_PROG_CC], - defn([AC_PROG_CC])[_AM_DEPENDENCIES(CC)])])dnl -AC_PROVIDE_IFELSE([AC_PROG_CXX], - [_AM_DEPENDENCIES(CXX)], - [define([AC_PROG_CXX], - defn([AC_PROG_CXX])[_AM_DEPENDENCIES(CXX)])])dnl -AC_PROVIDE_IFELSE([AC_PROG_OBJC], - [_AM_DEPENDENCIES(OBJC)], - [define([AC_PROG_OBJC], - defn([AC_PROG_OBJC])[_AM_DEPENDENCIES(OBJC)])])dnl -]) -]) - - -# When config.status generates a header, we must update the stamp-h file. -# This file resides in the same directory as the config header -# that is generated. The stamp files are numbered to have different names. - -# Autoconf calls _AC_AM_CONFIG_HEADER_HOOK (when defined) in the -# loop where config.status creates the headers, so we can generate -# our stamp files there. -AC_DEFUN([_AC_AM_CONFIG_HEADER_HOOK], -[# Compute $1's index in $config_headers. -_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $1 | $1:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $1" >`AS_DIRNAME([$1])`/stamp-h[]$_am_stamp_count]) - -# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_SH -# ------------------ -# Define $install_sh. 
-AC_DEFUN([AM_PROG_INSTALL_SH], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -install_sh=${install_sh-"\$(SHELL) $am_aux_dir/install-sh"} -AC_SUBST(install_sh)]) - -# Copyright (C) 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 2 - -# Check whether the underlying file-system supports filenames -# with a leading dot. For instance MS-DOS doesn't. -AC_DEFUN([AM_SET_LEADING_DOT], -[rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null -AC_SUBST([am__leading_dot])]) - -# Add --enable-maintainer-mode option to configure. -*- Autoconf -*- -# From Jim Meyering - -# Copyright (C) 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2005 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 4 - -AC_DEFUN([AM_MAINTAINER_MODE], -[AC_MSG_CHECKING([whether to enable maintainer-specific portions of Makefiles]) - dnl maintainer-mode is disabled by default - AC_ARG_ENABLE(maintainer-mode, -[ --enable-maintainer-mode enable make rules and dependencies not useful - (and sometimes confusing) to the casual installer], - USE_MAINTAINER_MODE=$enableval, - USE_MAINTAINER_MODE=no) - AC_MSG_RESULT([$USE_MAINTAINER_MODE]) - AM_CONDITIONAL(MAINTAINER_MODE, [test $USE_MAINTAINER_MODE = yes]) - MAINT=$MAINTAINER_MODE_TRUE - AC_SUBST(MAINT)dnl -] -) - -AU_DEFUN([jm_MAINTAINER_MODE], [AM_MAINTAINER_MODE]) - -# Check to see how 'make' treats includes. -*- Autoconf -*- - -# Copyright (C) 2001, 2002, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 3 - -# AM_MAKE_INCLUDE() -# ----------------- -# Check to see how make treats includes. -AC_DEFUN([AM_MAKE_INCLUDE], -[am_make=${MAKE-make} -cat > confinc << 'END' -am__doit: - @echo done -.PHONY: am__doit -END -# If we don't find an include directive, just comment out the code. -AC_MSG_CHECKING([for style of include used by $am_make]) -am__include="#" -am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# We grep out `Entering directory' and `Leaving directory' -# messages which can occur if `w' ends up in MAKEFLAGS. -# In particular we don't look at `^make:' because GNU make might -# be invoked under some other name (usually "gmake"), in which -# case it prints its new name instead of `make'. -if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then - am__include=include - am__quote= - _am_result=GNU -fi -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then - am__include=.include - am__quote="\"" - _am_result=BSD - fi -fi -AC_SUBST([am__include]) -AC_SUBST([am__quote]) -AC_MSG_RESULT([$_am_result]) -rm -f confinc confmf -]) - -# Fake the existence of programs that GNU maintainers use. -*- Autoconf -*- - -# Copyright (C) 1997, 1999, 2000, 2001, 2003, 2004, 2005 -# Free Software Foundation, Inc. 
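For context on AM_MAINTAINER_MODE just above: it defaults the rules that regenerate configure, Makefile.in and friends to off, so only a developer who asks for them gets them. A typical use (sketch):

    dnl configure.ac
    AM_MAINTAINER_MODE

    # developer checkout
    ./configure --enable-maintainer-mode
    # plain user build (the default): the regeneration rules stay disabled, so a
    # skewed timestamp on configure.ac cannot trigger a rerun of the autotools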
-# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 5 - -# AM_MISSING_PROG(NAME, PROGRAM) -# ------------------------------ -AC_DEFUN([AM_MISSING_PROG], -[AC_REQUIRE([AM_MISSING_HAS_RUN]) -$1=${$1-"${am_missing_run}$2"} -AC_SUBST($1)]) - - -# AM_MISSING_HAS_RUN -# ------------------ -# Define MISSING if not defined so far and test if it supports --run. -# If it does, set am_missing_run to use it, otherwise, to nothing. -AC_DEFUN([AM_MISSING_HAS_RUN], -[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl -AC_REQUIRE_AUX_FILE([missing])dnl -test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing" -# Use eval to expand $SHELL -if eval "$MISSING --run true"; then - am_missing_run="$MISSING --run " -else - am_missing_run= - AC_MSG_WARN([`missing' script is too old or missing]) -fi -]) - -# Copyright (C) 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_MKDIR_P -# --------------- -# Check for `mkdir -p'. -AC_DEFUN([AM_PROG_MKDIR_P], -[AC_PREREQ([2.60])dnl -AC_REQUIRE([AC_PROG_MKDIR_P])dnl -dnl Automake 1.8 to 1.9.6 used to define mkdir_p. We now use MKDIR_P, -dnl while keeping a definition of mkdir_p for backward compatibility. -dnl @MKDIR_P@ is magic: AC_OUTPUT adjusts its value for each Makefile. -dnl However we cannot define mkdir_p as $(MKDIR_P) for the sake of -dnl Makefile.ins that do not define MKDIR_P, so we do our own -dnl adjustment using top_builddir (which is defined more often than -dnl MKDIR_P). -AC_SUBST([mkdir_p], ["$MKDIR_P"])dnl -case $mkdir_p in - [[\\/$]]* | ?:[[\\/]]*) ;; - */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;; -esac -]) - -# Helper functions for option handling. -*- Autoconf -*- - -# Copyright (C) 2001, 2002, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 3 - -# _AM_MANGLE_OPTION(NAME) -# ----------------------- -AC_DEFUN([_AM_MANGLE_OPTION], -[[_AM_OPTION_]m4_bpatsubst($1, [[^a-zA-Z0-9_]], [_])]) - -# _AM_SET_OPTION(NAME) -# ------------------------------ -# Set option NAME. Presently that only means defining a flag for this option. -AC_DEFUN([_AM_SET_OPTION], -[m4_define(_AM_MANGLE_OPTION([$1]), 1)]) - -# _AM_SET_OPTIONS(OPTIONS) -# ---------------------------------- -# OPTIONS is a space-separated list of Automake options. -AC_DEFUN([_AM_SET_OPTIONS], -[AC_FOREACH([_AM_Option], [$1], [_AM_SET_OPTION(_AM_Option)])]) - -# _AM_IF_OPTION(OPTION, IF-SET, [IF-NOT-SET]) -# ------------------------------------------- -# Execute IF-SET if OPTION is set, IF-NOT-SET otherwise. -AC_DEFUN([_AM_IF_OPTION], -[m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])]) - -# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_RUN_LOG(COMMAND) -# ------------------- -# Run COMMAND, save the exit status in ac_status, and log it. -# (This has been adapted from Autoconf's _AC_RUN_LOG macro.) 
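For reference, the AM_MISSING_PROG calls earlier in this file turn into Makefile substitutions that route each maintainer tool through the `missing' wrapper, so a user who lacks the tool gets a warning instead of a hard failure. Roughly (the absolute path is whatever am_aux_dir expanded to, and the -1.10 suffix stands in for ${am__api_version}):

    ACLOCAL = ${SHELL} /abs/path/to/config/missing --run aclocal-1.10
    AUTOMAKE = ${SHELL} /abs/path/to/config/missing --run automake-1.10
    AUTOCONF = ${SHELL} /abs/path/to/config/missing --run autoconf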
-AC_DEFUN([AM_RUN_LOG], -[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD - ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD - (exit $ac_status); }]) - -# Check to make sure that the build environment is sane. -*- Autoconf -*- - -# Copyright (C) 1996, 1997, 2000, 2001, 2003, 2005 -# Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 4 - -# AM_SANITY_CHECK -# --------------- -AC_DEFUN([AM_SANITY_CHECK], -[AC_MSG_CHECKING([whether build environment is sane]) -# Just in case -sleep 1 -echo timestamp > conftest.file -# Do `set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null` - if test "$[*]" = "X"; then - # -L didn't work. - set X `ls -t $srcdir/configure conftest.file` - fi - rm -f conftest.file - if test "$[*]" != "X $srcdir/configure conftest.file" \ - && test "$[*]" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - AC_MSG_ERROR([ls -t appears to fail. Make sure there is not a broken -alias in your environment]) - fi - - test "$[2]" = conftest.file - ) -then - # Ok. - : -else - AC_MSG_ERROR([newly created file is older than distributed files! -Check your system clock]) -fi -AC_MSG_RESULT(yes)]) - -# Copyright (C) 2001, 2003, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# AM_PROG_INSTALL_STRIP -# --------------------- -# One issue with vendor `install' (even GNU) is that you can't -# specify the program used to strip binaries. This is especially -# annoying in cross-compiling environments, where the build's strip -# is unlikely to handle the host's binaries. -# Fortunately install-sh will honor a STRIPPROG variable, so we -# always use install-sh in `make install-strip', and initialize -# STRIPPROG with the value of the STRIP variable (set by the user). -AC_DEFUN([AM_PROG_INSTALL_STRIP], -[AC_REQUIRE([AM_PROG_INSTALL_SH])dnl -# Installed binaries are usually stripped using `strip' when the user -# run `make install-strip'. However `strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the `STRIP' environment variable to overrule this program. -dnl Don't test for $cross_compiling = yes, because it might be `maybe'. -if test "$cross_compiling" != no; then - AC_CHECK_TOOL([STRIP], [strip], :) -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" -AC_SUBST([INSTALL_STRIP_PROGRAM])]) - -# Copyright (C) 2006 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. 
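The AM_PROG_INSTALL_STRIP arrangement above is what makes stripping work through install-sh's STRIPPROG hook, including in a cross build where the host binaries need the cross strip. Sketch (the tool name is illustrative; AC_CHECK_TOOL normally finds it, but it can also be given by hand):

    # native build
    make install-strip
    # cross build, overriding the strip program explicitly
    make install-strip STRIP=arm-linux-gnueabi-strip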
- -# _AM_SUBST_NOTMAKE(VARIABLE) -# --------------------------- -# Prevent Automake from outputing VARIABLE = @VARIABLE@ in Makefile.in. -# This macro is traced by Automake. -AC_DEFUN([_AM_SUBST_NOTMAKE]) - -# Check how to create a tarball. -*- Autoconf -*- - -# Copyright (C) 2004, 2005 Free Software Foundation, Inc. -# -# This file is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# serial 2 - -# _AM_PROG_TAR(FORMAT) -# -------------------- -# Check how to create a tarball in format FORMAT. -# FORMAT should be one of `v7', `ustar', or `pax'. -# -# Substitute a variable $(am__tar) that is a command -# writing to stdout a FORMAT-tarball containing the directory -# $tardir. -# tardir=directory && $(am__tar) > result.tar -# -# Substitute a variable $(am__untar) that extract such -# a tarball read from stdin. -# $(am__untar) < result.tar -AC_DEFUN([_AM_PROG_TAR], -[# Always define AMTAR for backward compatibility. -AM_MISSING_PROG([AMTAR], [tar]) -m4_if([$1], [v7], - [am__tar='${AMTAR} chof - "$$tardir"'; am__untar='${AMTAR} xf -'], - [m4_case([$1], [ustar],, [pax],, - [m4_fatal([Unknown tar format])]) -AC_MSG_CHECKING([how to create a $1 tar archive]) -# Loop over all known methods to create a tar archive until one works. -_am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none' -_am_tools=${am_cv_prog_tar_$1-$_am_tools} -# Do not fold the above two line into one, because Tru64 sh and -# Solaris sh will not grok spaces in the rhs of `-'. -for _am_tool in $_am_tools -do - case $_am_tool in - gnutar) - for _am_tar in tar gnutar gtar; - do - AM_RUN_LOG([$_am_tar --version]) && break - done - am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"' - am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"' - am__untar="$_am_tar -xf -" - ;; - plaintar) - # Must skip GNU tar: if it does not support --format= it doesn't create - # ustar tarball either. - (tar --version) >/dev/null 2>&1 && continue - am__tar='tar chf - "$$tardir"' - am__tar_='tar chf - "$tardir"' - am__untar='tar xf -' - ;; - pax) - am__tar='pax -L -x $1 -w "$$tardir"' - am__tar_='pax -L -x $1 -w "$tardir"' - am__untar='pax -r' - ;; - cpio) - am__tar='find "$$tardir" -print | cpio -o -H $1 -L' - am__tar_='find "$tardir" -print | cpio -o -H $1 -L' - am__untar='cpio -i -H $1 -d' - ;; - none) - am__tar=false - am__tar_=false - am__untar=false - ;; - esac - - # If the value was cached, stop now. We just wanted to have am__tar - # and am__untar set. 
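Putting the two substituted commands in context: the generated dist and distcheck rules drive them essentially as the comment earlier in this macro describes, shown here for the common .tar.gz case (a sketch; the exact rule text varies between Automake versions):

    # dist
    tardir=$(distdir) && $(am__tar) | gzip -c > $(distdir).tar.gz
    # distcheck unpacking
    gzip -dc $(distdir).tar.gz | $(am__untar)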
- test -n "${am_cv_prog_tar_$1}" && break - - # tar/untar a dummy directory, and stop if the command works - rm -rf conftest.dir - mkdir conftest.dir - echo GrepMe > conftest.dir/file - AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar]) - rm -rf conftest.dir - if test -s conftest.tar; then - AM_RUN_LOG([$am__untar /dev/null 2>&1 && break - fi -done -rm -rf conftest.dir - -AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool]) -AC_MSG_RESULT([$am_cv_prog_tar_$1])]) -AC_SUBST([am__tar]) -AC_SUBST([am__untar]) -]) # _AM_PROG_TAR - -m4_include([config/acx_pthread.m4]) -m4_include([config/tac_arg_check_mpi.m4]) -m4_include([config/tac_arg_config_mpi.m4]) -m4_include([config/tac_arg_enable_export-makefiles.m4]) -m4_include([config/tac_arg_enable_feature.m4]) -m4_include([config/tac_arg_enable_feature_sub_check.m4]) -m4_include([config/tac_arg_with_ar.m4]) -m4_include([config/tac_arg_with_flags.m4]) -m4_include([config/tac_arg_with_incdirs.m4]) -m4_include([config/tac_arg_with_libdirs.m4]) -m4_include([config/tac_arg_with_libs.m4]) -m4_include([config/tac_arg_with_perl.m4]) diff --git a/kokkos/basic/optional/ThreadPool/bootstrap b/kokkos/basic/optional/ThreadPool/bootstrap deleted file mode 100755 index 8706e9e..0000000 --- a/kokkos/basic/optional/ThreadPool/bootstrap +++ /dev/null @@ -1,9 +0,0 @@ -#! /bin/sh -#np# This file does not need to be edited, other than removing this line. -set -x -# Only run aclocal if we need to create aclocal.m4 -aclocal -I config -# autoheader is smart and doesn't change anything unless it's necessary -autoheader -automake --foreign --add-missing --copy -autoconf diff --git a/kokkos/basic/optional/ThreadPool/cmake/Dependencies.cmake b/kokkos/basic/optional/ThreadPool/cmake/Dependencies.cmake deleted file mode 100644 index 746d066..0000000 --- a/kokkos/basic/optional/ThreadPool/cmake/Dependencies.cmake +++ /dev/null @@ -1,11 +0,0 @@ -SET(LIB_REQUIRED_DEP_PACKAGES) -SET(LIB_OPTIONAL_DEP_PACKAGES) -SET(TEST_REQUIRED_DEP_PACKAGES) -SET(TEST_OPTIONAL_DEP_PACKAGES) -SET(LIB_REQUIRED_DEP_TPLS) -SET(LIB_OPTIONAL_DEP_TPLS Pthread MPI) -SET(TEST_REQUIRED_DEP_TPLS) -SET(TEST_OPTIONAL_DEP_TPLS) - -TPL_TENTATIVELY_ENABLE(Pthread) - diff --git a/kokkos/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in b/kokkos/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in deleted file mode 100644 index 55614b9..0000000 --- a/kokkos/basic/optional/ThreadPool/cmake/ThreadPool_config.h.in +++ /dev/null @@ -1,2 +0,0 @@ -#cmakedefine HAVE_MPI -#cmakedefine HAVE_PTHREAD diff --git a/kokkos/basic/optional/ThreadPool/config/acx_pthread.m4 b/kokkos/basic/optional/ThreadPool/config/acx_pthread.m4 deleted file mode 100644 index 3bd3ec2..0000000 --- a/kokkos/basic/optional/ThreadPool/config/acx_pthread.m4 +++ /dev/null @@ -1,224 +0,0 @@ -dnl @synopsis ACX_PTHREAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]]) -dnl -dnl This macro figures out how to build C programs using POSIX -dnl threads. It sets the PTHREAD_LIBS output variable to the threads -dnl library and linker flags, and the PTHREAD_CFLAGS output variable -dnl to any special C compiler flags that are needed. (The user can also -dnl force certain compiler flags/libs to be tested by setting these -dnl environment variables.) -dnl -dnl Also sets PTHREAD_CC to any special C compiler that is needed for -dnl multi-threaded programs (defaults to the value of CC otherwise). -dnl (This is necessary on AIX to use the special cc_r compiler alias.) 
-dnl -dnl If you are only building threads programs, you may wish to -dnl use these variables in your default LIBS, CFLAGS, and CC: -dnl -dnl LIBS="$PTHREAD_LIBS $LIBS" -dnl CFLAGS="$CFLAGS $PTHREAD_CFLAGS" -dnl CC="$PTHREAD_CC" -dnl -dnl In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute -dnl constant has a nonstandard name, defines PTHREAD_CREATE_JOINABLE -dnl to that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). -dnl -dnl ACTION-IF-FOUND is a list of shell commands to run if a threads -dnl library is found, and ACTION-IF-NOT-FOUND is a list of commands -dnl to run it if it is not found. If ACTION-IF-FOUND is not specified, -dnl the default action will define HAVE_PTHREAD. -dnl -dnl Please let the authors know if this macro fails on any platform, -dnl or if you have any other suggestions or comments. This macro was -dnl based on work by SGJ on autoconf scripts for FFTW (www.fftw.org) -dnl (with help from M. Frigo), as well as ac_pthread and hb_pthread -dnl macros posted by AFC to the autoconf macro repository. We are also -dnl grateful for the helpful feedback of numerous users. -dnl -dnl @version $Id$ -dnl @author Steven G. Johnson and Alejandro Forero Cuervo - -AC_DEFUN([ACX_PTHREAD], [ -AC_REQUIRE([AC_CANONICAL_HOST]) -acx_pthread_ok=no - -# First, check if the POSIX threads header, pthread.h, is available. -# If it isn't, don't bother looking for the threads libraries. -AC_CHECK_HEADER(pthread.h, , acx_pthread_ok=noheader) - -# We must check for the threads library under a number of different -# names; the ordering is very important because some systems -# (e.g. DEC) have both -lpthread and -lpthreads, where one of the -# libraries is broken (non-POSIX). - -# First of all, check if the user has set any of the PTHREAD_LIBS, -# etcetera environment variables, and if threads linking works using -# them: -if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) - AC_TRY_LINK_FUNC(pthread_join, acx_pthread_ok=yes) - AC_MSG_RESULT($acx_pthread_ok) - if test x"$acx_pthread_ok" = xno; then - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" - fi - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" -fi - -# Create a list of thread flags to try. Items starting with a "-" are -# C compiler flags, and other items are library names, except for "none" -# which indicates that we try without any flags at all. - -acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt" - -# The ordering *is* (sometimes) important. 
Some notes on the -# individual items follow: - -# pthreads: AIX (must check this before -lpthread) -# none: in case threads are in libc; should be tried before -Kthread and -# other compiler flags to prevent continual compiler warnings -# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) -# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) -# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) -# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) -# -pthreads: Solaris/gcc -# -mthreads: Mingw32/gcc, Lynx/gcc -# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it -# doesn't hurt to check since this sometimes defines pthreads too; -# also defines -D_REENTRANT) -# pthread: Linux, etcetera -# --thread-safe: KAI C++ - -case "${host_cpu}-${host_os}" in - *solaris*) - - # On Solaris (at least, for some versions), libc contains stubbed - # (non-functional) versions of the pthreads routines, so link-based - # tests will erroneously succeed. (We need to link with -pthread or - # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather - # a function called by this macro, so we could check for that, but - # who knows whether they'll stub that too in a future libc.) So, - # we'll just look for -pthreads and -lpthread first: - - acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags" - ;; -esac - -if test x"$acx_pthread_ok" = xno; then -for flag in $acx_pthread_flags; do - - case $flag in - none) - AC_MSG_CHECKING([whether pthreads work without any flags]) - ;; - - -*) - AC_MSG_CHECKING([whether pthreads work with $flag]) - PTHREAD_CFLAGS="$flag" - ;; - - *) - AC_MSG_CHECKING([for the pthreads library -l$flag]) - PTHREAD_LIBS="-l$flag" - ;; - esac - - save_LIBS="$LIBS" - save_CFLAGS="$CFLAGS" - LIBS="$PTHREAD_LIBS $LIBS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Check for various functions. We must include pthread.h, - # since some functions may be macros. (On the Sequent, we - # need a special flag -Kthread to make this header compile.) - # We check for pthread_join because it is in -lpthread on IRIX - # while pthread_create is in libc. We check for pthread_attr_init - # due to DEC craziness with -lpthreads. We check for - # pthread_cleanup_push because it is one of the few pthread - # functions on Solaris that doesn't have a non-functional libc stub. - # We try pthread_create on general principles. - AC_TRY_LINK([#include ], - [pthread_t th; pthread_join(th, 0); - pthread_attr_init(0); pthread_cleanup_push(0, 0); - pthread_create(0,0,0,0); pthread_cleanup_pop(0); ], - [acx_pthread_ok=yes]) - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - AC_MSG_RESULT($acx_pthread_ok) - if test "x$acx_pthread_ok" = xyes; then - break; - fi - - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" -done -fi - -# Various other checks: -if test "x$acx_pthread_ok" = xyes; then - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Detect AIX lossage: threads are created detached by default - # and the JOINABLE attribute has a nonstandard name (UNDETACHED). 
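For reference, a configure.ac normally consumes this macro the way its header documents: call it, then fold the results into the build flags. A minimal sketch (the error message is illustrative):

    ACX_PTHREAD([
      LIBS="$PTHREAD_LIBS $LIBS"
      CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
      CC="$PTHREAD_CC"
      AC_DEFINE([HAVE_PTHREAD], [1], [Define if POSIX threads are available.])
    ], [
      AC_MSG_ERROR([POSIX threads are required to build this package])
    ])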
- AC_MSG_CHECKING([for joinable pthread attribute]) - AC_TRY_LINK([#include ], - [int attr=PTHREAD_CREATE_JOINABLE;], - ok=PTHREAD_CREATE_JOINABLE, ok=unknown) - if test x"$ok" = xunknown; then - AC_TRY_LINK([#include ], - [int attr=PTHREAD_CREATE_UNDETACHED;], - ok=PTHREAD_CREATE_UNDETACHED, ok=unknown) - fi - if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then - AC_DEFINE(PTHREAD_CREATE_JOINABLE, $ok, - [Define to the necessary symbol if this constant - uses a non-standard name on your system.]) - fi - AC_MSG_RESULT(${ok}) - if test x"$ok" = xunknown; then - AC_MSG_WARN([we do not know how to create joinable pthreads]) - fi - - AC_MSG_CHECKING([if more special flags are required for pthreads]) - flag=no - case "${host_cpu}-${host_os}" in - *-aix* | *-freebsd*) flag="-D_THREAD_SAFE";; - *solaris* | alpha*-osf*) flag="-D_REENTRANT";; - esac - AC_MSG_RESULT(${flag}) - if test "x$flag" != xno; then - PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" - fi - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - # More AIX lossage: must compile with cc_r - AC_CHECK_PROG(PTHREAD_CC, cc_r, cc_r, ${CC}) -else - PTHREAD_CC="$CC" -fi - -AC_SUBST(PTHREAD_LIBS) -AC_SUBST(PTHREAD_CFLAGS) -AC_SUBST(PTHREAD_CC) - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$acx_pthread_ok" = xyes; then - ifelse([$1],,AC_DEFINE(HAVE_PTHREAD,1,[Define if you have POSIX threads libraries and header files.]),[$1]) - : -else - acx_pthread_ok=no - $2 -fi - -])dnl ACX_PTHREAD diff --git a/kokkos/basic/optional/ThreadPool/config/config.guess b/kokkos/basic/optional/ThreadPool/config/config.guess deleted file mode 100755 index 396482d..0000000 --- a/kokkos/basic/optional/ThreadPool/config/config.guess +++ /dev/null @@ -1,1500 +0,0 @@ -#! /bin/sh -# Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, -# Inc. - -timestamp='2006-07-02' - -# This file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner . -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. -# -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. -# -# The plan is that this can be called by configure scripts if you -# don't specify an explicit build system type. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] - -Output the configuration name of the system \`$me' is run on. 
- -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.guess ($timestamp) - -Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 -Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" >&2 - exit 1 ;; - * ) - break ;; - esac -done - -if test $# != 0; then - echo "$me: too many arguments$help" >&2 - exit 1 -fi - -trap 'exit 1' 1 2 15 - -# CC_FOR_BUILD -- compiler used by this script. Note that the use of a -# compiler to aid in system detection is discouraged as it requires -# temporary files to be created and, as you can see below, it is a -# headache to deal with in a portable fashion. - -# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still -# use `HOST_CC' if defined, but it is deprecated. - -# Portable tmp directory creation inspired by the Autoconf team. - -set_cc_for_build=' -trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; -trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; -: ${TMPDIR=/tmp} ; - { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || - { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || - { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || - { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; -dummy=$tmp/dummy ; -tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; -case $CC_FOR_BUILD,$HOST_CC,$CC in - ,,) echo "int x;" > $dummy.c ; - for c in cc gcc c89 c99 ; do - if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then - CC_FOR_BUILD="$c"; break ; - fi ; - done ; - if test x"$CC_FOR_BUILD" = x ; then - CC_FOR_BUILD=no_compiler_found ; - fi - ;; - ,,*) CC_FOR_BUILD=$CC ;; - ,*,*) CC_FOR_BUILD=$HOST_CC ;; -esac ; set_cc_for_build= ;' - -# This is needed to find uname on a Pyramid OSx when run in the BSD universe. -# (ghazi@noc.rutgers.edu 1994-08-24) -if (test -f /.attbin/uname) >/dev/null 2>&1 ; then - PATH=$PATH:/.attbin ; export PATH -fi - -UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown -UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown -UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown -UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown - -# Note: order is significant - the case branches are not exclusive. - -case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in - *:NetBSD:*:*) - # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, - # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently - # switched to ELF, *-*-netbsd* would select the old - # object file format. 
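For orientation, the script is normally run without arguments, either by configure or by hand when debugging a platform-detection problem, and prints a single CPU-VENDOR-OS triplet. On an x86_64 GNU/Linux host this vintage of the script reports, for example:

    $ ./config/config.guess
    x86_64-unknown-linux-gnu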
This provides both forward - # compatibility and a consistent mechanism for selecting the - # object file format. - # - # Note: NetBSD doesn't particularly care about the vendor - # portion of the name. We always set it to "unknown". - sysctl="sysctl -n hw.machine_arch" - UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ - /usr/sbin/$sysctl 2>/dev/null || echo unknown)` - case "${UNAME_MACHINE_ARCH}" in - armeb) machine=armeb-unknown ;; - arm*) machine=arm-unknown ;; - sh3el) machine=shl-unknown ;; - sh3eb) machine=sh-unknown ;; - *) machine=${UNAME_MACHINE_ARCH}-unknown ;; - esac - # The Operating System including object format, if it has switched - # to ELF recently, or will in the future. - case "${UNAME_MACHINE_ARCH}" in - arm*|i386|m68k|ns32k|sh3*|sparc|vax) - eval $set_cc_for_build - if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep __ELF__ >/dev/null - then - # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). - # Return netbsd for either. FIX? - os=netbsd - else - os=netbsdelf - fi - ;; - *) - os=netbsd - ;; - esac - # The OS release - # Debian GNU/NetBSD machines have a different userland, and - # thus, need a distinct triplet. However, they do not need - # kernel version information, so it can be replaced with a - # suitable tag, in the style of linux-gnu. - case "${UNAME_VERSION}" in - Debian*) - release='-gnu' - ;; - *) - release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` - ;; - esac - # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: - # contains redundant information, the shorter form: - # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. - echo "${machine}-${os}${release}" - exit ;; - *:OpenBSD:*:*) - UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` - echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} - exit ;; - *:ekkoBSD:*:*) - echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} - exit ;; - *:SolidBSD:*:*) - echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} - exit ;; - macppc:MirBSD:*:*) - echo powerpc-unknown-mirbsd${UNAME_RELEASE} - exit ;; - *:MirBSD:*:*) - echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} - exit ;; - alpha:OSF1:*:*) - case $UNAME_RELEASE in - *4.0) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` - ;; - *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` - ;; - esac - # According to Compaq, /usr/sbin/psrinfo has been available on - # OSF/1 and Tru64 systems produced since 1995. I hope that - # covers most systems running today. This code pipes the CPU - # types through head -n 1, so we only detect the type of CPU 0. - ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` - case "$ALPHA_CPU_TYPE" in - "EV4 (21064)") - UNAME_MACHINE="alpha" ;; - "EV4.5 (21064)") - UNAME_MACHINE="alpha" ;; - "LCA4 (21066/21068)") - UNAME_MACHINE="alpha" ;; - "EV5 (21164)") - UNAME_MACHINE="alphaev5" ;; - "EV5.6 (21164A)") - UNAME_MACHINE="alphaev56" ;; - "EV5.6 (21164PC)") - UNAME_MACHINE="alphapca56" ;; - "EV5.7 (21164PC)") - UNAME_MACHINE="alphapca57" ;; - "EV6 (21264)") - UNAME_MACHINE="alphaev6" ;; - "EV6.7 (21264A)") - UNAME_MACHINE="alphaev67" ;; - "EV6.8CB (21264C)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8AL (21264B)") - UNAME_MACHINE="alphaev68" ;; - "EV6.8CX (21264D)") - UNAME_MACHINE="alphaev68" ;; - "EV6.9A (21264/EV69A)") - UNAME_MACHINE="alphaev69" ;; - "EV7 (21364)") - UNAME_MACHINE="alphaev7" ;; - "EV7.9 (21364A)") - UNAME_MACHINE="alphaev79" ;; - esac - # A Pn.n version is a patched version. - # A Vn.n version is a released version. 
- # A Tn.n version is a released field test version. - # A Xn.n version is an unreleased experimental baselevel. - # 1.2 uses "1.2" for uname -r. - echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; - Amiga*:UNIX_System_V:4.0:*) - echo m68k-unknown-sysv4 - exit ;; - *:[Aa]miga[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-amigaos - exit ;; - *:[Mm]orph[Oo][Ss]:*:*) - echo ${UNAME_MACHINE}-unknown-morphos - exit ;; - *:OS/390:*:*) - echo i370-ibm-openedition - exit ;; - *:z/VM:*:*) - echo s390-ibm-zvmoe - exit ;; - *:OS400:*:*) - echo powerpc-ibm-os400 - exit ;; - arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) - echo arm-acorn-riscix${UNAME_RELEASE} - exit ;; - arm:riscos:*:*|arm:RISCOS:*:*) - echo arm-unknown-riscos - exit ;; - SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) - echo hppa1.1-hitachi-hiuxmpp - exit ;; - Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) - # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. - if test "`(/bin/universe) 2>/dev/null`" = att ; then - echo pyramid-pyramid-sysv3 - else - echo pyramid-pyramid-bsd - fi - exit ;; - NILE*:*:*:dcosx) - echo pyramid-pyramid-svr4 - exit ;; - DRS?6000:unix:4.0:6*) - echo sparc-icl-nx6 - exit ;; - DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) - case `/usr/bin/uname -p` in - sparc) echo sparc-icl-nx7; exit ;; - esac ;; - sun4H:SunOS:5.*:*) - echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) - echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - i86pc:SunOS:5.*:*) - echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:6*:*) - # According to config.sub, this is the proper way to canonicalize - # SunOS6. Hard to guess exactly what SunOS6 will be like, but - # it's likely to be more like Solaris than SunOS4. - echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - sun4*:SunOS:*:*) - case "`/usr/bin/arch -k`" in - Series*|S4*) - UNAME_RELEASE=`uname -v` - ;; - esac - # Japanese Language versions have a version number like `4.1.3-JL'. - echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` - exit ;; - sun3*:SunOS:*:*) - echo m68k-sun-sunos${UNAME_RELEASE} - exit ;; - sun*:*:4.2BSD:*) - UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` - test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 - case "`/bin/arch`" in - sun3) - echo m68k-sun-sunos${UNAME_RELEASE} - ;; - sun4) - echo sparc-sun-sunos${UNAME_RELEASE} - ;; - esac - exit ;; - aushp:SunOS:*:*) - echo sparc-auspex-sunos${UNAME_RELEASE} - exit ;; - # The situation for MiNT is a little confusing. The machine name - # can be virtually everything (everything which is not - # "atarist" or "atariste" at least should have a processor - # > m68000). The system name ranges from "MiNT" over "FreeMiNT" - # to the lowercase version "mint" (or "freemint"). Finally - # the system name "TOS" denotes a system which is actually not - # MiNT. But MiNT is downward compatible to TOS, so this should - # be no problem. 
- atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} - exit ;; - milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; - hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; - *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; - m68k:machten:*:*) - echo m68k-apple-machten${UNAME_RELEASE} - exit ;; - powerpc:machten:*:*) - echo powerpc-apple-machten${UNAME_RELEASE} - exit ;; - RISC*:Mach:*:*) - echo mips-dec-mach_bsd4.3 - exit ;; - RISC*:ULTRIX:*:*) - echo mips-dec-ultrix${UNAME_RELEASE} - exit ;; - VAX*:ULTRIX*:*:*) - echo vax-dec-ultrix${UNAME_RELEASE} - exit ;; - 2020:CLIX:*:* | 2430:CLIX:*:*) - echo clipper-intergraph-clix${UNAME_RELEASE} - exit ;; - mips:*:*:UMIPS | mips:*:*:RISCos) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c -#ifdef __cplusplus -#include /* for printf() prototype */ - int main (int argc, char *argv[]) { -#else - int main (argc, argv) int argc; char *argv[]; { -#endif - #if defined (host_mips) && defined (MIPSEB) - #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); - #endif - #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); - #endif - #endif - exit (-1); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && - dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && - SYSTEM_NAME=`$dummy $dummyarg` && - { echo "$SYSTEM_NAME"; exit; } - echo mips-mips-riscos${UNAME_RELEASE} - exit ;; - Motorola:PowerMAX_OS:*:*) - echo powerpc-motorola-powermax - exit ;; - Motorola:*:4.3:PL8-*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) - echo powerpc-harris-powermax - exit ;; - Night_Hawk:Power_UNIX:*:*) - echo powerpc-harris-powerunix - exit ;; - m88k:CX/UX:7*:*) - echo m88k-harris-cxux7 - exit ;; - m88k:*:4*:R4*) - echo m88k-motorola-sysv4 - exit ;; - m88k:*:3*:R3*) - echo m88k-motorola-sysv3 - exit ;; - AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` - if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] - then - if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ - [ ${TARGET_BINARY_INTERFACE}x = x ] - then - echo m88k-dg-dgux${UNAME_RELEASE} - else - echo m88k-dg-dguxbcs${UNAME_RELEASE} - fi - else - echo i586-dg-dgux${UNAME_RELEASE} - fi - exit ;; - M88*:DolphinOS:*:*) # DolphinOS (SVR3) - echo m88k-dolphin-sysv3 - exit ;; - M88*:*:R3*:*) - # Delta 88k system running SVR3 - echo m88k-motorola-sysv3 - exit ;; - XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) - echo m88k-tektronix-sysv3 - exit ;; - Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) - echo m68k-tektronix-bsd - exit ;; - *:IRIX*:*:*) - echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` - exit ;; - ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
- echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id - exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' - i*86:AIX:*:*) - echo i386-ibm-aix - exit ;; - ia64:AIX:*:*) - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} - exit ;; - *:AIX:2:3) - if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - - main() - { - if (!__power_pc()) - exit(1); - puts("powerpc-ibm-aix3.2.5"); - exit(0); - } -EOF - if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` - then - echo "$SYSTEM_NAME" - else - echo rs6000-ibm-aix3.2.5 - fi - elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then - echo rs6000-ibm-aix3.2.4 - else - echo rs6000-ibm-aix3.2 - fi - exit ;; - *:AIX:*:[45]) - IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` - if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then - IBM_ARCH=rs6000 - else - IBM_ARCH=powerpc - fi - if [ -x /usr/bin/oslevel ] ; then - IBM_REV=`/usr/bin/oslevel` - else - IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} - fi - echo ${IBM_ARCH}-ibm-aix${IBM_REV} - exit ;; - *:AIX:*:*) - echo rs6000-ibm-aix - exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) - echo romp-ibm-bsd4.4 - exit ;; - ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and - echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to - exit ;; # report: romp-ibm BSD 4.3 - *:BOSX:*:*) - echo rs6000-bull-bosx - exit ;; - DPX/2?00:B.O.S.:*:*) - echo m68k-bull-sysv3 - exit ;; - 9000/[34]??:4.3bsd:1.*:*) - echo m68k-hp-bsd - exit ;; - hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) - echo m68k-hp-bsd4.4 - exit ;; - 9000/[34678]??:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? ) HP_ARCH=m68k ;; - 9000/[678][0-9][0-9]) - if [ -x /usr/bin/getconf ]; then - sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; - '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac - fi - if [ "${HP_ARCH}" = "" ]; then - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - - #define _HPUX_SOURCE - #include - #include - - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); - - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } -EOF - (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` - test -z "$HP_ARCH" && HP_ARCH=hppa - fi ;; - esac - if [ ${HP_ARCH} = "hppa2.0w" ] - then - eval $set_cc_for_build - - # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating - # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler - # generating 64-bit code. 
GNU and HP use different nomenclature: - # - # $ CC_FOR_BUILD=cc ./config.guess - # => hppa2.0w-hp-hpux11.23 - # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess - # => hppa64-hp-hpux11.23 - - if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep __LP64__ >/dev/null - then - HP_ARCH="hppa2.0w" - else - HP_ARCH="hppa64" - fi - fi - echo ${HP_ARCH}-hp-hpux${HPUX_REV} - exit ;; - ia64:HP-UX:*:*) - HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` - echo ia64-hp-hpux${HPUX_REV} - exit ;; - 3050*:HI-UX:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - int - main () - { - long cpu = sysconf (_SC_CPU_VERSION); - /* The order matters, because CPU_IS_HP_MC68K erroneously returns - true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct - results, however. */ - if (CPU_IS_PA_RISC (cpu)) - { - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; - case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; - default: puts ("hppa-hitachi-hiuxwe2"); break; - } - } - else if (CPU_IS_HP_MC68K (cpu)) - puts ("m68k-hitachi-hiuxwe2"); - else puts ("unknown-hitachi-hiuxwe2"); - exit (0); - } -EOF - $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - echo unknown-hitachi-hiuxwe2 - exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) - echo hppa1.1-hp-bsd - exit ;; - 9000/8??:4.3bsd:*:*) - echo hppa1.0-hp-bsd - exit ;; - *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) - echo hppa1.0-hp-mpeix - exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) - echo hppa1.1-hp-osf - exit ;; - hp8??:OSF1:*:*) - echo hppa1.0-hp-osf - exit ;; - i*86:OSF1:*:*) - if [ -x /usr/sbin/sysversion ] ; then - echo ${UNAME_MACHINE}-unknown-osf1mk - else - echo ${UNAME_MACHINE}-unknown-osf1 - fi - exit ;; - parisc*:Lites*:*:*) - echo hppa1.1-hp-lites - exit ;; - C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) - echo c1-convex-bsd - exit ;; - C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) - echo c34-convex-bsd - exit ;; - C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) - echo c38-convex-bsd - exit ;; - C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) - echo c4-convex-bsd - exit ;; - CRAY*Y-MP:*:*:*) - echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*[A-Z]90:*:*:*) - echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ - | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ - -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ - -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*TS:*:*:*) - echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*T3E:*:*:*) - echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - CRAY*SV1:*:*:*) - echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - *:UNICOS/mp:*:*) - echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' - exit ;; - F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) - FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 
's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; - i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) - echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} - exit ;; - sparc*:BSD/OS:*:*) - echo sparc-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:BSD/OS:*:*) - echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} - exit ;; - *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - esac - exit ;; - i*:CYGWIN*:*) - echo ${UNAME_MACHINE}-pc-cygwin - exit ;; - i*:MINGW*:*) - echo ${UNAME_MACHINE}-pc-mingw32 - exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 - exit ;; - i*:PW*:*) - echo ${UNAME_MACHINE}-pc-pw32 - exit ;; - x86:Interix*:[3456]*) - echo i586-pc-interix${UNAME_RELEASE} - exit ;; - EM64T:Interix*:[3456]*) - echo x86_64-unknown-interix${UNAME_RELEASE} - exit ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? - echo i586-pc-interix - exit ;; - i*:UWIN*:*) - echo ${UNAME_MACHINE}-pc-uwin - exit ;; - amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) - echo x86_64-unknown-cygwin - exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; - prep*:SunOS:5.*:*) - echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` - exit ;; - *:GNU:*:*) - # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` - exit ;; - *:GNU/*:*:*) - # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu - exit ;; - i*86:Minix:*:*) - echo ${UNAME_MACHINE}-pc-minix - exit ;; - arm*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - cris:Linux:*:*) - echo cris-axis-linux-gnu - exit ;; - crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu - exit ;; - frv:Linux:*:*) - echo frv-unknown-linux-gnu - exit ;; - ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - mips:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips - #undef mipsel - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mipsel - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips64 - #undef mips64el - #if defined(__MIPSEL__) || defined(__MIPSEL) 
|| defined(_MIPSEL) || defined(MIPSEL) - CPU=mips64el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips64 - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - or32:Linux:*:*) - echo or32-unknown-linux-gnu - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null - if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} - exit ;; - parisc:Linux:*:* | hppa:Linux:*:*) - # Look for CPU level - case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; - esac - exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu - exit ;; - s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux - exit ;; - sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu - exit ;; - vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu - exit ;; - x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu - exit ;; - i*86:Linux:*:*) - # The BFD linker knows what the default object file format is, so - # first see if it will tell us. cd to the root directory to prevent - # problems with other programs or directories called `ld' in the path. - # Set LC_ALL=C to ensure ld outputs messages in English. - ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ - | sed -ne '/supported targets:/!d - s/[ ][ ]*/ /g - s/.*supported targets: *// - s/ .*// - p'` - case "$ld_supported_targets" in - elf32-i386) - TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" - ;; - a.out-i386-linux) - echo "${UNAME_MACHINE}-pc-linux-gnuaout" - exit ;; - coff-i386) - echo "${UNAME_MACHINE}-pc-linux-gnucoff" - exit ;; - "") - # Either a pre-BFD a.out linker (linux-gnuoldld) or - # one that does not give us useful --help. - echo "${UNAME_MACHINE}-pc-linux-gnuoldld" - exit ;; - esac - # Determine whether the default compiler is a.out or elf - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include - #ifdef __ELF__ - # ifdef __GLIBC__ - # if __GLIBC__ >= 2 - LIBC=gnu - # else - LIBC=gnulibc1 - # endif - # else - LIBC=gnulibc1 - # endif - #else - #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) - LIBC=gnu - #else - LIBC=gnuaout - #endif - #endif - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^LIBC/{ - s: ::g - p - }'`" - test x"${LIBC}" != x && { - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit - } - test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } - ;; - i*86:DYNIX/ptx:4*:*) - # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. 
- # earlier versions are messed up and put the nodename in both - # sysname and nodename. - echo i386-sequent-sysv4 - exit ;; - i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, - # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. - echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} - exit ;; - i*86:OS/2:*:*) - # If we were able to find `uname', then EMX Unix compatibility - # is probably installed. - echo ${UNAME_MACHINE}-pc-os2-emx - exit ;; - i*86:XTS-300:*:STOP) - echo ${UNAME_MACHINE}-unknown-stop - exit ;; - i*86:atheos:*:*) - echo ${UNAME_MACHINE}-unknown-atheos - exit ;; - i*86:syllable:*:*) - echo ${UNAME_MACHINE}-pc-syllable - exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) - echo i386-unknown-lynxos${UNAME_RELEASE} - exit ;; - i*86:*DOS:*:*) - echo ${UNAME_MACHINE}-pc-msdosdjgpp - exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) - UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` - if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then - echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} - else - echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} - fi - exit ;; - i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. - case `/bin/uname -X | grep "^Machine"` in - *486*) UNAME_MACHINE=i486 ;; - *Pentium) UNAME_MACHINE=i586 ;; - *Pent*|*Celeron) UNAME_MACHINE=i686 ;; - esac - echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} - exit ;; - i*86:*:3.2:*) - if test -f /usr/options/cb.name; then - UNAME_REL=`sed -n 's/.*Version //p' /dev/null >/dev/null ; then - UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` - (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 - (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ - && UNAME_MACHINE=i586 - (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ - && UNAME_MACHINE=i686 - (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ - && UNAME_MACHINE=i686 - echo ${UNAME_MACHINE}-pc-sco$UNAME_REL - else - echo ${UNAME_MACHINE}-pc-sysv32 - fi - exit ;; - pc:*:*:*) - # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i386. - echo i386-pc-msdosdjgpp - exit ;; - Intel:Mach:3*:*) - echo i386-pc-mach3 - exit ;; - paragon:*:*:*) - echo i860-intel-osf1 - exit ;; - i860:*:4.*:*) # i860-SVR4 - if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then - echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 - else # Add other i860-SVR4 vendors below as they are discovered. 
- echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 - fi - exit ;; - mini*:CTIX:SYS*5:*) - # "miniframe" - echo m68010-convergent-sysv - exit ;; - mc68k:UNIX:SYSTEM5:3.51m) - echo m68k-convergent-sysv - exit ;; - M680?0:D-NIX:5.3:*) - echo m68k-diab-dnix - exit ;; - M68*:*:R3V[5678]*:*) - test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; - 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) - OS_REL='' - test -r /etc/.relid \ - && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4.3${OS_REL}; exit; } - /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ - && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; - 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; - m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) - echo m68k-unknown-lynxos${UNAME_RELEASE} - exit ;; - mc68030:UNIX_System_V:4.*:*) - echo m68k-atari-sysv4 - exit ;; - TSUNAMI:LynxOS:2.*:*) - echo sparc-unknown-lynxos${UNAME_RELEASE} - exit ;; - rs6000:LynxOS:2.*:*) - echo rs6000-unknown-lynxos${UNAME_RELEASE} - exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) - echo powerpc-unknown-lynxos${UNAME_RELEASE} - exit ;; - SM[BE]S:UNIX_SV:*:*) - echo mips-dde-sysv${UNAME_RELEASE} - exit ;; - RM*:ReliantUNIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - RM*:SINIX-*:*:*) - echo mips-sni-sysv4 - exit ;; - *:SINIX-*:*:*) - if uname -p 2>/dev/null >/dev/null ; then - UNAME_MACHINE=`(uname -p) 2>/dev/null` - echo ${UNAME_MACHINE}-sni-sysv4 - else - echo ns32k-sni-sysv - fi - exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says - echo i586-unisys-sysv4 - exit ;; - *:UNIX_System_V:4*:FTX*) - # From Gerald Hewes . - # How about differentiating between stratus architectures? -djm - echo hppa1.1-stratus-sysv4 - exit ;; - *:*:*:FTX*) - # From seanf@swdc.stratus.com. - echo i860-stratus-sysv4 - exit ;; - i*86:VOS:*:*) - # From Paul.Green@stratus.com. - echo ${UNAME_MACHINE}-stratus-vos - exit ;; - *:VOS:*:*) - # From Paul.Green@stratus.com. - echo hppa1.1-stratus-vos - exit ;; - mc68*:A/UX:*:*) - echo m68k-apple-aux${UNAME_RELEASE} - exit ;; - news*:NEWS-OS:6*:*) - echo mips-sony-newsos6 - exit ;; - R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) - if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} - else - echo mips-unknown-sysv${UNAME_RELEASE} - fi - exit ;; - BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. - echo powerpc-be-beos - exit ;; - BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. - echo powerpc-apple-beos - exit ;; - BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
- echo i586-pc-beos - exit ;; - SX-4:SUPER-UX:*:*) - echo sx4-nec-superux${UNAME_RELEASE} - exit ;; - SX-5:SUPER-UX:*:*) - echo sx5-nec-superux${UNAME_RELEASE} - exit ;; - SX-6:SUPER-UX:*:*) - echo sx6-nec-superux${UNAME_RELEASE} - exit ;; - Power*:Rhapsody:*:*) - echo powerpc-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Rhapsody:*:*) - echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} - exit ;; - *:Darwin:*:*) - UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - unknown) UNAME_PROCESSOR=powerpc ;; - esac - echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} - exit ;; - *:procnto*:*:* | *:QNX:[0123456789]*:*) - UNAME_PROCESSOR=`uname -p` - if test "$UNAME_PROCESSOR" = "x86"; then - UNAME_PROCESSOR=i386 - UNAME_MACHINE=pc - fi - echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} - exit ;; - *:QNX:*:4*) - echo i386-pc-qnx - exit ;; - NSE-?:NONSTOP_KERNEL:*:*) - echo nse-tandem-nsk${UNAME_RELEASE} - exit ;; - NSR-?:NONSTOP_KERNEL:*:*) - echo nsr-tandem-nsk${UNAME_RELEASE} - exit ;; - *:NonStop-UX:*:*) - echo mips-compaq-nonstopux - exit ;; - BS2000:POSIX*:*:*) - echo bs2000-siemens-sysv - exit ;; - DS/*:UNIX_System_V:*:*) - echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} - exit ;; - *:Plan9:*:*) - # "uname -m" is not consistent, so use $cputype instead. 386 - # is converted to i386 for consistency with other x86 - # operating systems. - if test "$cputype" = "386"; then - UNAME_MACHINE=i386 - else - UNAME_MACHINE="$cputype" - fi - echo ${UNAME_MACHINE}-unknown-plan9 - exit ;; - *:TOPS-10:*:*) - echo pdp10-unknown-tops10 - exit ;; - *:TENEX:*:*) - echo pdp10-unknown-tenex - exit ;; - KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) - echo pdp10-dec-tops20 - exit ;; - XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) - echo pdp10-xkl-tops20 - exit ;; - *:TOPS-20:*:*) - echo pdp10-unknown-tops20 - exit ;; - *:ITS:*:*) - echo pdp10-unknown-its - exit ;; - SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} - exit ;; - *:DragonFly:*:*) - echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` - exit ;; - *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` - case "${UNAME_MACHINE}" in - A*) echo alpha-dec-vms ; exit ;; - I*) echo ia64-dec-vms ; exit ;; - V*) echo vax-dec-vms ; exit ;; - esac ;; - *:XENIX:*:SysV) - echo i386-pc-xenix - exit ;; - i*86:skyos:*:*) - echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' - exit ;; - i*86:rdos:*:*) - echo ${UNAME_MACHINE}-pc-rdos - exit ;; -esac - -#echo '(No uname command or uname output not recognized.)' 1>&2 -#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 - -eval $set_cc_for_build -cat >$dummy.c < -# include -#endif -main () -{ -#if defined (sony) -#if defined (MIPSEB) - /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, - I don't know.... 
*/ - printf ("mips-sony-bsd\n"); exit (0); -#else -#include - printf ("m68k-sony-newsos%s\n", -#ifdef NEWSOS4 - "4" -#else - "" -#endif - ); exit (0); -#endif -#endif - -#if defined (__arm) && defined (__acorn) && defined (__unix) - printf ("arm-acorn-riscix\n"); exit (0); -#endif - -#if defined (hp300) && !defined (hpux) - printf ("m68k-hp-bsd\n"); exit (0); -#endif - -#if defined (NeXT) -#if !defined (__ARCHITECTURE__) -#define __ARCHITECTURE__ "m68k" -#endif - int version; - version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; - if (version < 4) - printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); - else - printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); - exit (0); -#endif - -#if defined (MULTIMAX) || defined (n16) -#if defined (UMAXV) - printf ("ns32k-encore-sysv\n"); exit (0); -#else -#if defined (CMU) - printf ("ns32k-encore-mach\n"); exit (0); -#else - printf ("ns32k-encore-bsd\n"); exit (0); -#endif -#endif -#endif - -#if defined (__386BSD__) - printf ("i386-pc-bsd\n"); exit (0); -#endif - -#if defined (sequent) -#if defined (i386) - printf ("i386-sequent-dynix\n"); exit (0); -#endif -#if defined (ns32000) - printf ("ns32k-sequent-dynix\n"); exit (0); -#endif -#endif - -#if defined (_SEQUENT_) - struct utsname un; - - uname(&un); - - if (strncmp(un.version, "V2", 2) == 0) { - printf ("i386-sequent-ptx2\n"); exit (0); - } - if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */ - printf ("i386-sequent-ptx1\n"); exit (0); - } - printf ("i386-sequent-ptx\n"); exit (0); - -#endif - -#if defined (vax) -# if !defined (ultrix) -# include -# if defined (BSD) -# if BSD == 43 - printf ("vax-dec-bsd4.3\n"); exit (0); -# else -# if BSD == 199006 - printf ("vax-dec-bsd4.3reno\n"); exit (0); -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# endif -# else - printf ("vax-dec-bsd\n"); exit (0); -# endif -# else - printf ("vax-dec-ultrix\n"); exit (0); -# endif -#endif - -#if defined (alliant) && defined (i860) - printf ("i860-alliant-bsd\n"); exit (0); -#endif - - exit (1); -} -EOF - -$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && - { echo "$SYSTEM_NAME"; exit; } - -# Apollos put the system type in the environment. - -test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } - -# Convex versions that predate uname can use getsysinfo(1) - -if [ -x /usr/convex/getsysinfo ] -then - case `getsysinfo -f cpu_type` in - c1*) - echo c1-convex-bsd - exit ;; - c2*) - if getsysinfo -f scalar_acc - then echo c32-convex-bsd - else echo c2-convex-bsd - fi - exit ;; - c34*) - echo c34-convex-bsd - exit ;; - c38*) - echo c38-convex-bsd - exit ;; - c4*) - echo c4-convex-bsd - exit ;; - esac -fi - -cat >&2 < in order to provide the needed -information to handle your system. 
- -config.guess timestamp = $timestamp - -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null` - -hostinfo = `(hostinfo) 2>/dev/null` -/bin/universe = `(/bin/universe) 2>/dev/null` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` -/bin/arch = `(/bin/arch) 2>/dev/null` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` - -UNAME_MACHINE = ${UNAME_MACHINE} -UNAME_RELEASE = ${UNAME_RELEASE} -UNAME_SYSTEM = ${UNAME_SYSTEM} -UNAME_VERSION = ${UNAME_VERSION} -EOF - -exit 1 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/kokkos/basic/optional/ThreadPool/config/config.sub b/kokkos/basic/optional/ThreadPool/config/config.sub deleted file mode 100755 index fab0aa3..0000000 --- a/kokkos/basic/optional/ThreadPool/config/config.sub +++ /dev/null @@ -1,1616 +0,0 @@ -#! /bin/sh -# Configuration validation subroutine script. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, -# Inc. - -timestamp='2006-09-20' - -# This file is (in principle) common to ALL GNU software. -# The presence of a machine in this file suggests that SOME GNU software -# can handle that machine. It does not imply ALL GNU software can. -# -# This file is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. -# -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Please send patches to . Submit a context -# diff and a properly formatted ChangeLog entry. -# -# Configuration subroutine to validate and canonicalize a configuration type. -# Supply the specified configuration type as an argument. -# If it is invalid, we print an error message on stderr and exit with code 1. -# Otherwise, we print the canonical config type on stdout and succeed. - -# This file is supposed to be the same for all GNU packages -# and recognize all the CPU types, system types and aliases -# that are meaningful with *any* GNU software. -# Each package is responsible for reporting which valid configurations -# it does not support. The user should be able to distinguish -# a failure to support a valid configuration from a meaningless -# configuration. 
- -# The goal of this file is to map all the various variations of a given -# machine specification into a single specification in the form: -# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM -# or in some cases, the newer four-part form: -# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM -# It is wrong to echo any other type of specification. - -me=`echo "$0" | sed -e 's,.*/,,'` - -usage="\ -Usage: $0 [OPTION] CPU-MFR-OPSYS - $0 [OPTION] ALIAS - -Canonicalize a configuration name. - -Operation modes: - -h, --help print this help, then exit - -t, --time-stamp print date of last modification, then exit - -v, --version print version number, then exit - -Report bugs and patches to ." - -version="\ -GNU config.sub ($timestamp) - -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 -Free Software Foundation, Inc. - -This is free software; see the source for copying conditions. There is NO -warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." - -help=" -Try \`$me --help' for more information." - -# Parse command line -while test $# -gt 0 ; do - case $1 in - --time-stamp | --time* | -t ) - echo "$timestamp" ; exit ;; - --version | -v ) - echo "$version" ; exit ;; - --help | --h* | -h ) - echo "$usage"; exit ;; - -- ) # Stop option processing - shift; break ;; - - ) # Use stdin as input. - break ;; - -* ) - echo "$me: invalid option $1$help" - exit 1 ;; - - *local*) - # First pass through any local machine types. - echo $1 - exit ;; - - * ) - break ;; - esac -done - -case $# in - 0) echo "$me: missing argument$help" >&2 - exit 1;; - 1) ;; - *) echo "$me: too many arguments$help" >&2 - exit 1;; -esac - -# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any). -# Here we must recognize all the valid KERNEL-OS combinations. -maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` -case $maybe_os in - nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \ - uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \ - storm-chaos* | os2-emx* | rtmk-nova*) - os=-$maybe_os - basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` - ;; - *) - basic_machine=`echo $1 | sed 's/-[^-]*$//'` - if [ $basic_machine != $1 ] - then os=`echo $1 | sed 's/.*-/-/'` - else os=; fi - ;; -esac - -### Let's recognize common machines as not being operating systems so -### that things like config.sub decstation-3100 work. We also -### recognize some manufacturers as not being operating systems, so we -### can provide default operating systems below. -case $os in - -sun*os*) - # Prevent following clause from handling this invalid input. 
- ;; - -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ - -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ - -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ - -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ - -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ - -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ - -apple | -axis | -knuth | -cray) - os= - basic_machine=$1 - ;; - -sim | -cisco | -oki | -wec | -winbond) - os= - basic_machine=$1 - ;; - -scout) - ;; - -wrs) - os=-vxworks - basic_machine=$1 - ;; - -chorusos*) - os=-chorusos - basic_machine=$1 - ;; - -chorusrdb) - os=-chorusrdb - basic_machine=$1 - ;; - -hiux*) - os=-hiuxwe2 - ;; - -sco6) - os=-sco5v6 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5) - os=-sco3.2v5 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco4) - os=-sco3.2v4 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2.[4-9]*) - os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco3.2v[4-9]*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco5v6*) - # Don't forget version if it is 3.2v4 or newer. - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -sco*) - os=-sco3.2v2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -udk*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -isc) - os=-isc2.2 - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -clix*) - basic_machine=clipper-intergraph - ;; - -isc*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` - ;; - -lynx*) - os=-lynxos - ;; - -ptx*) - basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` - ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` - ;; - -psos*) - os=-psos - ;; - -mint | -mint[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; -esac - -# Decode aliases for certain CPU-COMPANY combinations. -case $basic_machine in - # Recognize the basic CPU types without company name. - # Some are omitted here because they have special meanings below. 
- 1750a | 580 \ - | a29k \ - | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ - | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ - | am33_2.0 \ - | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ - | bfin \ - | c4x | clipper \ - | d10v | d30v | dlx | dsp16xx \ - | fr30 | frv \ - | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ - | i370 | i860 | i960 | ia64 \ - | ip2k | iq2000 \ - | m32c | m32r | m32rle | m68000 | m68k | m88k \ - | maxq | mb | microblaze | mcore \ - | mips | mipsbe | mipseb | mipsel | mipsle \ - | mips16 \ - | mips64 | mips64el \ - | mips64vr | mips64vrel \ - | mips64orion | mips64orionel \ - | mips64vr4100 | mips64vr4100el \ - | mips64vr4300 | mips64vr4300el \ - | mips64vr5000 | mips64vr5000el \ - | mips64vr5900 | mips64vr5900el \ - | mipsisa32 | mipsisa32el \ - | mipsisa32r2 | mipsisa32r2el \ - | mipsisa64 | mipsisa64el \ - | mipsisa64r2 | mipsisa64r2el \ - | mipsisa64sb1 | mipsisa64sb1el \ - | mipsisa64sr71k | mipsisa64sr71kel \ - | mipstx39 | mipstx39el \ - | mn10200 | mn10300 \ - | mt \ - | msp430 \ - | nios | nios2 \ - | ns16k | ns32k \ - | or32 \ - | pdp10 | pdp11 | pj | pjl \ - | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ - | pyramid \ - | score \ - | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ - | sh64 | sh64le \ - | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ - | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ - | spu | strongarm \ - | tahoe | thumb | tic4x | tic80 | tron \ - | v850 | v850e \ - | we32k \ - | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ - | z8k) - basic_machine=$basic_machine-unknown - ;; - m6811 | m68hc11 | m6812 | m68hc12) - # Motorola 68HC11/12. - basic_machine=$basic_machine-unknown - os=-none - ;; - m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) - ;; - ms1) - basic_machine=mt-unknown - ;; - - # We use `pc' rather than `unknown' - # because (1) that's what they normally are, and - # (2) the word "unknown" tends to confuse beginning users. - i*86 | x86_64) - basic_machine=$basic_machine-pc - ;; - # Object if more than one company name word. - *-*-*) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; - # Recognize the basic CPU types with company name. 
- 580-* \ - | a29k-* \ - | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ - | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ - | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ - | avr-* | avr32-* \ - | bfin-* | bs2000-* \ - | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ - | clipper-* | craynv-* | cydra-* \ - | d10v-* | d30v-* | dlx-* \ - | elxsi-* \ - | f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \ - | h8300-* | h8500-* \ - | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ - | i*86-* | i860-* | i960-* | ia64-* \ - | ip2k-* | iq2000-* \ - | m32c-* | m32r-* | m32rle-* \ - | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ - | m88110-* | m88k-* | maxq-* | mcore-* \ - | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ - | mips16-* \ - | mips64-* | mips64el-* \ - | mips64vr-* | mips64vrel-* \ - | mips64orion-* | mips64orionel-* \ - | mips64vr4100-* | mips64vr4100el-* \ - | mips64vr4300-* | mips64vr4300el-* \ - | mips64vr5000-* | mips64vr5000el-* \ - | mips64vr5900-* | mips64vr5900el-* \ - | mipsisa32-* | mipsisa32el-* \ - | mipsisa32r2-* | mipsisa32r2el-* \ - | mipsisa64-* | mipsisa64el-* \ - | mipsisa64r2-* | mipsisa64r2el-* \ - | mipsisa64sb1-* | mipsisa64sb1el-* \ - | mipsisa64sr71k-* | mipsisa64sr71kel-* \ - | mipstx39-* | mipstx39el-* \ - | mmix-* \ - | mt-* \ - | msp430-* \ - | nios-* | nios2-* \ - | none-* | np1-* | ns16k-* | ns32k-* \ - | orion-* \ - | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ - | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ - | pyramid-* \ - | romp-* | rs6000-* \ - | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ - | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ - | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ - | sparclite-* \ - | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ - | tahoe-* | thumb-* \ - | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ - | tron-* \ - | v850-* | v850e-* | vax-* \ - | we32k-* \ - | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ - | xstormy16-* | xtensa-* \ - | ymp-* \ - | z8k-*) - ;; - # Recognize the various machine names and aliases which stand - # for a CPU type and a company and sometimes even an OS. 
- 386bsd) - basic_machine=i386-unknown - os=-bsd - ;; - 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) - basic_machine=m68000-att - ;; - 3b*) - basic_machine=we32k-att - ;; - a29khif) - basic_machine=a29k-amd - os=-udi - ;; - abacus) - basic_machine=abacus-unknown - ;; - adobe68k) - basic_machine=m68010-adobe - os=-scout - ;; - alliant | fx80) - basic_machine=fx80-alliant - ;; - altos | altos3068) - basic_machine=m68k-altos - ;; - am29k) - basic_machine=a29k-none - os=-bsd - ;; - amd64) - basic_machine=x86_64-pc - ;; - amd64-*) - basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - amdahl) - basic_machine=580-amdahl - os=-sysv - ;; - amiga | amiga-*) - basic_machine=m68k-unknown - ;; - amigaos | amigados) - basic_machine=m68k-unknown - os=-amigaos - ;; - amigaunix | amix) - basic_machine=m68k-unknown - os=-sysv4 - ;; - apollo68) - basic_machine=m68k-apollo - os=-sysv - ;; - apollo68bsd) - basic_machine=m68k-apollo - os=-bsd - ;; - aux) - basic_machine=m68k-apple - os=-aux - ;; - balance) - basic_machine=ns32k-sequent - os=-dynix - ;; - c90) - basic_machine=c90-cray - os=-unicos - ;; - convex-c1) - basic_machine=c1-convex - os=-bsd - ;; - convex-c2) - basic_machine=c2-convex - os=-bsd - ;; - convex-c32) - basic_machine=c32-convex - os=-bsd - ;; - convex-c34) - basic_machine=c34-convex - os=-bsd - ;; - convex-c38) - basic_machine=c38-convex - os=-bsd - ;; - cray | j90) - basic_machine=j90-cray - os=-unicos - ;; - craynv) - basic_machine=craynv-cray - os=-unicosmp - ;; - cr16c) - basic_machine=cr16c-unknown - os=-elf - ;; - crds | unos) - basic_machine=m68k-crds - ;; - crisv32 | crisv32-* | etraxfs*) - basic_machine=crisv32-axis - ;; - cris | cris-* | etrax*) - basic_machine=cris-axis - ;; - crx) - basic_machine=crx-unknown - os=-elf - ;; - da30 | da30-*) - basic_machine=m68k-da30 - ;; - decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) - basic_machine=mips-dec - ;; - decsystem10* | dec10*) - basic_machine=pdp10-dec - os=-tops10 - ;; - decsystem20* | dec20*) - basic_machine=pdp10-dec - os=-tops20 - ;; - delta | 3300 | motorola-3300 | motorola-delta \ - | 3300-motorola | delta-motorola) - basic_machine=m68k-motorola - ;; - delta88) - basic_machine=m88k-motorola - os=-sysv3 - ;; - djgpp) - basic_machine=i586-pc - os=-msdosdjgpp - ;; - dpx20 | dpx20-*) - basic_machine=rs6000-bull - os=-bosx - ;; - dpx2* | dpx2*-bull) - basic_machine=m68k-bull - os=-sysv3 - ;; - ebmon29k) - basic_machine=a29k-amd - os=-ebmon - ;; - elxsi) - basic_machine=elxsi-elxsi - os=-bsd - ;; - encore | umax | mmax) - basic_machine=ns32k-encore - ;; - es1800 | OSE68k | ose68k | ose | OSE) - basic_machine=m68k-ericsson - os=-ose - ;; - fx2800) - basic_machine=i860-alliant - ;; - genix) - basic_machine=ns32k-ns - ;; - gmicro) - basic_machine=tron-gmicro - os=-sysv - ;; - go32) - basic_machine=i386-pc - os=-go32 - ;; - h3050r* | hiux*) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - h8300hms) - basic_machine=h8300-hitachi - os=-hms - ;; - h8300xray) - basic_machine=h8300-hitachi - os=-xray - ;; - h8500hms) - basic_machine=h8500-hitachi - os=-hms - ;; - harris) - basic_machine=m88k-harris - os=-sysv3 - ;; - hp300-*) - basic_machine=m68k-hp - ;; - hp300bsd) - basic_machine=m68k-hp - os=-bsd - ;; - hp300hpux) - basic_machine=m68k-hp - os=-hpux - ;; - hp3k9[0-9][0-9] | hp9[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hp9k2[0-9][0-9] | hp9k31[0-9]) - basic_machine=m68000-hp - ;; - hp9k3[2-9][0-9]) - basic_machine=m68k-hp - ;; - hp9k6[0-9][0-9] | hp6[0-9][0-9]) - 
basic_machine=hppa1.0-hp - ;; - hp9k7[0-79][0-9] | hp7[0-79][0-9]) - basic_machine=hppa1.1-hp - ;; - hp9k78[0-9] | hp78[0-9]) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) - # FIXME: really hppa2.0-hp - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][13679] | hp8[0-9][13679]) - basic_machine=hppa1.1-hp - ;; - hp9k8[0-9][0-9] | hp8[0-9][0-9]) - basic_machine=hppa1.0-hp - ;; - hppa-next) - os=-nextstep3 - ;; - hppaosf) - basic_machine=hppa1.1-hp - os=-osf - ;; - hppro) - basic_machine=hppa1.1-hp - os=-proelf - ;; - i370-ibm* | ibm*) - basic_machine=i370-ibm - ;; -# I'm not sure what "Sysv32" means. Should this be sysv3.2? - i*86v32) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv32 - ;; - i*86v4*) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv4 - ;; - i*86v) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-sysv - ;; - i*86sol2) - basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` - os=-solaris2 - ;; - i386mach) - basic_machine=i386-mach - os=-mach - ;; - i386-vsta | vsta) - basic_machine=i386-unknown - os=-vsta - ;; - iris | iris4d) - basic_machine=mips-sgi - case $os in - -irix*) - ;; - *) - os=-irix4 - ;; - esac - ;; - isi68 | isi) - basic_machine=m68k-isi - os=-sysv - ;; - m88k-omron*) - basic_machine=m88k-omron - ;; - magnum | m3230) - basic_machine=mips-mips - os=-sysv - ;; - merlin) - basic_machine=ns32k-utek - os=-sysv - ;; - mingw32) - basic_machine=i386-pc - os=-mingw32 - ;; - miniframe) - basic_machine=m68000-convergent - ;; - *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) - basic_machine=m68k-atari - os=-mint - ;; - mips3*-*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` - ;; - mips3*) - basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown - ;; - monitor) - basic_machine=m68k-rom68k - os=-coff - ;; - morphos) - basic_machine=powerpc-unknown - os=-morphos - ;; - msdos) - basic_machine=i386-pc - os=-msdos - ;; - ms1-*) - basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` - ;; - mvs) - basic_machine=i370-ibm - os=-mvs - ;; - ncr3000) - basic_machine=i486-ncr - os=-sysv4 - ;; - netbsd386) - basic_machine=i386-unknown - os=-netbsd - ;; - netwinder) - basic_machine=armv4l-rebel - os=-linux - ;; - news | news700 | news800 | news900) - basic_machine=m68k-sony - os=-newsos - ;; - news1000) - basic_machine=m68030-sony - os=-newsos - ;; - news-3600 | risc-news) - basic_machine=mips-sony - os=-newsos - ;; - necv70) - basic_machine=v70-nec - os=-sysv - ;; - next | m*-next ) - basic_machine=m68k-next - case $os in - -nextstep* ) - ;; - -ns2*) - os=-nextstep2 - ;; - *) - os=-nextstep3 - ;; - esac - ;; - nh3000) - basic_machine=m68k-harris - os=-cxux - ;; - nh[45]000) - basic_machine=m88k-harris - os=-cxux - ;; - nindy960) - basic_machine=i960-intel - os=-nindy - ;; - mon960) - basic_machine=i960-intel - os=-mon960 - ;; - nonstopux) - basic_machine=mips-compaq - os=-nonstopux - ;; - np1) - basic_machine=np1-gould - ;; - nsr-tandem) - basic_machine=nsr-tandem - ;; - op50n-* | op60c-*) - basic_machine=hppa1.1-oki - os=-proelf - ;; - openrisc | openrisc-*) - basic_machine=or32-unknown - ;; - os400) - basic_machine=powerpc-ibm - os=-os400 - ;; - OSE68000 | ose68000) - basic_machine=m68000-ericsson - os=-ose - ;; - os68k) - basic_machine=m68k-none - os=-os68k - ;; - pa-hitachi) - basic_machine=hppa1.1-hitachi - os=-hiuxwe2 - ;; - paragon) - basic_machine=i860-intel - os=-osf - ;; - pbd) - basic_machine=sparc-tti - ;; - pbb) - 
basic_machine=m68k-tti - ;; - pc532 | pc532-*) - basic_machine=ns32k-pc532 - ;; - pc98) - basic_machine=i386-pc - ;; - pc98-*) - basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium | p5 | k5 | k6 | nexgen | viac3) - basic_machine=i586-pc - ;; - pentiumpro | p6 | 6x86 | athlon | athlon_*) - basic_machine=i686-pc - ;; - pentiumii | pentium2 | pentiumiii | pentium3) - basic_machine=i686-pc - ;; - pentium4) - basic_machine=i786-pc - ;; - pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) - basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumpro-* | p6-* | 6x86-* | athlon-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) - basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pentium4-*) - basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - pn) - basic_machine=pn-gould - ;; - power) basic_machine=power-ibm - ;; - ppc) basic_machine=powerpc-unknown - ;; - ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppcle | powerpclittle | ppc-le | powerpc-little) - basic_machine=powerpcle-unknown - ;; - ppcle-* | powerpclittle-*) - basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64) basic_machine=powerpc64-unknown - ;; - ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ppc64le | powerpc64little | ppc64-le | powerpc64-little) - basic_machine=powerpc64le-unknown - ;; - ppc64le-* | powerpc64little-*) - basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` - ;; - ps2) - basic_machine=i386-ibm - ;; - pw32) - basic_machine=i586-unknown - os=-pw32 - ;; - rdos) - basic_machine=i386-pc - os=-rdos - ;; - rom68k) - basic_machine=m68k-rom68k - os=-coff - ;; - rm[46]00) - basic_machine=mips-siemens - ;; - rtpc | rtpc-*) - basic_machine=romp-ibm - ;; - s390 | s390-*) - basic_machine=s390-ibm - ;; - s390x | s390x-*) - basic_machine=s390x-ibm - ;; - sa29200) - basic_machine=a29k-amd - os=-udi - ;; - sb1) - basic_machine=mipsisa64sb1-unknown - ;; - sb1el) - basic_machine=mipsisa64sb1el-unknown - ;; - sde) - basic_machine=mipsisa32-sde - os=-elf - ;; - sei) - basic_machine=mips-sei - os=-seiux - ;; - sequent) - basic_machine=i386-sequent - ;; - sh) - basic_machine=sh-hitachi - os=-hms - ;; - sh64) - basic_machine=sh64-unknown - ;; - sparclite-wrs | simso-wrs) - basic_machine=sparclite-wrs - os=-vxworks - ;; - sps7) - basic_machine=m68k-bull - os=-sysv2 - ;; - spur) - basic_machine=spur-unknown - ;; - st2000) - basic_machine=m68k-tandem - ;; - stratus) - basic_machine=i860-stratus - os=-sysv4 - ;; - sun2) - basic_machine=m68000-sun - ;; - sun2os3) - basic_machine=m68000-sun - os=-sunos3 - ;; - sun2os4) - basic_machine=m68000-sun - os=-sunos4 - ;; - sun3os3) - basic_machine=m68k-sun - os=-sunos3 - ;; - sun3os4) - basic_machine=m68k-sun - os=-sunos4 - ;; - sun4os3) - basic_machine=sparc-sun - os=-sunos3 - ;; - sun4os4) - basic_machine=sparc-sun - os=-sunos4 - ;; - sun4sol2) - basic_machine=sparc-sun - os=-solaris2 - ;; - sun3 | sun3-*) - basic_machine=m68k-sun - ;; - sun4) - basic_machine=sparc-sun - ;; - sun386 | sun386i | roadrunner) - basic_machine=i386-sun - ;; - sv1) - basic_machine=sv1-cray - os=-unicos - ;; - symmetry) - basic_machine=i386-sequent - os=-dynix - ;; - t3e) - basic_machine=alphaev5-cray - os=-unicos - ;; - t90) - basic_machine=t90-cray - os=-unicos - ;; - tic54x | c54x*) - basic_machine=tic54x-unknown - os=-coff - ;; - tic55x | c55x*) - 
basic_machine=tic55x-unknown - os=-coff - ;; - tic6x | c6x*) - basic_machine=tic6x-unknown - os=-coff - ;; - tx39) - basic_machine=mipstx39-unknown - ;; - tx39el) - basic_machine=mipstx39el-unknown - ;; - toad1) - basic_machine=pdp10-xkl - os=-tops20 - ;; - tower | tower-32) - basic_machine=m68k-ncr - ;; - tpf) - basic_machine=s390x-ibm - os=-tpf - ;; - udi29k) - basic_machine=a29k-amd - os=-udi - ;; - ultra3) - basic_machine=a29k-nyu - os=-sym1 - ;; - v810 | necv810) - basic_machine=v810-nec - os=-none - ;; - vaxv) - basic_machine=vax-dec - os=-sysv - ;; - vms) - basic_machine=vax-dec - os=-vms - ;; - vpp*|vx|vx-*) - basic_machine=f301-fujitsu - ;; - vxworks960) - basic_machine=i960-wrs - os=-vxworks - ;; - vxworks68) - basic_machine=m68k-wrs - os=-vxworks - ;; - vxworks29k) - basic_machine=a29k-wrs - os=-vxworks - ;; - w65*) - basic_machine=w65-wdc - os=-none - ;; - w89k-*) - basic_machine=hppa1.1-winbond - os=-proelf - ;; - xbox) - basic_machine=i686-pc - os=-mingw32 - ;; - xps | xps100) - basic_machine=xps100-honeywell - ;; - ymp) - basic_machine=ymp-cray - os=-unicos - ;; - z8k-*-coff) - basic_machine=z8k-unknown - os=-sim - ;; - none) - basic_machine=none-none - os=-none - ;; - -# Here we handle the default manufacturer of certain CPU types. It is in -# some cases the only manufacturer, in others, it is the most popular. - w89k) - basic_machine=hppa1.1-winbond - ;; - op50n) - basic_machine=hppa1.1-oki - ;; - op60c) - basic_machine=hppa1.1-oki - ;; - romp) - basic_machine=romp-ibm - ;; - mmix) - basic_machine=mmix-knuth - ;; - rs6000) - basic_machine=rs6000-ibm - ;; - vax) - basic_machine=vax-dec - ;; - pdp10) - # there are many clones, so DEC is not a safe bet - basic_machine=pdp10-unknown - ;; - pdp11) - basic_machine=pdp11-dec - ;; - we32k) - basic_machine=we32k-att - ;; - sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele) - basic_machine=sh-unknown - ;; - sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) - basic_machine=sparc-sun - ;; - cydra) - basic_machine=cydra-cydrome - ;; - orion) - basic_machine=orion-highlevel - ;; - orion105) - basic_machine=clipper-highlevel - ;; - mac | mpw | mac-mpw) - basic_machine=m68k-apple - ;; - pmac | pmac-mpw) - basic_machine=powerpc-apple - ;; - *-unknown) - # Make sure to match an already-canonicalized machine name. - ;; - *) - echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 - exit 1 - ;; -esac - -# Here we canonicalize certain aliases for manufacturers. -case $basic_machine in - *-digital*) - basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` - ;; - *-commodore*) - basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` - ;; - *) - ;; -esac - -# Decode manufacturer-specific aliases for certain operating systems. - -if [ x"$os" != x"" ] -then -case $os in - # First match some system type aliases - # that might get confused with valid system types. - # -solaris* is a basic system type, with this one exception. - -solaris1 | -solaris1.*) - os=`echo $os | sed -e 's|solaris1|sunos4|'` - ;; - -solaris) - os=-solaris2 - ;; - -svr4*) - os=-sysv4 - ;; - -unixware*) - os=-sysv4.2uw - ;; - -gnu/linux*) - os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` - ;; - # First accept the basic system types. - # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. - # -sysv* is not here because it comes later, after sysvr4. 
- -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ - | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ - | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ - | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ - | -aos* \ - | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ - | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ - | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ - | -openbsd* | -solidbsd* \ - | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ - | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ - | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ - | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* \ - | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ - | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \ - | -uxpv* | -beos* | -mpeix* | -udk* \ - | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ - | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ - | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ - | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ - | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ - | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ - | -skyos* | -haiku* | -rdos* | -toppers*) - # Remember, each alternative MUST END IN *, to match a version number. - ;; - -qnx*) - case $basic_machine in - x86-* | i*86-*) - ;; - *) - os=-nto$os - ;; - esac - ;; - -nto-qnx*) - ;; - -nto*) - os=`echo $os | sed -e 's|nto|nto-qnx|'` - ;; - -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ - | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ - | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) - ;; - -mac*) - os=`echo $os | sed -e 's|mac|macos|'` - ;; - -linux-dietlibc) - os=-linux-dietlibc - ;; - -linux*) - os=`echo $os | sed -e 's|linux|linux-gnu|'` - ;; - -sunos5*) - os=`echo $os | sed -e 's|sunos5|solaris2|'` - ;; - -sunos6*) - os=`echo $os | sed -e 's|sunos6|solaris3|'` - ;; - -opened*) - os=-openedition - ;; - -os400*) - os=-os400 - ;; - -wince*) - os=-wince - ;; - -osfrose*) - os=-osfrose - ;; - -osf*) - os=-osf - ;; - -utek*) - os=-bsd - ;; - -dynix*) - os=-bsd - ;; - -acis*) - os=-aos - ;; - -atheos*) - os=-atheos - ;; - -syllable*) - os=-syllable - ;; - -386bsd) - os=-bsd - ;; - -ctix* | -uts*) - os=-sysv - ;; - -nova*) - os=-rtmk-nova - ;; - -ns2 ) - os=-nextstep2 - ;; - -nsk*) - os=-nsk - ;; - # Preserve the version number of sinix5. - -sinix5.*) - os=`echo $os | sed -e 's|sinix|sysv|'` - ;; - -sinix*) - os=-sysv4 - ;; - -tpf*) - os=-tpf - ;; - -triton*) - os=-sysv3 - ;; - -oss*) - os=-sysv3 - ;; - -svr4) - os=-sysv4 - ;; - -svr3) - os=-sysv3 - ;; - -sysvr4) - os=-sysv4 - ;; - # This must come after -sysvr4. - -sysv*) - ;; - -ose*) - os=-ose - ;; - -es1800*) - os=-ose - ;; - -xenix) - os=-xenix - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - os=-mint - ;; - -aros*) - os=-aros - ;; - -kaos*) - os=-kaos - ;; - -zvmoe) - os=-zvmoe - ;; - -none) - ;; - *) - # Get rid of the `-' at the beginning of $os. - os=`echo $os | sed 's/[^-]*-//'` - echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 - exit 1 - ;; -esac -else - -# Here we handle the default operating systems that come with various machines. -# The value should be what the vendor currently ships out the door with their -# machine or put another way, the most popular os provided with the machine. 
- -# Note that if you're going to try to match "-MANUFACTURER" here (say, -# "-sun"), then you have to tell the case statement up towards the top -# that MANUFACTURER isn't an operating system. Otherwise, code above -# will signal an error saying that MANUFACTURER isn't an operating -# system, and we'll never get to this point. - -case $basic_machine in - score-*) - os=-elf - ;; - spu-*) - os=-elf - ;; - *-acorn) - os=-riscix1.2 - ;; - arm*-rebel) - os=-linux - ;; - arm*-semi) - os=-aout - ;; - c4x-* | tic4x-*) - os=-coff - ;; - # This must come before the *-dec entry. - pdp10-*) - os=-tops20 - ;; - pdp11-*) - os=-none - ;; - *-dec | vax-*) - os=-ultrix4.2 - ;; - m68*-apollo) - os=-domain - ;; - i386-sun) - os=-sunos4.0.2 - ;; - m68000-sun) - os=-sunos3 - # This also exists in the configure program, but was not the - # default. - # os=-sunos4 - ;; - m68*-cisco) - os=-aout - ;; - mips*-cisco) - os=-elf - ;; - mips*-*) - os=-elf - ;; - or32-*) - os=-coff - ;; - *-tti) # must be before sparc entry or we get the wrong os. - os=-sysv3 - ;; - sparc-* | *-sun) - os=-sunos4.1.1 - ;; - *-be) - os=-beos - ;; - *-haiku) - os=-haiku - ;; - *-ibm) - os=-aix - ;; - *-knuth) - os=-mmixware - ;; - *-wec) - os=-proelf - ;; - *-winbond) - os=-proelf - ;; - *-oki) - os=-proelf - ;; - *-hp) - os=-hpux - ;; - *-hitachi) - os=-hiux - ;; - i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) - os=-sysv - ;; - *-cbm) - os=-amigaos - ;; - *-dg) - os=-dgux - ;; - *-dolphin) - os=-sysv3 - ;; - m68k-ccur) - os=-rtu - ;; - m88k-omron*) - os=-luna - ;; - *-next ) - os=-nextstep - ;; - *-sequent) - os=-ptx - ;; - *-crds) - os=-unos - ;; - *-ns) - os=-genix - ;; - i370-*) - os=-mvs - ;; - *-next) - os=-nextstep3 - ;; - *-gould) - os=-sysv - ;; - *-highlevel) - os=-bsd - ;; - *-encore) - os=-bsd - ;; - *-sgi) - os=-irix - ;; - *-siemens) - os=-sysv4 - ;; - *-masscomp) - os=-rtu - ;; - f30[01]-fujitsu | f700-fujitsu) - os=-uxpv - ;; - *-rom68k) - os=-coff - ;; - *-*bug) - os=-coff - ;; - *-apple) - os=-macos - ;; - *-atari*) - os=-mint - ;; - *) - os=-none - ;; -esac -fi - -# Here we handle the case where we know the os, and the CPU type, but not the -# manufacturer. We pick the logical manufacturer. -vendor=unknown -case $basic_machine in - *-unknown) - case $os in - -riscix*) - vendor=acorn - ;; - -sunos*) - vendor=sun - ;; - -aix*) - vendor=ibm - ;; - -beos*) - vendor=be - ;; - -hpux*) - vendor=hp - ;; - -mpeix*) - vendor=hp - ;; - -hiux*) - vendor=hitachi - ;; - -unos*) - vendor=crds - ;; - -dgux*) - vendor=dg - ;; - -luna*) - vendor=omron - ;; - -genix*) - vendor=ns - ;; - -mvs* | -opened*) - vendor=ibm - ;; - -os400*) - vendor=ibm - ;; - -ptx*) - vendor=sequent - ;; - -tpf*) - vendor=ibm - ;; - -vxsim* | -vxworks* | -windiss*) - vendor=wrs - ;; - -aux*) - vendor=apple - ;; - -hms*) - vendor=hitachi - ;; - -mpw* | -macos*) - vendor=apple - ;; - -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) - vendor=atari - ;; - -vos*) - vendor=stratus - ;; - esac - basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` - ;; -esac - -echo $basic_machine$os -exit - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "timestamp='" -# time-stamp-format: "%:y-%02m-%02d" -# time-stamp-end: "'" -# End: diff --git a/kokkos/basic/optional/ThreadPool/config/depcomp b/kokkos/basic/optional/ThreadPool/config/depcomp deleted file mode 100755 index ca5ea4e..0000000 --- a/kokkos/basic/optional/ThreadPool/config/depcomp +++ /dev/null @@ -1,584 +0,0 @@ -#! 
/bin/sh -# depcomp - compile a program generating dependencies as side-effects - -scriptversion=2006-10-15.18 - -# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006 Free Software -# Foundation, Inc. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301, USA. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -# Originally written by Alexandre Oliva . - -case $1 in - '') - echo "$0: No command. Try \`$0 --help' for more information." 1>&2 - exit 1; - ;; - -h | --h*) - cat <<\EOF -Usage: depcomp [--help] [--version] PROGRAM [ARGS] - -Run PROGRAMS ARGS to compile a file, generating dependencies -as side-effects. - -Environment variables: - depmode Dependency tracking mode. - source Source file read by `PROGRAMS ARGS'. - object Object file output by `PROGRAMS ARGS'. - DEPDIR directory where to store dependencies. - depfile Dependency file to output. - tmpdepfile Temporary file to use when outputing dependencies. - libtool Whether libtool is used (yes/no). - -Report bugs to . -EOF - exit $? - ;; - -v | --v*) - echo "depcomp $scriptversion" - exit $? - ;; -esac - -if test -z "$depmode" || test -z "$source" || test -z "$object"; then - echo "depcomp: Variables source, object and depmode must be set" 1>&2 - exit 1 -fi - -# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po. -depfile=${depfile-`echo "$object" | - sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`} -tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`} - -rm -f "$tmpdepfile" - -# Some modes work just like other modes, but use different flags. We -# parameterize here, but still list the modes in the big case below, -# to make depend.m4 easier to write. Note that we *cannot* use a case -# here, because this file can only contain one case statement. -if test "$depmode" = hp; then - # HP compiler uses -M and no extra arg. - gccflag=-M - depmode=gcc -fi - -if test "$depmode" = dashXmstdout; then - # This is just like dashmstdout with a different argument. - dashmflag=-xM - depmode=dashmstdout -fi - -case "$depmode" in -gcc3) -## gcc 3 implements dependency tracking that does exactly what -## we want. Yay! Note: for some reason libtool 1.4 doesn't like -## it if -MD -MP comes after the -MF stuff. Hmm. -## Unfortunately, FreeBSD c89 acceptance of flags depends upon -## the command line argument order; so add the flags where they -## appear in depend2.am. Note that the slowdown incurred here -## affects only configure: in makefiles, %FASTDEP% shortcuts this. 
- for arg - do - case $arg in - -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;; - *) set fnord "$@" "$arg" ;; - esac - shift # fnord - shift # $arg - done - "$@" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - mv "$tmpdepfile" "$depfile" - ;; - -gcc) -## There are various ways to get dependency output from gcc. Here's -## why we pick this rather obscure method: -## - Don't want to use -MD because we'd like the dependencies to end -## up in a subdir. Having to rename by hand is ugly. -## (We might end up doing this anyway to support other compilers.) -## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like -## -MM, not -M (despite what the docs say). -## - Using -M directly means running the compiler twice (even worse -## than renaming). - if test -z "$gccflag"; then - gccflag=-MD, - fi - "$@" -Wp,"$gccflag$tmpdepfile" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - echo "$object : \\" > "$depfile" - alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz -## The second -e expression handles DOS-style file names with drive letters. - sed -e 's/^[^:]*: / /' \ - -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile" -## This next piece of magic avoids the `deleted header file' problem. -## The problem is that when a header file which appears in a .P file -## is deleted, the dependency causes make to die (because there is -## typically no way to rebuild the header). We avoid this by adding -## dummy dependencies for each header file. Too bad gcc doesn't do -## this for us directly. - tr ' ' ' -' < "$tmpdepfile" | -## Some versions of gcc put a space before the `:'. On the theory -## that the space means something, we add a space to the output as -## well. -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp) - # This case exists only to let depend.m4 do its work. It works by - # looking at the text of this script. This case will never be run, - # since it is checked for above. - exit 1 - ;; - -sgi) - if test "$libtool" = yes; then - "$@" "-Wp,-MDupdate,$tmpdepfile" - else - "$@" -MDupdate "$tmpdepfile" - fi - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - - if test -f "$tmpdepfile"; then # yes, the sourcefile depend on other files - echo "$object : \\" > "$depfile" - - # Clip off the initial element (the dependent). Don't try to be - # clever and replace this with sed code, as IRIX sed won't handle - # lines with more than a fixed number of characters (4096 in - # IRIX 6.2 sed, 8192 in IRIX 6.5). We also remove comment lines; - # the IRIX cc adds comments like `#:fec' to the end of the - # dependency line. - tr ' ' ' -' < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \ - tr ' -' ' ' >> $depfile - echo >> $depfile - - # The second pass generates a dummy entry for each header file. - tr ' ' ' -' < "$tmpdepfile" \ - | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \ - >> $depfile - else - # The sourcefile does not contain any dependencies, so just - # store a dummy comment line, to avoid errors with the Makefile - # "include basename.Plo" scheme. 
- echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -aix) - # The C for AIX Compiler uses -M and outputs the dependencies - # in a .u file. In older versions, this file always lives in the - # current directory. Also, the AIX compiler puts `$object:' at the - # start of each line; $object doesn't have directory information. - # Version 6 uses the directory in both cases. - stripped=`echo "$object" | sed 's/\(.*\)\..*$/\1/'` - tmpdepfile="$stripped.u" - if test "$libtool" = yes; then - "$@" -Wc,-M - else - "$@" -M - fi - stat=$? - - if test -f "$tmpdepfile"; then : - else - stripped=`echo "$stripped" | sed 's,^.*/,,'` - tmpdepfile="$stripped.u" - fi - - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - - if test -f "$tmpdepfile"; then - outname="$stripped.o" - # Each line is of the form `foo.o: dependent.h'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed -e "s,^$outname:,$object :," < "$tmpdepfile" > "$depfile" - sed -e "s,^$outname: \(.*\)$,\1:," < "$tmpdepfile" >> "$depfile" - else - # The sourcefile does not contain any dependencies, so just - # store a dummy comment line, to avoid errors with the Makefile - # "include basename.Plo" scheme. - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -icc) - # Intel's C compiler understands `-MD -MF file'. However on - # icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c - # ICC 7.0 will fill foo.d with something like - # foo.o: sub/foo.c - # foo.o: sub/foo.h - # which is wrong. We want: - # sub/foo.o: sub/foo.c - # sub/foo.o: sub/foo.h - # sub/foo.c: - # sub/foo.h: - # ICC 7.1 will output - # foo.o: sub/foo.c sub/foo.h - # and will wrap long lines using \ : - # foo.o: sub/foo.c ... \ - # sub/foo.h ... \ - # ... - - "$@" -MD -MF "$tmpdepfile" - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile" - exit $stat - fi - rm -f "$depfile" - # Each line is of the form `foo.o: dependent.h', - # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'. - # Do two passes, one to just change these to - # `$object: dependent.h' and one to simply `dependent.h:'. - sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile" - # Some versions of the HPUX 10.20 sed can't process this invocation - # correctly. Breaking it into two sed invocations is a workaround. - sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" | - sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -hp2) - # The "hp" stanza above does not work with aCC (C++) and HP's ia64 - # compilers, which have integrated preprocessors. The correct option - # to use with these is +Maked; it writes dependencies to a file named - # 'foo.d', which lands next to the object file, wherever that - # happens to be. - # Much of this is similar to the tru64 case; see comments there. - dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` - test "x$dir" = "x$object" && dir= - base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` - if test "$libtool" = yes; then - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir.libs/$base.d - "$@" -Wc,+Maked - else - tmpdepfile1=$dir$base.d - tmpdepfile2=$dir$base.d - "$@" +Maked - fi - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile1" "$tmpdepfile2" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile" - # Add `dependent.h:' lines. 
- sed -ne '2,${; s/^ *//; s/ \\*$//; s/$/:/; p;}' "$tmpdepfile" >> "$depfile" - else - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" "$tmpdepfile2" - ;; - -tru64) - # The Tru64 compiler uses -MD to generate dependencies as a side - # effect. `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'. - # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put - # dependencies in `foo.d' instead, so we check for that too. - # Subdirectories are respected. - dir=`echo "$object" | sed -e 's|/[^/]*$|/|'` - test "x$dir" = "x$object" && dir= - base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'` - - if test "$libtool" = yes; then - # With Tru64 cc, shared objects can also be used to make a - # static library. This mechanism is used in libtool 1.4 series to - # handle both shared and static libraries in a single compilation. - # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d. - # - # With libtool 1.5 this exception was removed, and libtool now - # generates 2 separate objects for the 2 libraries. These two - # compilations output dependencies in $dir.libs/$base.o.d and - # in $dir$base.o.d. We have to check for both files, because - # one of the two compilations can be disabled. We should prefer - # $dir$base.o.d over $dir.libs/$base.o.d because the latter is - # automatically cleaned when .libs/ is deleted, while ignoring - # the former would cause a distcleancheck panic. - tmpdepfile1=$dir.libs/$base.lo.d # libtool 1.4 - tmpdepfile2=$dir$base.o.d # libtool 1.5 - tmpdepfile3=$dir.libs/$base.o.d # libtool 1.5 - tmpdepfile4=$dir.libs/$base.d # Compaq CCC V6.2-504 - "$@" -Wc,-MD - else - tmpdepfile1=$dir$base.o.d - tmpdepfile2=$dir$base.d - tmpdepfile3=$dir$base.d - tmpdepfile4=$dir$base.d - "$@" -MD - fi - - stat=$? - if test $stat -eq 0; then : - else - rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" - exit $stat - fi - - for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4" - do - test -f "$tmpdepfile" && break - done - if test -f "$tmpdepfile"; then - sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile" - # That's a tab and a space in the []. - sed -e 's,^.*\.[a-z]*:[ ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile" - else - echo "#dummy" > "$depfile" - fi - rm -f "$tmpdepfile" - ;; - -#nosideeffect) - # This comment above is used by automake to tell side-effect - # dependency tracking mechanisms from slower ones. - -dashmstdout) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test $1 != '--mode=compile'; do - shift - done - shift - fi - - # Remove `-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - test -z "$dashmflag" && dashmflag=-M - # Require at least two characters before searching for `:' - # in the target name. This is to cope with DOS-style filenames: - # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise. - "$@" $dashmflag | - sed 's:^[ ]*[^: ][^:][^:]*\:[ ]*:'"$object"'\: :' > "$tmpdepfile" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - tr ' ' ' -' < "$tmpdepfile" | \ -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. 
- sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -dashXmstdout) - # This case only exists to satisfy depend.m4. It is never actually - # run, as this mode is specially recognized in the preamble. - exit 1 - ;; - -makedepend) - "$@" || exit $? - # Remove any Libtool call - if test "$libtool" = yes; then - while test $1 != '--mode=compile'; do - shift - done - shift - fi - # X makedepend - shift - cleared=no - for arg in "$@"; do - case $cleared in - no) - set ""; shift - cleared=yes ;; - esac - case "$arg" in - -D*|-I*) - set fnord "$@" "$arg"; shift ;; - # Strip any option that makedepend may not understand. Remove - # the object too, otherwise makedepend will parse it as a source file. - -*|$object) - ;; - *) - set fnord "$@" "$arg"; shift ;; - esac - done - obj_suffix="`echo $object | sed 's/^.*\././'`" - touch "$tmpdepfile" - ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@" - rm -f "$depfile" - cat < "$tmpdepfile" > "$depfile" - sed '1,2d' "$tmpdepfile" | tr ' ' ' -' | \ -## Some versions of the HPUX 10.20 sed can't process this invocation -## correctly. Breaking it into two sed invocations is a workaround. - sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" "$tmpdepfile".bak - ;; - -cpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout. - "$@" || exit $? - - # Remove the call to Libtool. - if test "$libtool" = yes; then - while test $1 != '--mode=compile'; do - shift - done - shift - fi - - # Remove `-o $object'. - IFS=" " - for arg - do - case $arg in - -o) - shift - ;; - $object) - shift - ;; - *) - set fnord "$@" "$arg" - shift # fnord - shift # $arg - ;; - esac - done - - "$@" -E | - sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \ - -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' | - sed '$ s: \\$::' > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - cat < "$tmpdepfile" >> "$depfile" - sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -msvisualcpp) - # Important note: in order to support this mode, a compiler *must* - # always write the preprocessed file to stdout, regardless of -o, - # because we must use -o when running libtool. - "$@" || exit $? - IFS=" " - for arg - do - case "$arg" in - "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI") - set fnord "$@" - shift - shift - ;; - *) - set fnord "$@" "$arg" - shift - shift - ;; - esac - done - "$@" -E | - sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::echo "`cygpath -u \\"\1\\"`":p' | sort | uniq > "$tmpdepfile" - rm -f "$depfile" - echo "$object : \\" > "$depfile" - . "$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s:: \1 \\:p' >> "$depfile" - echo " " >> "$depfile" - . 
"$tmpdepfile" | sed 's% %\\ %g' | sed -n '/^\(.*\)$/ s::\1\::p' >> "$depfile" - rm -f "$tmpdepfile" - ;; - -none) - exec "$@" - ;; - -*) - echo "Unknown depmode $depmode" 1>&2 - exit 1 - ;; -esac - -exit 0 - -# Local Variables: -# mode: shell-script -# sh-indentation: 2 -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-end: "$" -# End: diff --git a/kokkos/basic/optional/ThreadPool/config/generate-makeoptions.pl b/kokkos/basic/optional/ThreadPool/config/generate-makeoptions.pl deleted file mode 100755 index a39223e..0000000 --- a/kokkos/basic/optional/ThreadPool/config/generate-makeoptions.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w -# -# This perl script graps a bunch of make macro definitions -# generated for Teuchos that can be used in other makefiles. -# This is dumped to stdout and can be redirected to build -# a makefile. -# -# Note, this script must be maintained to be current for -# the Teuchos makefile. -# -use strict; - -if( !(defined(@ARGV) && scalar(@ARGV)==2) ) { - die "Error, this script takes two and only two arguments (makefile_name package_name).!\n"; -} - -my $makefile_name = shift; -my $package_name = shift; - -# -# List the macros you want to grep and include in the output -# -my @macros = - ( - "CC" - ,"CXX" - ,"F77" - ,"CXXLD" - ,"DEFS" - ,"CPPFLAGS" - ,"CFLAGS" - ,"CXXFLAGS" - ,"FFLAGS" - ,"LDFLAGS" - ,"FLIBS" - ,"BLAS_LIBS" - ,"LAPACK_LIBS" - ,"prefix" - ,"AR" - ,"ALTERNATE_AR" - ,"libteuchos_a_AR" - ,"RANLIB" - ); - -open FILE_IN, "<$makefile_name" || die "The file $makefile_name could not be opended for input\n"; -my @makefile_name_array = ; -close FILE_IN; - -# -# Find the above macros and append "${package_name}_" to the beginning. -# -my @new_macros; -my $add_next_line = 0; -foreach( @makefile_name_array ) { - my $line = $_; - if($add_next_line) { - push @new_macros, $line; - if( substr($line,-1,1) eq "\\" ) { - $add_next_line = 1; - } - else { - $add_next_line = 0; - } - next; - } - #print "Line = $line"; - foreach( @macros ) { - my $macro_search = "^${_} "; - #print "Macro search = \'$macro_search\'\n"; - if( $line=~/$macro_search/ ) { - #print "Adding Macro!\n"; - my $find_str = '\(CXX\)'; - my $replace_str = "(${package_name}_CXX)"; - $line=~s/$find_str/$replace_str/; - push @new_macros, "${package_name}_${line}"; - if( substr($line,-2,1) eq "\\" ) { - $add_next_line = 1; - } - else { - $add_next_line = 0; - } - } - } -} - -print join("",@new_macros); diff --git a/kokkos/basic/optional/ThreadPool/config/install-sh b/kokkos/basic/optional/ThreadPool/config/install-sh deleted file mode 100755 index 4fbbae7..0000000 --- a/kokkos/basic/optional/ThreadPool/config/install-sh +++ /dev/null @@ -1,507 +0,0 @@ -#!/bin/sh -# install - install a program, script, or datafile - -scriptversion=2006-10-14.15 - -# This originates from X11R5 (mit/util/scripts/install.sh), which was -# later released in X11R6 (xc/config/util/install.sh) with the -# following copyright and license. 
-# -# Copyright (C) 1994 X Consortium -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN -# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC- -# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# Except as contained in this notice, the name of the X Consortium shall not -# be used in advertising or otherwise to promote the sale, use or other deal- -# ings in this Software without prior written authorization from the X Consor- -# tium. -# -# -# FSF changes to this file are in the public domain. -# -# Calling this script install-sh is preferred over install.sh, to prevent -# `make' implicit rules from creating a file called install from it -# when there is no Makefile. -# -# This script is compatible with the BSD install script, but was written -# from scratch. - -nl=' -' -IFS=" "" $nl" - -# set DOITPROG to echo to test this script - -# Don't use :- since 4.3BSD and earlier shells don't like it. -doit="${DOITPROG-}" -if test -z "$doit"; then - doit_exec=exec -else - doit_exec=$doit -fi - -# Put in absolute file names if you don't have them in your path; -# or use environment vars. - -mvprog="${MVPROG-mv}" -cpprog="${CPPROG-cp}" -chmodprog="${CHMODPROG-chmod}" -chownprog="${CHOWNPROG-chown}" -chgrpprog="${CHGRPPROG-chgrp}" -stripprog="${STRIPPROG-strip}" -rmprog="${RMPROG-rm}" -mkdirprog="${MKDIRPROG-mkdir}" - -posix_glob= -posix_mkdir= - -# Desired mode of installed file. -mode=0755 - -chmodcmd=$chmodprog -chowncmd= -chgrpcmd= -stripcmd= -rmcmd="$rmprog -f" -mvcmd="$mvprog" -src= -dst= -dir_arg= -dstarg= -no_target_directory= - -usage="Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE - or: $0 [OPTION]... SRCFILES... DIRECTORY - or: $0 [OPTION]... -t DIRECTORY SRCFILES... - or: $0 [OPTION]... -d DIRECTORIES... - -In the 1st form, copy SRCFILE to DSTFILE. -In the 2nd and 3rd, copy all SRCFILES to DIRECTORY. -In the 4th, create DIRECTORIES. - -Options: --c (ignored) --d create directories instead of installing files. --g GROUP $chgrpprog installed files to GROUP. --m MODE $chmodprog installed files to MODE. --o USER $chownprog installed files to USER. --s $stripprog installed files. --t DIRECTORY install into DIRECTORY. --T report an error if DSTFILE is a directory. ---help display this help and exit. ---version display version info and exit. 
- -Environment variables override the default commands: - CHGRPPROG CHMODPROG CHOWNPROG CPPROG MKDIRPROG MVPROG RMPROG STRIPPROG -" - -while test $# -ne 0; do - case $1 in - -c) shift - continue;; - - -d) dir_arg=true - shift - continue;; - - -g) chgrpcmd="$chgrpprog $2" - shift - shift - continue;; - - --help) echo "$usage"; exit $?;; - - -m) mode=$2 - shift - shift - case $mode in - *' '* | *' '* | *' -'* | *'*'* | *'?'* | *'['*) - echo "$0: invalid mode: $mode" >&2 - exit 1;; - esac - continue;; - - -o) chowncmd="$chownprog $2" - shift - shift - continue;; - - -s) stripcmd=$stripprog - shift - continue;; - - -t) dstarg=$2 - shift - shift - continue;; - - -T) no_target_directory=true - shift - continue;; - - --version) echo "$0 $scriptversion"; exit $?;; - - --) shift - break;; - - -*) echo "$0: invalid option: $1" >&2 - exit 1;; - - *) break;; - esac -done - -if test $# -ne 0 && test -z "$dir_arg$dstarg"; then - # When -d is used, all remaining arguments are directories to create. - # When -t is used, the destination is already specified. - # Otherwise, the last argument is the destination. Remove it from $@. - for arg - do - if test -n "$dstarg"; then - # $@ is not empty: it contains at least $arg. - set fnord "$@" "$dstarg" - shift # fnord - fi - shift # arg - dstarg=$arg - done -fi - -if test $# -eq 0; then - if test -z "$dir_arg"; then - echo "$0: no input file specified." >&2 - exit 1 - fi - # It's OK to call `install-sh -d' without argument. - # This can happen when creating conditional directories. - exit 0 -fi - -if test -z "$dir_arg"; then - trap '(exit $?); exit' 1 2 13 15 - - # Set umask so as not to create temps with too-generous modes. - # However, 'strip' requires both read and write access to temps. - case $mode in - # Optimize common cases. - *644) cp_umask=133;; - *755) cp_umask=22;; - - *[0-7]) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw='% 200' - fi - cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;; - *) - if test -z "$stripcmd"; then - u_plus_rw= - else - u_plus_rw=,u+rw - fi - cp_umask=$mode$u_plus_rw;; - esac -fi - -for src -do - # Protect names starting with `-'. - case $src in - -*) src=./$src ;; - esac - - if test -n "$dir_arg"; then - dst=$src - dstdir=$dst - test -d "$dstdir" - dstdir_status=$? - else - - # Waiting for this to be detected by the "$cpprog $src $dsttmp" command - # might cause directories to be created, which would be especially bad - # if $src (and thus $dsttmp) contains '*'. - if test ! -f "$src" && test ! -d "$src"; then - echo "$0: $src does not exist." >&2 - exit 1 - fi - - if test -z "$dstarg"; then - echo "$0: no destination specified." >&2 - exit 1 - fi - - dst=$dstarg - # Protect names starting with `-'. - case $dst in - -*) dst=./$dst ;; - esac - - # If destination is a directory, append the input filename; won't work - # if double slashes aren't ignored. - if test -d "$dst"; then - if test -n "$no_target_directory"; then - echo "$0: $dstarg: Is a directory" >&2 - exit 1 - fi - dstdir=$dst - dst=$dstdir/`basename "$src"` - dstdir_status=0 - else - # Prefer dirname, but fall back on a substitute if dirname fails. - dstdir=` - (dirname "$dst") 2>/dev/null || - expr X"$dst" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$dst" : 'X\(//\)[^/]' \| \ - X"$dst" : 'X\(//\)$' \| \ - X"$dst" : 'X\(/\)' \| . 
2>/dev/null || - echo X"$dst" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q' - ` - - test -d "$dstdir" - dstdir_status=$? - fi - fi - - obsolete_mkdir_used=false - - if test $dstdir_status != 0; then - case $posix_mkdir in - '') - # Create intermediate dirs using mode 755 as modified by the umask. - # This is like FreeBSD 'install' as of 1997-10-28. - umask=`umask` - case $stripcmd.$umask in - # Optimize common cases. - *[2367][2367]) mkdir_umask=$umask;; - .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;; - - *[0-7]) - mkdir_umask=`expr $umask + 22 \ - - $umask % 100 % 40 + $umask % 20 \ - - $umask % 10 % 4 + $umask % 2 - `;; - *) mkdir_umask=$umask,go-w;; - esac - - # With -d, create the new directory with the user-specified mode. - # Otherwise, rely on $mkdir_umask. - if test -n "$dir_arg"; then - mkdir_mode=-m$mode - else - mkdir_mode= - fi - - posix_mkdir=false - case $umask in - *[123567][0-7][0-7]) - # POSIX mkdir -p sets u+wx bits regardless of umask, which - # is incompatible with FreeBSD 'install' when (umask & 300) != 0. - ;; - *) - tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$ - trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0 - - if (umask $mkdir_umask && - exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1 - then - if test -z "$dir_arg" || { - # Check for POSIX incompatibilities with -m. - # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or - # other-writeable bit of parent directory when it shouldn't. - # FreeBSD 6.1 mkdir -m -p sets mode of existing directory. - ls_ld_tmpdir=`ls -ld "$tmpdir"` - case $ls_ld_tmpdir in - d????-?r-*) different_mode=700;; - d????-?--*) different_mode=755;; - *) false;; - esac && - $mkdirprog -m$different_mode -p -- "$tmpdir" && { - ls_ld_tmpdir_1=`ls -ld "$tmpdir"` - test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1" - } - } - then posix_mkdir=: - fi - rmdir "$tmpdir/d" "$tmpdir" - else - # Remove any dirs left behind by ancient mkdir implementations. - rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null - fi - trap '' 0;; - esac;; - esac - - if - $posix_mkdir && ( - umask $mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir" - ) - then : - else - - # The umask is ridiculous, or mkdir does not conform to POSIX, - # or it failed possibly due to a race condition. Create the - # directory the slow way, step by step, checking for races as we go. - - case $dstdir in - /*) prefix=/ ;; - -*) prefix=./ ;; - *) prefix= ;; - esac - - case $posix_glob in - '') - if (set -f) 2>/dev/null; then - posix_glob=true - else - posix_glob=false - fi ;; - esac - - oIFS=$IFS - IFS=/ - $posix_glob && set -f - set fnord $dstdir - shift - $posix_glob && set +f - IFS=$oIFS - - prefixes= - - for d - do - test -z "$d" && continue - - prefix=$prefix$d - if test -d "$prefix"; then - prefixes= - else - if $posix_mkdir; then - (umask=$mkdir_umask && - $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break - # Don't fail if two instances are running concurrently. - test -d "$prefix" || exit 1 - else - case $prefix in - *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;; - *) qprefix=$prefix;; - esac - prefixes="$prefixes '$qprefix'" - fi - fi - prefix=$prefix/ - done - - if test -n "$prefixes"; then - # Don't fail if two instances are running concurrently. 
- (umask $mkdir_umask && - eval "\$doit_exec \$mkdirprog $prefixes") || - test -d "$dstdir" || exit 1 - obsolete_mkdir_used=true - fi - fi - fi - - if test -n "$dir_arg"; then - { test -z "$chowncmd" || $doit $chowncmd "$dst"; } && - { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } && - { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false || - test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1 - else - - # Make a couple of temp file names in the proper directory. - dsttmp=$dstdir/_inst.$$_ - rmtmp=$dstdir/_rm.$$_ - - # Trap to clean up those temp files at exit. - trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0 - - # Copy the file name to the temp name. - (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") && - - # and set any options; do chmod last to preserve setuid bits. - # - # If any of these fail, we abort the whole thing. If we want to - # ignore errors from any of these, just make sure not to ignore - # errors from the above "$doit $cpprog $src $dsttmp" command. - # - { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } \ - && { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } \ - && { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } \ - && { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } && - - # Now rename the file to the real destination. - { $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null \ - || { - # The rename failed, perhaps because mv can't rename something else - # to itself, or perhaps because mv is so ancient that it does not - # support -f. - - # Now remove or move aside any old file at destination location. - # We try this two ways since rm can't unlink itself on some - # systems and the destination file might be busy for other - # reasons. In this case, the final cleanup might fail but the new - # file should still install successfully. - { - if test -f "$dst"; then - $doit $rmcmd -f "$dst" 2>/dev/null \ - || { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null \ - && { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }; }\ - || { - echo "$0: cannot unlink or rename $dst" >&2 - (exit 1); exit 1 - } - else - : - fi - } && - - # Now rename the file to the real destination. - $doit $mvcmd "$dsttmp" "$dst" - } - } || exit 1 - - trap '' 0 - fi -done - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-end: "$" -# End: diff --git a/kokkos/basic/optional/ThreadPool/config/missing b/kokkos/basic/optional/ThreadPool/config/missing deleted file mode 100755 index 1c8ff70..0000000 --- a/kokkos/basic/optional/ThreadPool/config/missing +++ /dev/null @@ -1,367 +0,0 @@ -#! /bin/sh -# Common stub for a few missing GNU programs while installing. - -scriptversion=2006-05-10.23 - -# Copyright (C) 1996, 1997, 1999, 2000, 2002, 2003, 2004, 2005, 2006 -# Free Software Foundation, Inc. -# Originally by Fran,cois Pinard , 1996. - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2, or (at your option) -# any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. 
- -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -# 02110-1301, USA. - -# As a special exception to the GNU General Public License, if you -# distribute this file as part of a program that contains a -# configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - -if test $# -eq 0; then - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 -fi - -run=: -sed_output='s/.* --output[ =]\([^ ]*\).*/\1/p' -sed_minuso='s/.* -o \([^ ]*\).*/\1/p' - -# In the cases where this matters, `missing' is being run in the -# srcdir already. -if test -f configure.ac; then - configure_ac=configure.ac -else - configure_ac=configure.in -fi - -msg="missing on your system" - -case $1 in ---run) - # Try to run requested program, and just exit if it succeeds. - run= - shift - "$@" && exit 0 - # Exit code 63 means version mismatch. This often happens - # when the user try to use an ancient version of a tool on - # a file that requires a minimum version. In this case we - # we should proceed has if the program had been absent, or - # if --run hadn't been passed. - if test $? = 63; then - run=: - msg="probably too old" - fi - ;; - - -h|--h|--he|--hel|--help) - echo "\ -$0 [OPTION]... PROGRAM [ARGUMENT]... - -Handle \`PROGRAM [ARGUMENT]...' for when PROGRAM is missing, or return an -error status if there is no known handling for PROGRAM. - -Options: - -h, --help display this help and exit - -v, --version output version information and exit - --run try to run the given command, and emulate it if it fails - -Supported PROGRAM values: - aclocal touch file \`aclocal.m4' - autoconf touch file \`configure' - autoheader touch file \`config.h.in' - autom4te touch the output file, or create a stub one - automake touch all \`Makefile.in' files - bison create \`y.tab.[ch]', if possible, from existing .[ch] - flex create \`lex.yy.c', if possible, from existing .c - help2man touch the output file - lex create \`lex.yy.c', if possible, from existing .c - makeinfo touch the output file - tar try tar, gnutar, gtar, then tar without non-portable flags - yacc create \`y.tab.[ch]', if possible, from existing .[ch] - -Send bug reports to ." - exit $? - ;; - - -v|--v|--ve|--ver|--vers|--versi|--versio|--version) - echo "missing $scriptversion (GNU Automake)" - exit $? - ;; - - -*) - echo 1>&2 "$0: Unknown \`$1' option" - echo 1>&2 "Try \`$0 --help' for more information" - exit 1 - ;; - -esac - -# Now exit if we have it, but it failed. Also exit now if we -# don't have it and --version was passed (most likely to detect -# the program). -case $1 in - lex|yacc) - # Not GNU programs, they don't have --version. - ;; - - tar) - if test -n "$run"; then - echo 1>&2 "ERROR: \`tar' requires --run" - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - exit 1 - fi - ;; - - *) - if test -z "$run" && ($1 --version) > /dev/null 2>&1; then - # We have it, but it failed. - exit 1 - elif test "x$2" = "x--version" || test "x$2" = "x--help"; then - # Could not run --version or --help. This is probably someone - # running `$TOOL --version' or `$TOOL --help' to check whether - # $TOOL exists and not knowing $TOOL uses missing. - exit 1 - fi - ;; -esac - -# If it does not exist, or fails to run (possibly an outdated version), -# try to emulate it. 
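## Illustrative sketch, not part of the original `missing' script: automake
## typically wires it into generated Makefiles roughly as below (the paths and
## the aclocal version suffix are hypothetical):
##
##   ACLOCAL = ${SHELL} /path/to/config/missing --run aclocal-1.10
##   AUTOCONF = ${SHELL} /path/to/config/missing --run autoconf
##
## so a failed or absent maintainer tool falls through to the emulation cases below.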
-case $1 in - aclocal*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acinclude.m4' or \`${configure_ac}'. You might want - to install the \`Automake' and \`Perl' packages. Grab them from - any GNU archive site." - touch aclocal.m4 - ;; - - autoconf) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`${configure_ac}'. You might want to install the - \`Autoconf' and \`GNU m4' packages. Grab them from any GNU - archive site." - touch configure - ;; - - autoheader) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`acconfig.h' or \`${configure_ac}'. You might want - to install the \`Autoconf' and \`GNU m4' packages. Grab them - from any GNU archive site." - files=`sed -n 's/^[ ]*A[CM]_CONFIG_HEADER(\([^)]*\)).*/\1/p' ${configure_ac}` - test -z "$files" && files="config.h" - touch_files= - for f in $files; do - case $f in - *:*) touch_files="$touch_files "`echo "$f" | - sed -e 's/^[^:]*://' -e 's/:.*//'`;; - *) touch_files="$touch_files $f.in";; - esac - done - touch $touch_files - ;; - - automake*) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified \`Makefile.am', \`acinclude.m4' or \`${configure_ac}'. - You might want to install the \`Automake' and \`Perl' packages. - Grab them from any GNU archive site." - find . -type f -name Makefile.am -print | - sed 's/\.am$/.in/' | - while read f; do touch "$f"; done - ;; - - autom4te) - echo 1>&2 "\ -WARNING: \`$1' is needed, but is $msg. - You might have modified some files without having the - proper tools for further handling them. - You can get \`$1' as part of \`Autoconf' from any GNU - archive site." - - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo "#! /bin/sh" - echo "# Created by GNU Automake missing as a replacement of" - echo "# $ $@" - echo "exit 0" - chmod +x $file - exit 1 - fi - ;; - - bison|yacc) - echo 1>&2 "\ -WARNING: \`$1' $msg. You should only need it if - you modified a \`.y' file. You may need the \`Bison' package - in order for those modifications to take effect. You can get - \`Bison' from any GNU archive site." - rm -f y.tab.c y.tab.h - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.y) - SRCFILE=`echo "$LASTARG" | sed 's/y$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.c - fi - SRCFILE=`echo "$LASTARG" | sed 's/y$/h/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" y.tab.h - fi - ;; - esac - fi - if test ! -f y.tab.h; then - echo >y.tab.h - fi - if test ! -f y.tab.c; then - echo 'main() { return 0; }' >y.tab.c - fi - ;; - - lex|flex) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.l' file. You may need the \`Flex' package - in order for those modifications to take effect. You can get - \`Flex' from any GNU archive site." - rm -f lex.yy.c - if test $# -ne 1; then - eval LASTARG="\${$#}" - case $LASTARG in - *.l) - SRCFILE=`echo "$LASTARG" | sed 's/l$/c/'` - if test -f "$SRCFILE"; then - cp "$SRCFILE" lex.yy.c - fi - ;; - esac - fi - if test ! -f lex.yy.c; then - echo 'main() { return 0; }' >lex.yy.c - fi - ;; - - help2man) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a dependency of a manual page. You may need the - \`Help2man' package in order for those modifications to take - effect. You can get \`Help2man' from any GNU archive site." 
- - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -f "$file"; then - touch $file - else - test -z "$file" || exec >$file - echo ".ab help2man is required to generate this page" - exit 1 - fi - ;; - - makeinfo) - echo 1>&2 "\ -WARNING: \`$1' is $msg. You should only need it if - you modified a \`.texi' or \`.texinfo' file, or any other file - indirectly affecting the aspect of the manual. The spurious - call might also be the consequence of using a buggy \`make' (AIX, - DU, IRIX). You might want to install the \`Texinfo' package or - the \`GNU make' package. Grab either from any GNU archive site." - # The file to touch is that specified with -o ... - file=`echo "$*" | sed -n "$sed_output"` - test -z "$file" && file=`echo "$*" | sed -n "$sed_minuso"` - if test -z "$file"; then - # ... or it is the one specified with @setfilename ... - infile=`echo "$*" | sed 's/.* \([^ ]*\) *$/\1/'` - file=`sed -n ' - /^@setfilename/{ - s/.* \([^ ]*\) *$/\1/ - p - q - }' $infile` - # ... or it is derived from the source name (dir/f.texi becomes f.info) - test -z "$file" && file=`echo "$infile" | sed 's,.*/,,;s,.[^.]*$,,'`.info - fi - # If the file does not exist, the user really needs makeinfo; - # let's fail without touching anything. - test -f $file || exit 1 - touch $file - ;; - - tar) - shift - - # We have already tried tar in the generic part. - # Look for gnutar/gtar before invocation to avoid ugly error - # messages. - if (gnutar --version > /dev/null 2>&1); then - gnutar "$@" && exit 0 - fi - if (gtar --version > /dev/null 2>&1); then - gtar "$@" && exit 0 - fi - firstarg="$1" - if shift; then - case $firstarg in - *o*) - firstarg=`echo "$firstarg" | sed s/o//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - case $firstarg in - *h*) - firstarg=`echo "$firstarg" | sed s/h//` - tar "$firstarg" "$@" && exit 0 - ;; - esac - fi - - echo 1>&2 "\ -WARNING: I can't seem to be able to run \`tar' with the given arguments. - You may want to install GNU tar or Free paxutils, or check the - command line arguments." - exit 1 - ;; - - *) - echo 1>&2 "\ -WARNING: \`$1' is needed, and is $msg. - You might have modified some files without having the - proper tools for further handling them. Check the \`README' file, - it often tells you about the needed prerequisites for installing - this package. You may also peek at any GNU archive site, in case - some other package would contain this missing \`$1' program." - exit 1 - ;; -esac - -exit 0 - -# Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) -# time-stamp-start: "scriptversion=" -# time-stamp-format: "%:y-%02m-%02d.%02H" -# time-stamp-end: "$" -# End: diff --git a/kokkos/basic/optional/ThreadPool/config/replace-install-prefix.pl b/kokkos/basic/optional/ThreadPool/config/replace-install-prefix.pl deleted file mode 100755 index 7523b08..0000000 --- a/kokkos/basic/optional/ThreadPool/config/replace-install-prefix.pl +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Getopt::Long; -# -# This script is called to do a set of text replacements for installing -# a Mafile.export.package file so that external clients can use it. -# -# Read in commandline arguments -# -my $exec_prefix = ""; # [required] Abs path to base installation directory (i.e. --prefix=??? 
option passed to configure) -my $my_export_makefile = ""; # [required] Name only of installed Makefile.export.package file -my $my_top_srcdir = ""; # [required] Abs path to this package's top source directory -my $my_incl_dirs = ""; # [required] Abs path to this package's include directories -my $my_lib_dirs = ""; # [optional] Abs path to this package's library directories (if any exist) -my $dep_package_builddirs = ""; # [optional] Abs paths to other directly dependent framework package build directories (if any exist) -GetOptions( - "exec-prefix=s" => \$exec_prefix, - "my-export-makefile=s" => \$my_export_makefile, - "my-abs-top-srcdir=s" => \$my_top_srcdir, - "my-abs-incl-dirs=s" => \$my_incl_dirs, - "my-abs-lib-dirs=s" => \$my_lib_dirs, - "dep-package-abs-builddirs=s" => \$dep_package_builddirs - ); -# -# Validate commandline arguments -# -scalar(@ARGV) == 0 || die; -$exec_prefix ne "" || die; -$my_export_makefile ne "" || die; -$my_top_srcdir ne "" || die; -$my_incl_dirs ne "" || die; -# -# Interpret commandline arguments -# -$exec_prefix = remove_rel_paths($exec_prefix); -my @my_incl_dirs = split(":",$my_incl_dirs); -my @my_lib_dirs = split(":",$my_lib_dirs); -my @dep_export_package_builddirs = split(":",$dep_package_builddirs); -# -# Do the replacements -# -my $my_abs_export_makefile = "${exec_prefix}/include/${my_export_makefile}"; - -my $cmnd_base = "${my_top_srcdir}/config/token-replace.pl "; -# -foreach(@dep_export_package_builddirs) { - if($_ ne "") { - run_cmnd($cmnd_base . "${_} ${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}"); - } -} -# -foreach(@my_incl_dirs) { - if($_ ne "") { - run_cmnd($cmnd_base . "-I${_} -I${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}"); - } -} -# -foreach(@my_lib_dirs) { - if($_ ne "") { - run_cmnd($cmnd_base . "-L${_} -L${exec_prefix}/lib ${my_abs_export_makefile} ${my_abs_export_makefile}"); - } -} -# -run_cmnd($cmnd_base . "${my_top_srcdir}/config ${exec_prefix}/include ${my_abs_export_makefile} ${my_abs_export_makefile}"); -# -# Subroutines -# -sub remove_rel_paths { - my $entry_in = shift; - if ($entry_in=~/-L\.\./) { - return $entry_in; - } - my @paths = split("/",$entry_in); - my @new_paths; - foreach( @paths ) { - if( !($_=~/\.\./) ) { - push @new_paths, $_; - } - else { - pop @new_paths - } - } - return join("/",@new_paths); -} -sub run_cmnd { - my $cmnd = shift; - #print "\n", $cmnd, "\n"; - system($cmnd)==0 || die; -} diff --git a/kokkos/basic/optional/ThreadPool/config/string-replace.pl b/kokkos/basic/optional/ThreadPool/config/string-replace.pl deleted file mode 100755 index adeb1f4..0000000 --- a/kokkos/basic/optional/ThreadPool/config/string-replace.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl -w -# -# This perl script replaces a string with another string. -# Here it is allowd for file_in and file_out to be the -# same file. 
-# -use strict; -# -my $g_use_msg = - "Use: string-replace.pl find_string replacement_string file_in file_out\n"; -if( scalar(@ARGV) < 4 ) { - print STDERR $g_use_msg; - exit(-1); -} -# -my $find_string = shift; -my $replacement_string = shift; -my $file_in_name = shift; -my $file_out_name = shift; -# -# -if($file_in_name=~/CVS/) { -# print "Do not replace in CVS\n"; - exit; -} -# -open FILE_IN, "<$file_in_name" || die "The file $file_in_name could not be opended for input\n"; -my @file_in_array = ; -close FILE_IN; -# -my @file_out_array; -my $did_replacement = 0; -foreach(@file_in_array) { - #print $_; - $did_replacement = 1 if $_=~s/$find_string/$replacement_string/g; - #print $_; - push @file_out_array, $_; -} -if($did_replacement || $file_out_name ne $file_in_name) { - open FILE_OUT, ">$file_out_name" || die "The file $file_out_name could not be opended for output\n"; - print FILE_OUT @file_out_array; - close FILE_OUT; -} diff --git a/kokkos/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl b/kokkos/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl deleted file mode 100755 index c628d31..0000000 --- a/kokkos/basic/optional/ThreadPool/config/strip_dup_incl_paths.pl +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/perl -w -# This perl script removes duplicate include paths left to the right -use strict; -my @all_incl_paths = @ARGV; -my @cleaned_up_incl_paths; -foreach( @all_incl_paths ) { - $_ = remove_rel_paths($_); - if( !($_=~/-I/) ) { - push @cleaned_up_incl_paths, $_; - } - elsif( !entry_exists($_,\@cleaned_up_incl_paths) ) { - push @cleaned_up_incl_paths, $_; - } -} -print join( " ", @cleaned_up_incl_paths ); -# -# Subroutines -# -sub entry_exists { - my $entry = shift; # String - my $list = shift; # Reference to an array - foreach( @$list ) { - if( $entry eq $_ ) { return 1; } - } - return 0; -} -# -sub remove_rel_paths { - my $entry_in = shift; - if ($entry_in=~/-I\.\./) { - return $entry_in; - } - my @paths = split("/",$entry_in); - my @new_paths; - foreach( @paths ) { - if( !($_=~/\.\./) ) { - push @new_paths, $_; - } - else { - pop @new_paths - } - } - return join("/",@new_paths); -} diff --git a/kokkos/basic/optional/ThreadPool/config/strip_dup_libs.pl b/kokkos/basic/optional/ThreadPool/config/strip_dup_libs.pl deleted file mode 100755 index cdf4b42..0000000 --- a/kokkos/basic/optional/ThreadPool/config/strip_dup_libs.pl +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/perl -w -# This perl script removes duplicate libraries from the right to the left and -# removes duplicate -L library paths from the left to the right -use strict; - -my @all_libs = @ARGV; -# -# Move from left to right and remove duplicate -l libraries -# -my @cleaned_up_libs_first; -foreach( reverse @all_libs ) { - $_ = remove_rel_paths($_); - if( $_=~/-L/ ) { - unshift @cleaned_up_libs_first, $_; - } - else { - if( !entry_exists($_,\@cleaned_up_libs_first) ) { - unshift @cleaned_up_libs_first, $_; - } - } -} - -# -# Move from right to left and remove duplicate -L library paths -# -my @cleaned_up_libs; -foreach( @cleaned_up_libs_first ) { - $_ = remove_rel_paths($_); - if( !($_=~/-L/) ) { - push @cleaned_up_libs, $_; - } - elsif( !entry_exists($_,\@cleaned_up_libs) ) { - push @cleaned_up_libs, $_; - } -} -# -# Print the new list of libraries and paths -# -print join( " ", @cleaned_up_libs ); - -# -# Subroutines -# -sub entry_exists { - my $entry = shift; # String - my $list = shift; # Reference to an array - foreach( @$list ) { - if( $entry eq $_ ) { return 1; } - } - return 0; -} -# -sub remove_rel_paths { 
- my $entry_in = shift; - if ($entry_in=~/-L\.\./) { - return $entry_in; - } - my @paths = split("/",$entry_in); - my @new_paths; - foreach( @paths ) { - if( !($_=~/\.\./) ) { - push @new_paths, $_; - } - else { - pop @new_paths - } - } - return join("/",@new_paths); -} diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4 deleted file mode 100644 index 10d569a..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_check_mpi.m4 +++ /dev/null @@ -1,68 +0,0 @@ -dnl @synopsis TAC_ARG_CHECK_MPI -dnl -dnl Check to make sure any definitions set in TAC_ARG_CONFIG_MPI -dnl are valid, set the MPI flags. Test MPI compile using C++ compiler. -dnl -dnl @author Mike Heroux -dnl -AC_DEFUN([TAC_ARG_CHECK_MPI], -[ - -if test "X${HAVE_PKG_MPI}" = "Xyes"; then - - if test -n "${MPI_DIR}" && test -z "${MPI_INC}"; then - MPI_INC="${MPI_DIR}/include" - fi - - if test -n "${MPI_INC}"; then - CPPFLAGS="${CPPFLAGS} -I${MPI_INC}" - fi - - AC_LANG_CPLUSPLUS - AC_MSG_CHECKING(for mpi.h) - AC_TRY_CPP([#include "mpi.h"], - [AC_MSG_RESULT(yes)], - [ - AC_MSG_RESULT(no) - echo "-----" - echo "Cannot link simple MPI program." - echo "Try --with-mpi-compilers to specify MPI compilers." - echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir" - echo "to specify all the specific MPI compile options." - echo "-----" - AC_MSG_ERROR(MPI cannot link) - ]) - - if test -n "${MPI_DIR}" && test -z "${MPI_LIBDIR}"; then - MPI_LIBDIR="${MPI_DIR}/lib" - fi - - if test -n "${MPI_LIBDIR}"; then - LDFLAGS="${LDFLAGS} -L${MPI_LIBDIR}" - fi - - if test -z "${MPI_LIBS}" && test -n "${MPI_LIBDIR}"; then - MPI_LIBS="-lmpi" - fi - - if test -n "${MPI_LIBS}"; then - LIBS="${MPI_LIBS} ${LIBS}" - fi - -# AC_LANG_CPLUSPLUS -# AC_MSG_CHECKING(whether MPI will link using C++ compiler) -# AC_TRY_LINK([#include ], -# [int c; char** v; MPI_Init(&c,&v);], -# [AC_MSG_RESULT(yes)], -# [AC_MSG_RESULT(no) -# echo "-----" -# echo "Cannot link simple MPI program." -# echo "Try --with-mpi-cxx to specify MPI C++ compile script." -# echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir" -# echo "to specify all the specific MPI compile options." -# echo "-----" -# AC_MSG_ERROR(MPI cannot link)] -# ) - -fi -]) diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4 deleted file mode 100644 index 2d1dd98..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_config_mpi.m4 +++ /dev/null @@ -1,188 +0,0 @@ -dnl @synopsis TAC_ARG_CONFIG_MPI -dnl -dnl Test a variety of MPI options: -dnl --enable-mpi - Turns MPI compiling mode on -dnl --with-mpi - specify root directory of MPI -dnl --with-mpi-compilers - Turns on MPI compiling mode and sets the MPI C++ -dnl compiler = mpicxx, mpic++ or mpiCC, -dnl the MPI C compiler = mpicc and -dnl the MPI Fortran compiler = mpif77 -dnl --with-mpi-incdir - specify include directory for MPI -dnl --with-mpi-libs - specify MPI libraries -dnl --with-mpi-libdir - specify location of MPI libraries -dnl -dnl If any of these options are set, HAVE_MPI will be defined for both -dnl Autoconf and Automake, and HAVE_MPI will be defined in the -dnl generated config.h file -dnl -dnl -dnl @author Mike Heroux -dnl Modified 12/26/2007 by Jim Willenbring to skip the Fortran compiler -dnl check if Fortran is not enabled. 
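dnl Illustrative sketch, not part of the original macro: typical configure
dnl invocations that the options documented above are meant to support
dnl (installation paths are hypothetical):
dnl
dnl   ./configure --with-mpi-compilers=/opt/mpi/bin
dnl   ./configure --enable-mpi --with-mpi-incdir=/opt/mpi/include \
dnl               --with-mpi-libdir=/opt/mpi/lib --with-mpi-libs="-lmpi"
dnl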
-dnl -AC_DEFUN([TAC_ARG_CONFIG_MPI], -[ - -AC_ARG_ENABLE(mpi, -[AC_HELP_STRING([--enable-mpi],[MPI support])], -[HAVE_PKG_MPI=$enableval], -[HAVE_PKG_MPI=no] -) - -AC_ARG_WITH(mpi-compilers, -[AC_HELP_STRING([--with-mpi-compilers=PATH], -[use MPI compilers mpicc, mpif77, and mpicxx, mpic++ or mpiCC in the specified path or in the default path if no path is specified. Enables MPI])], -[ - if test X${withval} != Xno; then - HAVE_PKG_MPI=yes - if test X${withval} = Xyes; then - # Check for mpicxx, if it does not exist, check for mpic++, if it does - # not exist, use mpiCC instead. - AC_CHECK_PROG(MPI_TEMP_CXX, mpicxx, mpicxx, no) - if test X${MPI_TEMP_CXX} = Xno; then - AC_CHECK_PROG(MPI_CXX, mpic++, mpic++, mpiCC) - else - MPI_CXX=${MPI_TEMP_CXX} - fi - MPI_CC=mpicc - MPI_F77=mpif77 - else - if test -f ${withval}/mpicxx; then - MPI_CXX=${withval}/mpicxx - elif test -f ${withval}/mpic++; then - MPI_CXX=${withval}/mpic++ - else - MPI_CXX=${withval}/mpiCC - fi - MPI_CC=${withval}/mpicc - MPI_F77=${withval}/mpif77 - fi - fi -] -) - -AC_ARG_WITH(mpi, -[AC_HELP_STRING([--with-mpi=MPIROOT],[use MPI root directory (enables MPI)])], -[ - HAVE_PKG_MPI=yes - MPI_DIR=${withval} - AC_MSG_CHECKING(MPI directory) - AC_MSG_RESULT([${MPI_DIR}]) -] -) - -#AC_ARG_WITH(mpi-include, -#[AC_HELP_STRING([--with-mpi-include],[Obsolete. Use --with-mpi-incdir=DIR instead. Do not prefix DIR with '-I'.])], -#[AC_MSG_ERROR([--with-mpi-include is an obsolte option. Use --with-mpi-incdir=DIR instead. Do not prefix DIR with '-I'. For example '--with-mpi-incdir=/usr/lam_path/include'.])] -#) - -AC_ARG_WITH(mpi-libs, -[AC_HELP_STRING([--with-mpi-libs="LIBS"],[MPI libraries @<:@"-lmpi"@:>@])], -[ - MPI_LIBS=${withval} - AC_MSG_CHECKING(user-defined MPI libraries) - AC_MSG_RESULT([${MPI_LIBS}]) -] -) - -AC_ARG_WITH(mpi-incdir, -[AC_HELP_STRING([--with-mpi-incdir=DIR],[MPI include directory @<:@MPIROOT/include@:>@ Do not use -I])], -[ - MPI_INC=${withval} - AC_MSG_CHECKING(user-defined MPI includes) - AC_MSG_RESULT([${MPI_INC}]) -] -) - -AC_ARG_WITH(mpi-libdir, -[AC_HELP_STRING([--with-mpi-libdir=DIR],[MPI library directory @<:@MPIROOT/lib@:>@ Do not use -L])], -[ - MPI_LIBDIR=${withval} - AC_MSG_CHECKING(user-defined MPI library directory) - AC_MSG_RESULT([${MPI_LIBDIR}]) -] -) - -AC_MSG_CHECKING(whether we are using MPI) -AC_MSG_RESULT([${HAVE_PKG_MPI}]) - -if test "X${HAVE_PKG_MPI}" = "Xyes"; then - AC_DEFINE(HAVE_MPI,,[define if we want to use MPI]) -fi - -dnl Define Automake version of HAVE_MPI if appropriate - -AM_CONDITIONAL(HAVE_MPI, [test "X${HAVE_PKG_MPI}" = "Xyes"]) - - -dnl -dnl -------------------------------------------------------------------- -dnl Check for MPI compilers (must be done *before* AC_PROG_CXX, -dnl AC_PROG_CC and AC_PROG_F77) -dnl -dnl -------------------------------------------------------------------- - -if test -n "${MPI_CXX}"; then - if test -f ${MPI_CXX}; then - MPI_CXX_EXISTS=yes - else - AC_CHECK_PROG(MPI_CXX_EXISTS, ${MPI_CXX}, yes, no) - fi - - if test "X${MPI_CXX_EXISTS}" = "Xyes"; then - CXX=${MPI_CXX} - else - echo "-----" - echo "Cannot find MPI C++ compiler ${MPI_CXX}." 
- echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a C++ compiler using CXX=" - echo "Do not use --with-mpi-compilers if using CXX=" - echo "-----" - AC_MSG_ERROR([MPI C++ compiler (${MPI_CXX}) not found.]) - fi -fi - -if test -n "${MPI_CC}"; then - if test -f ${MPI_CC}; then - MPI_CC_EXISTS=yes - else - AC_CHECK_PROG(MPI_CC_EXISTS, ${MPI_CC}, yes, no) - fi - - if test "X${MPI_CC_EXISTS}" = "Xyes"; then - CC=${MPI_CC} - else - echo "-----" - echo "Cannot find MPI C compiler ${MPI_CC}." - echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a C compiler using CC=" - echo "Do not use --with-mpi-compilers if using CC=" - echo "-----" - AC_MSG_ERROR([MPI C compiler (${MPI_CC}) not found.]) - fi -fi - -if test "X$ac_cv_use_fortran" = "Xyes"; then - -if test -n "${MPI_F77}"; then - if test -f ${MPI_F77}; then - MPI_F77_EXISTS=yes - else - AC_CHECK_PROG(MPI_F77_EXISTS, ${MPI_F77}, yes, no) - fi - - if test "X${MPI_F77_EXISTS}" = "Xyes"; then - F77=${MPI_F77} - else - echo "-----" - echo "Cannot find MPI Fortran compiler ${MPI_F77}." - echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a Fortran 77 compiler using F77=" - echo "Do not use --with-mpi-compilers if using F77=" - echo "-----" - AC_MSG_ERROR([MPI Fortran 77 compiler (${MPI_F77}) not found.]) - fi -fi - -fi dnl ac_cv_use_fortran -]) diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4 deleted file mode 100644 index b7a8b38..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_export-makefiles.m4 +++ /dev/null @@ -1,76 +0,0 @@ -dnl Enables export makefile specific code -dnl -dnl The following AM_CONDITIONALS are set for makefiles to access: -dnl USING_EXPORT_MAKEFILES -dnl USING_PERL via TAC_ARG_WITH_PERL -dnl USING_GNUMAKE -dnl -dnl The following AC_DEFINES are set: -dnl HAVE_EXPORT_MAKEFILES -dnl -dnl the following variables are set: -dnl PERL_EXE for the perl executable via TAC_ARG_WITH_PERL -dnl -dnl This file was based on tac_arg_enable_feature.m4 by Mike Heroux -dnl @author Roger Pawlowski -dnl -AC_DEFUN([TAC_ARG_ENABLE_EXPORT_MAKEFILES], -[ -AC_ARG_ENABLE(export-makefiles, -AC_HELP_STRING([--enable-export-makefiles],[Creates export makefiles in the install (prefix) directory. This option requires perl to be set in your path or defined with --with-perl=. Note that the export makefiles are always created and used in the build directory, but will not be installable without this option to change the paths. (default is $1)]), -ac_cv_use_export_makefiles=$enableval, -ac_cv_use_export_makefiles=$1) - -AC_MSG_CHECKING(whether to build export makefiles) - -if test "X$ac_cv_use_export_makefiles" != "Xno"; then - - AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_EXPORT_MAKEFILES],,[Define if you want to build export makefiles.]) - -else - - AC_MSG_RESULT(no) - -fi - -AM_CONDITIONAL(USING_EXPORT_MAKEFILES, test X${ac_cv_use_export_makefiles} = Xyes) - -# Check for perl to run scripts (Required dependency) -TAC_ARG_WITH_PERL - -if test "X$HAVE_PERL" != "Xyes" && - test "X$ac_cv_use_export_makefiles" != "Xno"; then - AC_MSG_RESULT(no) - AC_MSG_ERROR([Failed to find the perl executable. The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=. 
If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles.]) -fi - -# Check for using gnumake to clean up link lines via -# gnumake's "shell" command. Optional dependency. -AC_DEFUN([TAC_ARG_WITH_GNUMAKE], -[ -AC_ARG_WITH(gnumake, -AC_HELP_STRING([--with-gnumake],[Gnu's make has special functions we can use to eliminate redundant paths in the build and link lines. Enable this if you use gnu-make to build Trilinos. This requires that perl is in your path or that you have specified the perl executable with --with-perl=. Configure will check for the existence of the perl executable and quit with an error if it is not found. (default is no)]), -ac_cv_use_gnumake=$withval, ac_cv_use_gnumake=no) - -AC_MSG_CHECKING(whether gnumake specific code should be enabled) - -if test "X$ac_cv_use_gnumake" != "Xno"; then - AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_GNUMAKE],,[Define if you are using gnumake - this will shorten your link lines.]) -else - AC_MSG_RESULT(no) -fi -AM_CONDITIONAL(USING_GNUMAKE, test "X$ac_cv_use_gnumake" = "Xyes") -]) - -TAC_ARG_WITH_GNUMAKE - -if test "X$HAVE_PERL" != "Xyes" && - test "X$ac_cv_use_gnumake" != "Xno"; then - AC_MSG_RESULT(no) - AC_MSG_ERROR([The flag --with-gnumake requires perl to be in your path. The perl executable can alternatively be explicitly defined by the flag --with-perl=.]) -fi - -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4 deleted file mode 100644 index 4e22753..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature.m4 +++ /dev/null @@ -1,40 +0,0 @@ -dnl @synopsis TAC_ARG_ENABLE_FEATURE(FEATURE_NAME, FEATURE_DESCRIPTION, HAVE_NAME, DEFAULT_VAL) -dnl -dnl Test for --enable-${FEATURE_NAME} and set to DEFAULT_VAL value if feature not specified. -dnl Also calls AC_DEFINE to define HAVE_${HAVE_NAME} if value is not equal to "no" -dnl -dnl Use this macro to help defining whether or not optional -dnl features* should compiled. For example: -dnl -dnl TAC_ARG_ENABLE_FEATURE(epetra, [Configure and build epetra], EPETRA, yes) -dnl -dnl will test for --enable-epetra when configure is run. If it is defined -dnl and not set to "no" or not defined (default is "yes") then HAVE_EPETRA will -dnl be defined, if --enable-epetra is defined to be "no", HAVE_EPETRA will not -dnl be defined. -dnl -dnl *NOTE: epetra, aztecoo, komplex, ifpack, and other software found in -dnl subdirectories of Trilinos/packages are "packages" in their own right. -dnl However, these packages are also "features" of the larger package -dnl "Trilinos". Therefore, when configuring from the Trilinos directory, -dnl it is appropriate to refer to these software packages as "features". 
-dnl -dnl This file was based on tac_arg_with_package.m4 by Mike Heroux -dnl @author James Willenbring -dnl -AC_DEFUN([TAC_ARG_ENABLE_FEATURE], -[ -AC_ARG_ENABLE([$1], -AC_HELP_STRING([--enable-$1],[$2 (default is [$4])]), -ac_cv_use_$1=$enableval, ac_cv_use_$1=$4) - -AC_MSG_CHECKING(whether to use [$1]) - -if test "X$ac_cv_use_$1" != "Xno"; then - AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_$3],,[Define if want to build $1]) -else - AC_MSG_RESULT(no) -fi -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4 deleted file mode 100755 index b3876fd..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_enable_feature_sub_check.m4 +++ /dev/null @@ -1,54 +0,0 @@ -dnl @synopsis TAC_ARG_ENABLE_FEATURE_SUB_CHECK(FEATURE_NAME, SUB_FEATURE_NAME, FEATURE_DESCRIPTION, HAVE_NAME) -dnl -dnl This hack gets around the fact that TAC_ARG_ENABLE_FEATURE does not support underscores -dnl in its feature names. TAC_ARG_ENABLE_FEATURE_SUB_CHECK allows exactly one underscore. Not great, -dnl but arguably better than supporting no underscores. -dnl -dnl TAC_ARG_ENABLE_FEATURE(feature-sub, [Configure and build feature-sub], FEATURE_SUB, yes) -dnl fails because tac_arg_enable_feature tests for ac_cv_use_feature-sub which gets -dnl rejected because the `-' is not allowed in variables. (AC_ARG_ENABLE sets ac_cv_use_feature_sub -dnl to avoid this problem.) Use: -dnl -dnl TAC_ARG_ENABLE_FEATURE_SUB_CHECK(feature, sub, [Configure and build feature-sub], FEATURE_SUB) -dnl instead. -dnl -dnl This macro will test for --enable-${FEATURE_NAME}-${SUB_FEATURE_NAME} when configure is run. -dnl If it is defined and not set to "no" or not defined and --disable-${SUB_FEATURE_NAME} is not -dnl specified then HAVE_${HAVE_NAME} will be defined. -dnl -dnl *NOTE: This macro is designed for the use-case when there is an individual Trilinos package -dnl offering fine-grained control of a Trilinos option. This way, the individual package -dnl option is enabled, as long as the Trilinos option is not disabled. If the Trilinos option is -dnl disabled, then the user must enable each packages option individually. For instance: -dnl -dnl --disable-tests --enable-teuchos-tests -dnl -dnl *NOTE: epetra, aztecoo, komplex, ifpack, and other software found in -dnl subdirectories of Trilinos/packages are "packages" in their own right. -dnl However, these packages are also "features" of the larger package -dnl "Trilinos". Therefore, when configuring from the Trilinos directory, -dnl it is appropriate to refer to these software packages as "features". -dnl -dnl This file was based on tac_arg_enable_package.m4 by Jim Willenbring -dnl and tac_arg_enable_package_sub.m4 by Ken Stanley. 
-dnl -dnl @author Heidi Thornquist -dnl -AC_DEFUN([TAC_ARG_ENABLE_FEATURE_SUB_CHECK], -[ -AC_ARG_ENABLE([$2],, ac_cv_use_$2=$enableval, ac_cv_use_$2=yes) - -AC_ARG_ENABLE([$1-$2], -AC_HELP_STRING([--enable-$1-$2],[$3 (default is yes if --disable-$2 is not specified)]), -ac_cv_use_$1_$2=$enableval, ac_cv_use_$1_$2=${ac_cv_use_$2}) - -AC_MSG_CHECKING(whether to use [$1-$2]) - -if test "X$ac_cv_use_$1_$2" != "Xno"; then - AC_MSG_RESULT(yes) - AC_DEFINE([HAVE_$4],,[Define if want to build $1-$2]) -else - AC_MSG_RESULT(no) -fi -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_ar.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_ar.m4 deleted file mode 100644 index 9568f3e..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_ar.m4 +++ /dev/null @@ -1,39 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_AR -dnl -dnl Test for --with-ar="ar_program ar_flags". -dnl Default is "ar cru" -dnl -dnl Generates an Automake conditional USE_ALTERNATE_AR that can be tested. -dnl Generates the user-specified archiver command in @ALTERNATE_AR@. -dnl -dnl @author Mike Heroux -dnl -AC_DEFUN([TAC_ARG_WITH_AR], -[ -AC_ARG_WITH(ar, -AC_HELP_STRING([--with-ar], [override archiver command (default is "ar cru")]), -[ -AC_MSG_CHECKING(user-defined archiver) -AC_MSG_RESULT([${withval}]) -USE_ALTERNATE_AR=yes -ALTERNATE_AR="${withval}" -] -) - -if test -n "${SPECIAL_AR}" && test "X${USE_ALTERNATE_AR}" != "Xyes"; -then - USE_ALTERNATE_AR=yes - ALTERNATE_AR="${SPECIAL_AR}" -fi - -AC_MSG_CHECKING(for special archiver command) -if test "X${USE_ALTERNATE_AR}" = "Xyes"; then - AC_MSG_RESULT([${ALTERNATE_AR}]) - AM_CONDITIONAL(USE_ALTERNATE_AR, true) -else - AC_MSG_RESULT([none]) - AM_CONDITIONAL(USE_ALTERNATE_AR, false) -fi -AC_SUBST(ALTERNATE_AR) -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_flags.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_flags.m4 deleted file mode 100644 index 256450a..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_flags.m4 +++ /dev/null @@ -1,31 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_FLAGS(lcase_name, UCASE_NAME) -dnl -dnl Test for --with-lcase_name="compiler/loader flags". if defined, prepend -dnl flags to standard UCASE_NAME definition. -dnl -dnl Use this macro to facilitate additional special flags that should be -dnl passed on to the preprocessor/compilers/loader. -dnl -dnl Example use -dnl -dnl TAC_ARG_WITH_FLAGS(cxxflags, CXXFLAGS) -dnl -dnl tests for --with-cxxflags and pre-pends to CXXFLAGS -dnl -dnl -dnl @author Mike Heroux -dnl -AC_DEFUN([TAC_ARG_WITH_FLAGS], -[ -AC_MSG_CHECKING([whether additional [$2] flags should be added]) -AC_ARG_WITH($1, -AC_HELP_STRING([--with-$1], -[additional [$2] flags to be added: will prepend to [$2]]), -[ -$2="${withval} ${$2}" -AC_MSG_RESULT([$2 = ${$2}]) -], -AC_MSG_RESULT(no) -) -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4 deleted file mode 100644 index f3092e5..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_incdirs.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_INCDIRS -dnl -dnl Test for --with-incdirs="-Iincdir1 -Iincdir2". if defined, prepend -dnl "-Iincdir1 -Iincdir2" to CPPFLAGS -dnl -dnl Use this macro to facilitate addition of directories to include file search path. 
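The sub-check variant above is what gives ThreadPool its --enable-threadpool-tests switch (visible in the --help output further down in this diff). A minimal sketch of the call and the interaction it is designed for; the exact arguments, including the HAVE_ name, are an assumption based on the option names and are not copied from ThreadPool's configure.ac:

    dnl in configure.ac
    TAC_ARG_ENABLE_FEATURE_SUB_CHECK(threadpool, tests,
      [Make ThreadPool tests buildable with 'make tests'], THREADPOOL_TESTS)
    dnl resulting behaviour:
    dnl   ./configure                                -> HAVE_THREADPOOL_TESTS defined
    dnl   ./configure --disable-tests                -> HAVE_THREADPOOL_TESTS undefined
    dnl   ./configure --disable-tests --enable-threadpool-tests
    dnl                                              -> HAVE_THREADPOOL_TESTS defined

In other words, the package-level option defaults to whatever the Trilinos-wide --enable-tests/--disable-tests setting is, and only an explicit --enable-threadpool-tests overrides a global --disable-tests.
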
-dnl -dnl -dnl @author Mike Heroux -dnl -AC_DEFUN([TAC_ARG_WITH_INCDIRS], -[ -AC_MSG_CHECKING([whether additional include search paths defined]) -AC_ARG_WITH(incdirs, -AC_HELP_STRING([--with-incdirs], -[additional directories containing include files: will prepend to search here for includes, use -Idir format]), -[ -CPPFLAGS="${withval} ${CPPFLAGS}" -AC_MSG_RESULT([${withval}]) -], -AC_MSG_RESULT(no) -) -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4 deleted file mode 100644 index b2f9438..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libdirs.m4 +++ /dev/null @@ -1,24 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_LIBDIRS -dnl -dnl Test for --with-libdirs="-Llibdir1 -Llibdir2". if defined, -dnl prepend "-Llibdir1 -Llibdir2" to LDFLAGS -dnl -dnl Use this macro to facilitate addition of directories to library search path. -dnl -dnl -dnl @author Mike Heroux -dnl -AC_DEFUN([TAC_ARG_WITH_LIBDIRS], -[ -AC_MSG_CHECKING([whether additional library search paths defined]) -AC_ARG_WITH(libdirs, -AC_HELP_STRING([--with-libdirs], -[OBSOLETE use --with-ldflags instead. (ex. --with-ldflags="-L -L")]), -[ -LDFLAGS="${withval} ${LDFLAGS}" -AC_MSG_RESULT([${withval}]) -], -AC_MSG_RESULT(no) -) -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libs.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libs.m4 deleted file mode 100644 index 3a64880..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_libs.m4 +++ /dev/null @@ -1,30 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_LIBS -dnl -dnl Test for --with-libs="name(s)". -dnl -dnl Prepends the specified name(s) to the list of libraries to link -dnl with. -dnl -dnl Example use -dnl -dnl TAC_ARG_WITH_LIBS -dnl -dnl tests for --with-libs and pre-pends to LIBS -dnl -dnl @author Jim Willenbring -dnl -AC_DEFUN([TAC_ARG_WITH_LIBS], -[ -AC_MSG_CHECKING([whether additional libraries are needed]) -AC_ARG_WITH(libs, -AC_HELP_STRING([--with-libs], -[List additional libraries here. For example, --with-libs=-lsuperlu -or --with-libs=/path/libsuperlu.a]), -[ -LIBS="${withval} ${LIBS}" -AC_MSG_RESULT([LIBS = ${LIBS}]) -], -AC_MSG_RESULT(no) -) -] -) diff --git a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_perl.m4 b/kokkos/basic/optional/ThreadPool/config/tac_arg_with_perl.m4 deleted file mode 100644 index 63e74ba..0000000 --- a/kokkos/basic/optional/ThreadPool/config/tac_arg_with_perl.m4 +++ /dev/null @@ -1,34 +0,0 @@ -dnl @synopsis TAC_ARG_WITH_PERL(DEFAULT_VAL) -dnl -dnl Test for --enable-gnumake and set to DEFAULT_VAL value if feature not specified. -dnl Calls AC_DEFINE to define HAVE_GNUMAKE if value is not equal to "no" -dnl Calls AM_CONDITIONAL to define USING_GNUMAKE to true/false. -dnl -dnl This file was based on tac_arg_with_ar.m4 by Mike Heroux -dnl @author Roger Pawlowski -dnl -AC_DEFUN([TAC_ARG_WITH_PERL], -[ - -AC_ARG_WITH(perl, -AC_HELP_STRING([--with-perl], [supply a perl executable. 
For example --with-perl=/usr/bin/perl.]), -[ -AC_MSG_CHECKING(for user supplied perl executable) -AC_MSG_RESULT([${withval}]) -USER_SPECIFIED_PERL=yes -PERL_EXE="${withval}" -], -[ -USER_SPECIFIED_PERL=no -]) - -if test "X${USER_SPECIFIED_PERL}" = "Xyes"; then - AC_CHECK_FILE(${PERL_EXE}, [HAVE_PERL=yes], [HAVE_PERL=no]) - AC_SUBST(PERL_EXE, ${PERL_EXE}) -else - AC_CHECK_PROG(HAVE_PERL, perl, yes, no) - AC_SUBST(PERL_EXE, perl) -fi -AM_CONDITIONAL(USING_PERL, test X${HAVE_PERL} = Xyes) -]) - diff --git a/kokkos/basic/optional/ThreadPool/config/token-replace.pl b/kokkos/basic/optional/ThreadPool/config/token-replace.pl deleted file mode 100755 index c3b413e..0000000 --- a/kokkos/basic/optional/ThreadPool/config/token-replace.pl +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl -w -# -# This perl script replaces a string with another string -# on a token basis. Here it is allowed for file_in and -# file_out to be the same file. -# -use strict; -# -my $g_use_msg = - "Use: token-replace.pl find_token replacement_token file_in file_out\n"; -if( scalar(@ARGV) < 4 ) { - print STDERR $g_use_msg; - exit(-1); -} -# -my $find_token = shift; -my $replacement_token = shift; -my $file_in_name = shift; -my $file_out_name = shift; -# -#print "file_in_name = $file_in_name\n"; -if($file_in_name=~/CVS/) { -# print "Do not replace in CVS\n"; - exit; -} -open FILE_IN, "<$file_in_name" || die "The file $file_in_name could not be opended for input\n"; -my @file_in_array = ; -close FILE_IN; -# -my $match_str = '([^\w\d_]|^)' . $find_token . '([^\w\d_]|$)'; -#print $match_str . "\n"; -# -my @file_out_array; -my $did_replacement = 0; -foreach(@file_in_array) { - $did_replacement = 1 if $_=~s/$match_str/$1$replacement_token$2/g; - push @file_out_array, $_; -} -if($did_replacement || $file_out_name ne $file_in_name) { - open FILE_OUT, ">$file_out_name" || die "The file $file_out_name could not be opended for output\n"; - print FILE_OUT @file_out_array; - close FILE_OUT; -} diff --git a/kokkos/basic/optional/ThreadPool/configure b/kokkos/basic/optional/ThreadPool/configure deleted file mode 100755 index 6312db9..0000000 --- a/kokkos/basic/optional/ThreadPool/configure +++ /dev/null @@ -1,7804 +0,0 @@ -#! /bin/sh -# Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.61 for ThreadPool 1.1d. -# -# Report bugs to . -# -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -# 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# This configure script is free software; the Free Software Foundation -# gives unlimited permission to copy, distribute and modify it. -## --------------------- ## -## M4sh Initialization. ## -## --------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - - - -# PATH needs CR -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - -# The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then - echo "#! 
/bin/sh" >conf$$.sh - echo "exit 0" >>conf$$.sh - chmod +x conf$$.sh - if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then - PATH_SEPARATOR=';' - else - PATH_SEPARATOR=: - fi - rm -f conf$$.sh -fi - -# Support unset when possible. -if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then - as_unset=unset -else - as_unset=false -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -as_nl=' -' -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -case $0 in - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break -done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - { (exit 1); exit 1; } -fi - -# Work around bugs in pre-3.0 UWIN ksh. -for as_var in ENV MAIL MAILPATH -do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -for as_var in \ - LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \ - LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \ - LC_TELEPHONE LC_TIME -do - if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then - eval $as_var=C; export $as_var - else - ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var - fi -done - -# Required to use basename. -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - - -# Name of the executable. -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# CDPATH. -$as_unset CDPATH - - -if test "x$CONFIG_SHELL" = x; then - if (eval ":") 2>/dev/null; then - as_have_required=yes -else - as_have_required=no -fi - - if test $as_have_required = yes && (eval ": -(as_func_return () { - (exit \$1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. -fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. -fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. -fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = \"\$1\" ); then - : -else - exitcode=1 - echo positional parameters were not saved. 
-fi - -test \$exitcode = 0) || { (exit 1); exit 1; } - -( - as_lineno_1=\$LINENO - as_lineno_2=\$LINENO - test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" && - test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; } -") 2> /dev/null; then - : -else - as_candidate_shells= - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - case $as_dir in - /*) - for as_base in sh bash ksh sh5; do - as_candidate_shells="$as_candidate_shells $as_dir/$as_base" - done;; - esac -done -IFS=$as_save_IFS - - - for as_shell in $as_candidate_shells $SHELL; do - # Try only shells that exist, to save several forks. - if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { ("$as_shell") 2> /dev/null <<\_ASEOF -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - -: -_ASEOF -}; then - CONFIG_SHELL=$as_shell - as_have_required=yes - if { "$as_shell" 2> /dev/null <<\_ASEOF -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - -: -(as_func_return () { - (exit $1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. -fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. -fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. -fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = "$1" ); then - : -else - exitcode=1 - echo positional parameters were not saved. -fi - -test $exitcode = 0) || { (exit 1); exit 1; } - -( - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; } - -_ASEOF -}; then - break -fi - -fi - - done - - if test "x$CONFIG_SHELL" != x; then - for as_var in BASH_ENV ENV - do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var - done - export CONFIG_SHELL - exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"} -fi - - - if test $as_have_required = no; then - echo This script requires a shell more modern than all the - echo shells that I found on your system. Please install a - echo modern shell, or manually run the script under such a - echo shell if you do have one. - { (exit 1); exit 1; } -fi - - -fi - -fi - - - -(eval "as_func_return () { - (exit \$1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. -fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. 
-fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. -fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = \"\$1\" ); then - : -else - exitcode=1 - echo positional parameters were not saved. -fi - -test \$exitcode = 0") || { - echo No shell found that supports shell functions. - echo Please tell autoconf@gnu.org about your system, - echo including any error possibly output before this - echo message -} - - - - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { - - # Create $as_me.lineno as a copy of $as_myself, but with $LINENO - # uniformly replaced by the line number. The first 'sed' inserts a - # line-number line after each line using $LINENO; the second 'sed' - # does the real work. The second script uses 'N' to pair each - # line-number line with the line containing $LINENO, and appends - # trailing '-' during substitution so that $LINENO is not a special - # case at line end. - # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the - # scripts with optimization help from Paolo Bonzini. Blame Lee - # E. McMahon (1931-1989) for sed's syntax. :-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 - { (exit 1); exit 1; }; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in --n*) - case `echo 'x\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. - *) ECHO_C='\c';; - esac;; -*) - ECHO_N='-n';; -esac - -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir -fi -echo >conf$$.file -if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' -elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - -if mkdir -p . 
2>/dev/null; then - as_mkdir_p=: -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - - -exec 7<&0 &1 - -# Name of the host. -# hostname on some systems (SVR3.2, Linux) returns a bogus exit status, -# so uname gets run too. -ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` - -# -# Initializations. -# -ac_default_prefix=/usr/local -ac_clean_files= -ac_config_libobj_dir=. -LIBOBJS= -cross_compiling=no -subdirs= -MFLAGS= -MAKEFLAGS= -SHELL=${CONFIG_SHELL-/bin/sh} - -# Identity of this package. -PACKAGE_NAME='ThreadPool' -PACKAGE_TARNAME='threadpool' -PACKAGE_VERSION='1.1d' -PACKAGE_STRING='ThreadPool 1.1d' -PACKAGE_BUGREPORT='hcedwar@sandia.gov' - -ac_unique_file="src/TPI.c" -# Factoring default headers for most tests. -ac_includes_default="\ -#include -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#ifdef HAVE_SYS_STAT_H -# include -#endif -#ifdef STDC_HEADERS -# include -# include -#else -# ifdef HAVE_STDLIB_H -# include -# endif -#endif -#ifdef HAVE_STRING_H -# if !defined STDC_HEADERS && defined HAVE_MEMORY_H -# include -# endif -# include -#endif -#ifdef HAVE_STRINGS_H -# include -#endif -#ifdef HAVE_INTTYPES_H -# include -#endif -#ifdef HAVE_STDINT_H -# include -#endif -#ifdef HAVE_UNISTD_H -# include -#endif" - -ac_subst_vars='SHELL -PATH_SEPARATOR -PACKAGE_NAME -PACKAGE_TARNAME -PACKAGE_VERSION -PACKAGE_STRING -PACKAGE_BUGREPORT -exec_prefix -prefix -program_transform_name -bindir -sbindir -libexecdir -datarootdir -datadir -sysconfdir -sharedstatedir -localstatedir -includedir -oldincludedir -docdir -infodir -htmldir -dvidir -pdfdir -psdir -libdir -localedir -mandir -DEFS -ECHO_C -ECHO_N -ECHO_T -LIBS -build_alias -host_alias -target_alias -MAINTAINER_MODE_TRUE -MAINTAINER_MODE_FALSE -MAINT -build -build_cpu -build_vendor -build_os -host -host_cpu -host_vendor -host_os -target -target_cpu -target_vendor -target_os -INSTALL_PROGRAM -INSTALL_SCRIPT -INSTALL_DATA -am__isrc -CYGPATH_W -PACKAGE -VERSION -ACLOCAL -AUTOCONF -AUTOMAKE -AUTOHEADER -MAKEINFO -install_sh -STRIP -INSTALL_STRIP_PROGRAM -mkdir_p -AWK -SET_MAKE -am__leading_dot -AMTAR -am__tar -am__untar -MPI_TEMP_CXX -MPI_CXX -HAVE_MPI_TRUE -HAVE_MPI_FALSE -MPI_CXX_EXISTS -MPI_CC_EXISTS -MPI_F77_EXISTS -CC -CFLAGS -LDFLAGS -CPPFLAGS -ac_ct_CC -EXEEXT -OBJEXT -DEPDIR -am__include -am__quote -AMDEP_TRUE -AMDEP_FALSE -AMDEPBACKSLASH -CCDEPMODE -am__fastdepCC_TRUE -am__fastdepCC_FALSE -CXX -CXXFLAGS -ac_ct_CXX -CXXDEPMODE -am__fastdepCXX_TRUE -am__fastdepCXX_FALSE -RANLIB -USE_ALTERNATE_AR_TRUE -USE_ALTERNATE_AR_FALSE -ALTERNATE_AR -CXXCPP -USING_EXPORT_MAKEFILES_TRUE -USING_EXPORT_MAKEFILES_FALSE -PERL_EXE -HAVE_PERL -USING_PERL_TRUE -USING_PERL_FALSE -USING_GNUMAKE_TRUE -USING_GNUMAKE_FALSE -BUILD_TESTS_TRUE -BUILD_TESTS_FALSE -SUB_TEST_TRUE -SUB_TEST_FALSE -GREP -EGREP -PTHREAD_CC -PTHREAD_LIBS -PTHREAD_CFLAGS -ac_aux_dir -LIBOBJS -LTLIBOBJS' -ac_subst_files='' - 
ac_precious_vars='build_alias -host_alias -target_alias -CC -CFLAGS -LDFLAGS -LIBS -CPPFLAGS -CXX -CXXFLAGS -CCC -CXXCPP' - - -# Initialize some variables set by options. -ac_init_help= -ac_init_version=false -# The variables have the same names as the options, with -# dashes changed to underlines. -cache_file=/dev/null -exec_prefix=NONE -no_create= -no_recursion= -prefix=NONE -program_prefix=NONE -program_suffix=NONE -program_transform_name=s,x,x, -silent= -site= -srcdir= -verbose= -x_includes=NONE -x_libraries=NONE - -# Installation directory options. -# These are left unexpanded so users can "make install exec_prefix=/foo" -# and all the variables that are supposed to be based on exec_prefix -# by default will actually change. -# Use braces instead of parens because sh, perl, etc. also accept them. -# (The list follows the same order as the GNU Coding Standards.) -bindir='${exec_prefix}/bin' -sbindir='${exec_prefix}/sbin' -libexecdir='${exec_prefix}/libexec' -datarootdir='${prefix}/share' -datadir='${datarootdir}' -sysconfdir='${prefix}/etc' -sharedstatedir='${prefix}/com' -localstatedir='${prefix}/var' -includedir='${prefix}/include' -oldincludedir='/usr/include' -docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' -infodir='${datarootdir}/info' -htmldir='${docdir}' -dvidir='${docdir}' -pdfdir='${docdir}' -psdir='${docdir}' -libdir='${exec_prefix}/lib' -localedir='${datarootdir}/locale' -mandir='${datarootdir}/man' - -ac_prev= -ac_dashdash= -for ac_option -do - # If the previous option needs an argument, assign it. - if test -n "$ac_prev"; then - eval $ac_prev=\$ac_option - ac_prev= - continue - fi - - case $ac_option in - *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; - *) ac_optarg=yes ;; - esac - - # Accept the important Cygnus configure options, so we can diagnose typos. - - case $ac_dashdash$ac_option in - --) - ac_dashdash=yes ;; - - -bindir | --bindir | --bindi | --bind | --bin | --bi) - ac_prev=bindir ;; - -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) - bindir=$ac_optarg ;; - - -build | --build | --buil | --bui | --bu) - ac_prev=build_alias ;; - -build=* | --build=* | --buil=* | --bui=* | --bu=*) - build_alias=$ac_optarg ;; - - -cache-file | --cache-file | --cache-fil | --cache-fi \ - | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) - ac_prev=cache_file ;; - -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ - | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) - cache_file=$ac_optarg ;; - - --config-cache | -C) - cache_file=config.cache ;; - - -datadir | --datadir | --datadi | --datad) - ac_prev=datadir ;; - -datadir=* | --datadir=* | --datadi=* | --datad=*) - datadir=$ac_optarg ;; - - -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ - | --dataroo | --dataro | --datar) - ac_prev=datarootdir ;; - -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ - | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) - datarootdir=$ac_optarg ;; - - -disable-* | --disable-*) - ac_feature=`expr "x$ac_option" : 'x-*disable-\(.*\)'` - # Reject names that are not valid shell variable names. 
- expr "x$ac_feature" : ".*[^-._$as_cr_alnum]" >/dev/null && - { echo "$as_me: error: invalid feature name: $ac_feature" >&2 - { (exit 1); exit 1; }; } - ac_feature=`echo $ac_feature | sed 's/[-.]/_/g'` - eval enable_$ac_feature=no ;; - - -docdir | --docdir | --docdi | --doc | --do) - ac_prev=docdir ;; - -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) - docdir=$ac_optarg ;; - - -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) - ac_prev=dvidir ;; - -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) - dvidir=$ac_optarg ;; - - -enable-* | --enable-*) - ac_feature=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_feature" : ".*[^-._$as_cr_alnum]" >/dev/null && - { echo "$as_me: error: invalid feature name: $ac_feature" >&2 - { (exit 1); exit 1; }; } - ac_feature=`echo $ac_feature | sed 's/[-.]/_/g'` - eval enable_$ac_feature=\$ac_optarg ;; - - -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ - | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ - | --exec | --exe | --ex) - ac_prev=exec_prefix ;; - -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ - | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ - | --exec=* | --exe=* | --ex=*) - exec_prefix=$ac_optarg ;; - - -gas | --gas | --ga | --g) - # Obsolete; use --with-gas. - with_gas=yes ;; - - -help | --help | --hel | --he | -h) - ac_init_help=long ;; - -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) - ac_init_help=recursive ;; - -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) - ac_init_help=short ;; - - -host | --host | --hos | --ho) - ac_prev=host_alias ;; - -host=* | --host=* | --hos=* | --ho=*) - host_alias=$ac_optarg ;; - - -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) - ac_prev=htmldir ;; - -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ - | --ht=*) - htmldir=$ac_optarg ;; - - -includedir | --includedir | --includedi | --included | --include \ - | --includ | --inclu | --incl | --inc) - ac_prev=includedir ;; - -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ - | --includ=* | --inclu=* | --incl=* | --inc=*) - includedir=$ac_optarg ;; - - -infodir | --infodir | --infodi | --infod | --info | --inf) - ac_prev=infodir ;; - -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) - infodir=$ac_optarg ;; - - -libdir | --libdir | --libdi | --libd) - ac_prev=libdir ;; - -libdir=* | --libdir=* | --libdi=* | --libd=*) - libdir=$ac_optarg ;; - - -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ - | --libexe | --libex | --libe) - ac_prev=libexecdir ;; - -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ - | --libexe=* | --libex=* | --libe=*) - libexecdir=$ac_optarg ;; - - -localedir | --localedir | --localedi | --localed | --locale) - ac_prev=localedir ;; - -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) - localedir=$ac_optarg ;; - - -localstatedir | --localstatedir | --localstatedi | --localstated \ - | --localstate | --localstat | --localsta | --localst | --locals) - ac_prev=localstatedir ;; - -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ - | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) - localstatedir=$ac_optarg ;; - - -mandir | --mandir | --mandi | --mand | --man | --ma | --m) - ac_prev=mandir ;; - -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) - 
mandir=$ac_optarg ;; - - -nfp | --nfp | --nf) - # Obsolete; use --without-fp. - with_fp=no ;; - - -no-create | --no-create | --no-creat | --no-crea | --no-cre \ - | --no-cr | --no-c | -n) - no_create=yes ;; - - -no-recursion | --no-recursion | --no-recursio | --no-recursi \ - | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) - no_recursion=yes ;; - - -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ - | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ - | --oldin | --oldi | --old | --ol | --o) - ac_prev=oldincludedir ;; - -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ - | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ - | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) - oldincludedir=$ac_optarg ;; - - -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) - ac_prev=prefix ;; - -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) - prefix=$ac_optarg ;; - - -program-prefix | --program-prefix | --program-prefi | --program-pref \ - | --program-pre | --program-pr | --program-p) - ac_prev=program_prefix ;; - -program-prefix=* | --program-prefix=* | --program-prefi=* \ - | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) - program_prefix=$ac_optarg ;; - - -program-suffix | --program-suffix | --program-suffi | --program-suff \ - | --program-suf | --program-su | --program-s) - ac_prev=program_suffix ;; - -program-suffix=* | --program-suffix=* | --program-suffi=* \ - | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) - program_suffix=$ac_optarg ;; - - -program-transform-name | --program-transform-name \ - | --program-transform-nam | --program-transform-na \ - | --program-transform-n | --program-transform- \ - | --program-transform | --program-transfor \ - | --program-transfo | --program-transf \ - | --program-trans | --program-tran \ - | --progr-tra | --program-tr | --program-t) - ac_prev=program_transform_name ;; - -program-transform-name=* | --program-transform-name=* \ - | --program-transform-nam=* | --program-transform-na=* \ - | --program-transform-n=* | --program-transform-=* \ - | --program-transform=* | --program-transfor=* \ - | --program-transfo=* | --program-transf=* \ - | --program-trans=* | --program-tran=* \ - | --progr-tra=* | --program-tr=* | --program-t=*) - program_transform_name=$ac_optarg ;; - - -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) - ac_prev=pdfdir ;; - -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) - pdfdir=$ac_optarg ;; - - -psdir | --psdir | --psdi | --psd | --ps) - ac_prev=psdir ;; - -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) - psdir=$ac_optarg ;; - - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - silent=yes ;; - - -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) - ac_prev=sbindir ;; - -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ - | --sbi=* | --sb=*) - sbindir=$ac_optarg ;; - - -sharedstatedir | --sharedstatedir | --sharedstatedi \ - | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ - | --sharedst | --shareds | --shared | --share | --shar \ - | --sha | --sh) - ac_prev=sharedstatedir ;; - -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ - | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ - | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ - | --sha=* | --sh=*) - sharedstatedir=$ac_optarg ;; - - 
-site | --site | --sit) - ac_prev=site ;; - -site=* | --site=* | --sit=*) - site=$ac_optarg ;; - - -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) - ac_prev=srcdir ;; - -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) - srcdir=$ac_optarg ;; - - -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ - | --syscon | --sysco | --sysc | --sys | --sy) - ac_prev=sysconfdir ;; - -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ - | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) - sysconfdir=$ac_optarg ;; - - -target | --target | --targe | --targ | --tar | --ta | --t) - ac_prev=target_alias ;; - -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) - target_alias=$ac_optarg ;; - - -v | -verbose | --verbose | --verbos | --verbo | --verb) - verbose=yes ;; - - -version | --version | --versio | --versi | --vers | -V) - ac_init_version=: ;; - - -with-* | --with-*) - ac_package=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_package" : ".*[^-._$as_cr_alnum]" >/dev/null && - { echo "$as_me: error: invalid package name: $ac_package" >&2 - { (exit 1); exit 1; }; } - ac_package=`echo $ac_package | sed 's/[-.]/_/g'` - eval with_$ac_package=\$ac_optarg ;; - - -without-* | --without-*) - ac_package=`expr "x$ac_option" : 'x-*without-\(.*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_package" : ".*[^-._$as_cr_alnum]" >/dev/null && - { echo "$as_me: error: invalid package name: $ac_package" >&2 - { (exit 1); exit 1; }; } - ac_package=`echo $ac_package | sed 's/[-.]/_/g'` - eval with_$ac_package=no ;; - - --x) - # Obsolete; use --with-x. - with_x=yes ;; - - -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ - | --x-incl | --x-inc | --x-in | --x-i) - ac_prev=x_includes ;; - -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ - | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) - x_includes=$ac_optarg ;; - - -x-libraries | --x-libraries | --x-librarie | --x-librari \ - | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) - ac_prev=x_libraries ;; - -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ - | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) - x_libraries=$ac_optarg ;; - - -*) { echo "$as_me: error: unrecognized option: $ac_option -Try \`$0 --help' for more information." >&2 - { (exit 1); exit 1; }; } - ;; - - *=*) - ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` - # Reject names that are not valid shell variable names. - expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && - { echo "$as_me: error: invalid variable name: $ac_envvar" >&2 - { (exit 1); exit 1; }; } - eval $ac_envvar=\$ac_optarg - export $ac_envvar ;; - - *) - # FIXME: should be removed in autoconf 3.0. - echo "$as_me: WARNING: you should use --build, --host, --target" >&2 - expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && - echo "$as_me: WARNING: invalid host type: $ac_option" >&2 - : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} - ;; - - esac -done - -if test -n "$ac_prev"; then - ac_option=--`echo $ac_prev | sed 's/_/-/g'` - { echo "$as_me: error: missing argument to $ac_option" >&2 - { (exit 1); exit 1; }; } -fi - -# Be sure to have absolute directory names. 
-for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ - datadir sysconfdir sharedstatedir localstatedir includedir \ - oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir -do - eval ac_val=\$$ac_var - case $ac_val in - [\\/$]* | ?:[\\/]* ) continue;; - NONE | '' ) case $ac_var in *prefix ) continue;; esac;; - esac - { echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 - { (exit 1); exit 1; }; } -done - -# There might be people who depend on the old broken behavior: `$host' -# used to hold the argument of --host etc. -# FIXME: To remove some day. -build=$build_alias -host=$host_alias -target=$target_alias - -# FIXME: To remove some day. -if test "x$host_alias" != x; then - if test "x$build_alias" = x; then - cross_compiling=maybe - echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. - If a cross compiler is detected then cross compile mode will be used." >&2 - elif test "x$build_alias" != "x$host_alias"; then - cross_compiling=yes - fi -fi - -ac_tool_prefix= -test -n "$host_alias" && ac_tool_prefix=$host_alias- - -test "$silent" = yes && exec 6>/dev/null - - -ac_pwd=`pwd` && test -n "$ac_pwd" && -ac_ls_di=`ls -di .` && -ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || - { echo "$as_me: error: Working directory cannot be determined" >&2 - { (exit 1); exit 1; }; } -test "X$ac_ls_di" = "X$ac_pwd_ls_di" || - { echo "$as_me: error: pwd does not report name of working directory" >&2 - { (exit 1); exit 1; }; } - - -# Find the source files, if location was not specified. -if test -z "$srcdir"; then - ac_srcdir_defaulted=yes - # Try the directory containing this script, then the parent directory. - ac_confdir=`$as_dirname -- "$0" || -$as_expr X"$0" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$0" : 'X\(//\)[^/]' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -echo X"$0" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - srcdir=$ac_confdir - if test ! -r "$srcdir/$ac_unique_file"; then - srcdir=.. - fi -else - ac_srcdir_defaulted=no -fi -if test ! -r "$srcdir/$ac_unique_file"; then - test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." - { echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 - { (exit 1); exit 1; }; } -fi -ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" -ac_abs_confdir=`( - cd "$srcdir" && test -r "./$ac_unique_file" || { echo "$as_me: error: $ac_msg" >&2 - { (exit 1); exit 1; }; } - pwd)` -# When building in place, set srcdir=. -if test "$ac_abs_confdir" = "$ac_pwd"; then - srcdir=. -fi -# Remove unnecessary trailing slashes from srcdir. -# Double slashes in file names in object file debugging info -# mess up M-x gdb in Emacs. -case $srcdir in -*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; -esac -for ac_var in $ac_precious_vars; do - eval ac_env_${ac_var}_set=\${${ac_var}+set} - eval ac_env_${ac_var}_value=\$${ac_var} - eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} - eval ac_cv_env_${ac_var}_value=\$${ac_var} -done - -# -# Report the --help message. -# -if test "$ac_init_help" = "long"; then - # Omit some internal or obsolete options to make the list less imposing. - # This message is too long to be a string in the A/UX 3.1 sh. - cat <<_ACEOF -\`configure' configures ThreadPool 1.1d to adapt to many kinds of systems. - -Usage: $0 [OPTION]... 
[VAR=VALUE]... - -To assign environment variables (e.g., CC, CFLAGS...), specify them as -VAR=VALUE. See below for descriptions of some of the useful variables. - -Defaults for the options are specified in brackets. - -Configuration: - -h, --help display this help and exit - --help=short display options specific to this package - --help=recursive display the short help of all the included packages - -V, --version display version information and exit - -q, --quiet, --silent do not print \`checking...' messages - --cache-file=FILE cache test results in FILE [disabled] - -C, --config-cache alias for \`--cache-file=config.cache' - -n, --no-create do not create output files - --srcdir=DIR find the sources in DIR [configure dir or \`..'] - -Installation directories: - --prefix=PREFIX install architecture-independent files in PREFIX - [$ac_default_prefix] - --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX - [PREFIX] - -By default, \`make install' will install all the files in -\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify -an installation prefix other than \`$ac_default_prefix' using \`--prefix', -for instance \`--prefix=\$HOME'. - -For better control, use the options below. - -Fine tuning of the installation directories: - --bindir=DIR user executables [EPREFIX/bin] - --sbindir=DIR system admin executables [EPREFIX/sbin] - --libexecdir=DIR program executables [EPREFIX/libexec] - --sysconfdir=DIR read-only single-machine data [PREFIX/etc] - --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] - --localstatedir=DIR modifiable single-machine data [PREFIX/var] - --libdir=DIR object code libraries [EPREFIX/lib] - --includedir=DIR C header files [PREFIX/include] - --oldincludedir=DIR C header files for non-gcc [/usr/include] - --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] - --datadir=DIR read-only architecture-independent data [DATAROOTDIR] - --infodir=DIR info documentation [DATAROOTDIR/info] - --localedir=DIR locale-dependent data [DATAROOTDIR/locale] - --mandir=DIR man documentation [DATAROOTDIR/man] - --docdir=DIR documentation root [DATAROOTDIR/doc/threadpool] - --htmldir=DIR html documentation [DOCDIR] - --dvidir=DIR dvi documentation [DOCDIR] - --pdfdir=DIR pdf documentation [DOCDIR] - --psdir=DIR ps documentation [DOCDIR] -_ACEOF - - cat <<\_ACEOF - -Program names: - --program-prefix=PREFIX prepend PREFIX to installed program names - --program-suffix=SUFFIX append SUFFIX to installed program names - --program-transform-name=PROGRAM run sed PROGRAM on installed program names - -System types: - --build=BUILD configure for building on BUILD [guessed] - --host=HOST cross-compile to build programs to run on HOST [BUILD] - --target=TARGET configure for building compilers for TARGET [HOST] -_ACEOF -fi - -if test -n "$ac_init_help"; then - case $ac_init_help in - short | recursive ) echo "Configuration of ThreadPool 1.1d:";; - esac - cat <<\_ACEOF - -Optional Features: - --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) - --enable-FEATURE[=ARG] include FEATURE [ARG=yes] - --enable-maintainer-mode enable make rules and dependencies not useful - (and sometimes confusing) to the casual installer - --enable-mpi MPI support - --disable-dependency-tracking speeds up one-time build - --enable-dependency-tracking do not reject slow dependency extractors - --enable-export-makefiles - Creates export makefiles in the install (prefix) - directory. 
This option requires perl to be set in - your path or defined with --with-perl=. Note that the export makefiles are - always created and used in the build directory, but - will not be installable without this option to - change the paths. (default is yes) - --enable-tests Make tests for all Trilinos packages buildable with - 'make tests' (default is yes) - - --enable-threadpool-tests - Make ThreadPool tests buildable with 'make tests' - (default is yes if --disable-tests is not specified) - --enable-libcheck Check for some third-party libraries. (Cannot be - disabled unless tests and examples are also - disabled.) (default is yes) - -Optional Packages: - --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] - --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) - --with-install=INSTALL_PROGRAM - Use the installation program INSTALL_PROGRAM rather - the default that is provided. For example - --with-install="/path/install -p" - --with-mpi-compilers=PATH - use MPI compilers mpicc, mpif77, and mpicxx, mpic++ - or mpiCC in the specified path or in the default - path if no path is specified. Enables MPI - --with-mpi=MPIROOT use MPI root directory (enables MPI) - --with-mpi-libs="LIBS" MPI libraries ["-lmpi"] - --with-mpi-incdir=DIR MPI include directory [MPIROOT/include] Do not use - -I - --with-mpi-libdir=DIR MPI library directory [MPIROOT/lib] Do not use -L - --with-ccflags additional CCFLAGS flags to be added: will prepend - to CCFLAGS - --with-cxxflags additional CXXFLAGS flags to be added: will - prepend to CXXFLAGS - --with-cflags additional CFLAGS flags to be added: will prepend - to CFLAGS - --with-libs List additional libraries here. For example, - --with-libs=-lsuperlu or - --with-libs=/path/libsuperlu.a - --with-ldflags additional LDFLAGS flags to be added: will prepend - to LDFLAGS - --with-ar override archiver command (default is "ar cru") - --with-perl supply a perl executable. For example - --with-perl=/usr/bin/perl. - --with-gnumake Gnu's make has special functions we can use to - eliminate redundant paths in the build and link - lines. Enable this if you use gnu-make to build - Trilinos. This requires that perl is in your path or - that you have specified the perl executable with - --with-perl=. Configure will check - for the existence of the perl executable and quit - with an error if it is not found. (default is no) - --with-libdirs OBSOLETE use --with-ldflags instead. (ex. - --with-ldflags="-L -L") - --with-incdirs additional directories containing include files: - will prepend to search here for includes, use -Idir - format - -Some influential environment variables: - CC C compiler command - CFLAGS C compiler flags - LDFLAGS linker flags, e.g. -L if you have libraries in a - nonstandard directory - LIBS libraries to pass to the linker, e.g. -l - CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I if - you have headers in a nonstandard directory - CXX C++ compiler command - CXXFLAGS C++ compiler flags - CXXCPP C++ preprocessor - -Use these variables to override the choices made by `configure' or to help -it to find libraries and programs with nonstandard names/locations. - -Report bugs to . -_ACEOF -ac_status=$? -fi - -if test "$ac_init_help" = "recursive"; then - # If there are subdirs, report their specific --help. - for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue - test -d "$ac_dir" || continue - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. 
ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` - # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,/..,g;s,/,,'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - cd "$ac_dir" || { ac_status=$?; continue; } - # Check for guested configure. - if test -f "$ac_srcdir/configure.gnu"; then - echo && - $SHELL "$ac_srcdir/configure.gnu" --help=recursive - elif test -f "$ac_srcdir/configure"; then - echo && - $SHELL "$ac_srcdir/configure" --help=recursive - else - echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 - fi || ac_status=$? - cd "$ac_pwd" || { ac_status=$?; break; } - done -fi - -test -n "$ac_init_help" && exit $ac_status -if $ac_init_version; then - cat <<\_ACEOF -ThreadPool configure 1.1d -generated by GNU Autoconf 2.61 - -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -This configure script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it. -_ACEOF - exit -fi -cat >config.log <<_ACEOF -This file contains any messages produced by compilers while -running configure, to aid debugging if configure makes a mistake. - -It was created by ThreadPool $as_me 1.1d, which was -generated by GNU Autoconf 2.61. Invocation command line was - - $ $0 $@ - -_ACEOF -exec 5>>config.log -{ -cat <<_ASUNAME -## --------- ## -## Platform. ## -## --------- ## - -hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` - -/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` -/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` -/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` -/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` - -_ASUNAME - -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - echo "PATH: $as_dir" -done -IFS=$as_save_IFS - -} >&5 - -cat >&5 <<_ACEOF - - -## ----------- ## -## Core tests. ## -## ----------- ## - -_ACEOF - - -# Keep a trace of the command line. -# Strip out --no-create and --no-recursion so they do not pile up. 
-# Strip out --silent because we don't want to record it for future runs. -# Also quote any args containing shell meta-characters. -# Make two passes to allow for proper duplicate-argument suppression. -ac_configure_args= -ac_configure_args0= -ac_configure_args1= -ac_must_keep_next=false -for ac_pass in 1 2 -do - for ac_arg - do - case $ac_arg in - -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - continue ;; - *\'*) - ac_arg=`echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; - esac - case $ac_pass in - 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; - 2) - ac_configure_args1="$ac_configure_args1 '$ac_arg'" - if test $ac_must_keep_next = true; then - ac_must_keep_next=false # Got value, back to normal. - else - case $ac_arg in - *=* | --config-cache | -C | -disable-* | --disable-* \ - | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ - | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ - | -with-* | --with-* | -without-* | --without-* | --x) - case "$ac_configure_args0 " in - "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; - esac - ;; - -* ) ac_must_keep_next=true ;; - esac - fi - ac_configure_args="$ac_configure_args '$ac_arg'" - ;; - esac - done -done -$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } -$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } - -# When interrupted or exit'd, cleanup temporary files, and complete -# config.log. We remove comments because anyway the quotes in there -# would cause problems or look ugly. -# WARNING: Use '\'' to represent an apostrophe within the trap. -# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. -trap 'exit_status=$? - # Save into config.log some information that might help in debugging. - { - echo - - cat <<\_ASBOX -## ---------------- ## -## Cache variables. ## -## ---------------- ## -_ASBOX - echo - # The following way of writing the cache mishandles newlines in values, -( - for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { echo "$as_me:$LINENO: WARNING: Cache variable $ac_var contains a newline." >&5 -echo "$as_me: WARNING: Cache variable $ac_var contains a newline." >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - *) $as_unset $ac_var ;; - esac ;; - esac - done - (set) 2>&1 | - case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - sed -n \ - "s/'\''/'\''\\\\'\'''\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" - ;; #( - *) - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) - echo - - cat <<\_ASBOX -## ----------------- ## -## Output variables. ## -## ----------------- ## -_ASBOX - echo - for ac_var in $ac_subst_vars - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - echo "$ac_var='\''$ac_val'\''" - done | sort - echo - - if test -n "$ac_subst_files"; then - cat <<\_ASBOX -## ------------------- ## -## File substitutions. 
## -## ------------------- ## -_ASBOX - echo - for ac_var in $ac_subst_files - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - echo "$ac_var='\''$ac_val'\''" - done | sort - echo - fi - - if test -s confdefs.h; then - cat <<\_ASBOX -## ----------- ## -## confdefs.h. ## -## ----------- ## -_ASBOX - echo - cat confdefs.h - echo - fi - test "$ac_signal" != 0 && - echo "$as_me: caught signal $ac_signal" - echo "$as_me: exit $exit_status" - } >&5 - rm -f core *.core core.conftest.* && - rm -f -r conftest* confdefs* conf$$* $ac_clean_files && - exit $exit_status -' 0 -for ac_signal in 1 2 13 15; do - trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal -done -ac_signal=0 - -# confdefs.h avoids OS command line length limits that DEFS can exceed. -rm -f -r conftest* confdefs.h - -# Predefined preprocessor variables. - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_NAME "$PACKAGE_NAME" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_TARNAME "$PACKAGE_TARNAME" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_VERSION "$PACKAGE_VERSION" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_STRING "$PACKAGE_STRING" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF - - -# Let the site file select an alternate cache file if it wants to. -# Prefer explicitly selected file to automatically selected ones. -if test -n "$CONFIG_SITE"; then - set x "$CONFIG_SITE" -elif test "x$prefix" != xNONE; then - set x "$prefix/share/config.site" "$prefix/etc/config.site" -else - set x "$ac_default_prefix/share/config.site" \ - "$ac_default_prefix/etc/config.site" -fi -shift -for ac_site_file -do - if test -r "$ac_site_file"; then - { echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 -echo "$as_me: loading site script $ac_site_file" >&6;} - sed 's/^/| /' "$ac_site_file" >&5 - . "$ac_site_file" - fi -done - -if test -r "$cache_file"; then - # Some versions of bash will fail to source /dev/null (special - # files actually), so we avoid doing that. - if test -f "$cache_file"; then - { echo "$as_me:$LINENO: loading cache $cache_file" >&5 -echo "$as_me: loading cache $cache_file" >&6;} - case $cache_file in - [\\/]* | ?:[\\/]* ) . "$cache_file";; - *) . "./$cache_file";; - esac - fi -else - { echo "$as_me:$LINENO: creating cache $cache_file" >&5 -echo "$as_me: creating cache $cache_file" >&6;} - >$cache_file -fi - -# Check that the precious variables saved in the cache have kept the same -# value. 
-ac_cache_corrupted=false -for ac_var in $ac_precious_vars; do - eval ac_old_set=\$ac_cv_env_${ac_var}_set - eval ac_new_set=\$ac_env_${ac_var}_set - eval ac_old_val=\$ac_cv_env_${ac_var}_value - eval ac_new_val=\$ac_env_${ac_var}_value - case $ac_old_set,$ac_new_set in - set,) - { echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,set) - { echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 -echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,);; - *) - if test "x$ac_old_val" != "x$ac_new_val"; then - { echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 -echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} - { echo "$as_me:$LINENO: former value: $ac_old_val" >&5 -echo "$as_me: former value: $ac_old_val" >&2;} - { echo "$as_me:$LINENO: current value: $ac_new_val" >&5 -echo "$as_me: current value: $ac_new_val" >&2;} - ac_cache_corrupted=: - fi;; - esac - # Pass precious variables to config.status. - if test "$ac_new_set" = set; then - case $ac_new_val in - *\'*) ac_arg=$ac_var=`echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; - *) ac_arg=$ac_var=$ac_new_val ;; - esac - case " $ac_configure_args " in - *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. - *) ac_configure_args="$ac_configure_args '$ac_arg'" ;; - esac - fi -done -if $ac_cache_corrupted; then - { echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 -echo "$as_me: error: changes in the environment can compromise the build" >&2;} - { { echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 -echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} - { (exit 1); exit 1; }; } -fi - - - - - - - - - - - - - - - - - - - - - - - - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - -# Hello World! -echo "----------------------------------------" -echo "Running ThreadPool Configure Script" -echo "----------------------------------------" - -# This is to protect against accidentally specifying the wrong -# directory with --srcdir. Any file in that directory will do, -# preferably one that is unlikely to be removed or renamed. - - - -# Specify directory for auxillary build tools (e.g., install-sh, -# config.sub, config.guess) and M4 files. - -ac_aux_dir= -for ac_dir in config "$srcdir"/config; do - if test -f "$ac_dir/install-sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install-sh -c" - break - elif test -f "$ac_dir/install.sh"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/install.sh -c" - break - elif test -f "$ac_dir/shtool"; then - ac_aux_dir=$ac_dir - ac_install_sh="$ac_aux_dir/shtool install -c" - break - fi -done -if test -z "$ac_aux_dir"; then - { { echo "$as_me:$LINENO: error: cannot find install-sh or install.sh in config \"$srcdir\"/config" >&5 -echo "$as_me: error: cannot find install-sh or install.sh in config \"$srcdir\"/config" >&2;} - { (exit 1); exit 1; }; } -fi - -# These three variables are undocumented and unsupported, -# and are intended to be withdrawn in a future Autoconf release. 
-# They can cause serious problems if a builder's source tree is in a directory -# whose full name contains unusual characters. -ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. -ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. -ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. - - -# #auto np# - Change file names in next line -# Configure should create src/ThreadPool_config.h from src/ThreadPool_config.h.in - -ac_config_headers="$ac_config_headers src/ThreadPool_config.h:src/ThreadPool_config.h.in" - - -# Allow users to specify their own "install" command. If none is specified, -# the default is install-sh found in the config subdirectory. - - -# Check whether --with-install was given. -if test "${with_install+set}" = set; then - withval=$with_install; - INSTALL=$withval - INSTALL_PROGRAM=$withval - INSTALL_SCRIPT=$withval - INSTALL_DATA="$withval -m 644" - -fi - - -# AM_MAINTAINER_MODE turns off maintainer-only makefile targets by -# default, and changes configure to understand a -# --enable-maintainer-mode option. --enable-maintainer-mode turns the -# maintainer-only targets back on. The maintainer-only makefile -# targets permit end users to clean automatically-generated files such -# as configure, which means they have to have autoconf and automake -# installed to repair the damage. AM_MAINTAINER_MODE makes it a bit -# harder for users to shoot themselves in the foot. - -{ echo "$as_me:$LINENO: checking whether to enable maintainer-specific portions of Makefiles" >&5 -echo $ECHO_N "checking whether to enable maintainer-specific portions of Makefiles... $ECHO_C" >&6; } - # Check whether --enable-maintainer-mode was given. -if test "${enable_maintainer_mode+set}" = set; then - enableval=$enable_maintainer_mode; USE_MAINTAINER_MODE=$enableval -else - USE_MAINTAINER_MODE=no -fi - - { echo "$as_me:$LINENO: result: $USE_MAINTAINER_MODE" >&5 -echo "${ECHO_T}$USE_MAINTAINER_MODE" >&6; } - if test $USE_MAINTAINER_MODE = yes; then - MAINTAINER_MODE_TRUE= - MAINTAINER_MODE_FALSE='#' -else - MAINTAINER_MODE_TRUE='#' - MAINTAINER_MODE_FALSE= -fi - - MAINT=$MAINTAINER_MODE_TRUE - - - -# Define $build, $host, $target, etc - -# Make sure we can run config.sub. -$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || - { { echo "$as_me:$LINENO: error: cannot run $SHELL $ac_aux_dir/config.sub" >&5 -echo "$as_me: error: cannot run $SHELL $ac_aux_dir/config.sub" >&2;} - { (exit 1); exit 1; }; } - -{ echo "$as_me:$LINENO: checking build system type" >&5 -echo $ECHO_N "checking build system type... 
$ECHO_C" >&6; } -if test "${ac_cv_build+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_build_alias=$build_alias -test "x$ac_build_alias" = x && - ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` -test "x$ac_build_alias" = x && - { { echo "$as_me:$LINENO: error: cannot guess build type; you must specify one" >&5 -echo "$as_me: error: cannot guess build type; you must specify one" >&2;} - { (exit 1); exit 1; }; } -ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || - { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $ac_build_alias failed" >&5 -echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $ac_build_alias failed" >&2;} - { (exit 1); exit 1; }; } - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_build" >&5 -echo "${ECHO_T}$ac_cv_build" >&6; } -case $ac_cv_build in -*-*-*) ;; -*) { { echo "$as_me:$LINENO: error: invalid value of canonical build" >&5 -echo "$as_me: error: invalid value of canonical build" >&2;} - { (exit 1); exit 1; }; };; -esac -build=$ac_cv_build -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_build -shift -build_cpu=$1 -build_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -build_os=$* -IFS=$ac_save_IFS -case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac - - -{ echo "$as_me:$LINENO: checking host system type" >&5 -echo $ECHO_N "checking host system type... $ECHO_C" >&6; } -if test "${ac_cv_host+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test "x$host_alias" = x; then - ac_cv_host=$ac_cv_build -else - ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || - { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $host_alias failed" >&5 -echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $host_alias failed" >&2;} - { (exit 1); exit 1; }; } -fi - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_host" >&5 -echo "${ECHO_T}$ac_cv_host" >&6; } -case $ac_cv_host in -*-*-*) ;; -*) { { echo "$as_me:$LINENO: error: invalid value of canonical host" >&5 -echo "$as_me: error: invalid value of canonical host" >&2;} - { (exit 1); exit 1; }; };; -esac -host=$ac_cv_host -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_host -shift -host_cpu=$1 -host_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -host_os=$* -IFS=$ac_save_IFS -case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac - - -{ echo "$as_me:$LINENO: checking target system type" >&5 -echo $ECHO_N "checking target system type... 
$ECHO_C" >&6; } -if test "${ac_cv_target+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test "x$target_alias" = x; then - ac_cv_target=$ac_cv_host -else - ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` || - { { echo "$as_me:$LINENO: error: $SHELL $ac_aux_dir/config.sub $target_alias failed" >&5 -echo "$as_me: error: $SHELL $ac_aux_dir/config.sub $target_alias failed" >&2;} - { (exit 1); exit 1; }; } -fi - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_target" >&5 -echo "${ECHO_T}$ac_cv_target" >&6; } -case $ac_cv_target in -*-*-*) ;; -*) { { echo "$as_me:$LINENO: error: invalid value of canonical target" >&5 -echo "$as_me: error: invalid value of canonical target" >&2;} - { (exit 1); exit 1; }; };; -esac -target=$ac_cv_target -ac_save_IFS=$IFS; IFS='-' -set x $ac_cv_target -shift -target_cpu=$1 -target_vendor=$2 -shift; shift -# Remember, the first character of IFS is used to create $*, -# except with old shells: -target_os=$* -IFS=$ac_save_IFS -case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac - - -# The aliases save the names the user supplied, while $host etc. -# will get canonicalized. -test -n "$target_alias" && - test "$program_prefix$program_suffix$program_transform_name" = \ - NONENONEs,x,x, && - program_prefix=${target_alias}- - -# Use automake - -# - Required version of automake. -am__api_version='1.10' - -# Find a good install program. We prefer a C program (faster), -# so one script is as good as another. But avoid the broken or -# incompatible versions: -# SysV /etc/install, /usr/sbin/install -# SunOS /usr/etc/install -# IRIX /sbin/install -# AIX /bin/install -# AmigaOS /C/install, which installs bootblocks on floppy discs -# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag -# AFS /usr/afsws/bin/install, which mishandles nonexistent args -# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff" -# OS/2's system install, which has a completely different semantic -# ./install, which can be erroneously created by make from ./install.sh. -{ echo "$as_me:$LINENO: checking for a BSD-compatible install" >&5 -echo $ECHO_N "checking for a BSD-compatible install... $ECHO_C" >&6; } -if test -z "$INSTALL"; then -if test "${ac_cv_path_install+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - # Account for people who put trailing slashes in PATH elements. -case $as_dir/ in - ./ | .// | /cC/* | \ - /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \ - ?:\\/os2\\/install\\/* | ?:\\/OS2\\/INSTALL\\/* | \ - /usr/ucb/* ) ;; - *) - # OSF1 and SCO ODT 3.0 have their own names for install. - # Don't use installbsd from OSF since it installs stuff as root - # by default. - for ac_prog in ginstall scoinst install; do - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; }; then - if test $ac_prog = install && - grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then - # AIX install. It has an incompatible calling convention. - : - elif test $ac_prog = install && - grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then - # program-specific install script used by HP pwplus--don't use. 
- : - else - ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c" - break 3 - fi - fi - done - done - ;; -esac -done -IFS=$as_save_IFS - - -fi - if test "${ac_cv_path_install+set}" = set; then - INSTALL=$ac_cv_path_install - else - # As a last resort, use the slow shell script. Don't cache a - # value for INSTALL within a source directory, because that will - # break other packages using the cache if that directory is - # removed, or if the value is a relative name. - INSTALL=$ac_install_sh - fi -fi -{ echo "$as_me:$LINENO: result: $INSTALL" >&5 -echo "${ECHO_T}$INSTALL" >&6; } - -# Use test -z because SunOS4 sh mishandles braces in ${var-val}. -# It thinks the first close brace ends the variable substitution. -test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}' - -test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}' - -test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644' - -{ echo "$as_me:$LINENO: checking whether build environment is sane" >&5 -echo $ECHO_N "checking whether build environment is sane... $ECHO_C" >&6; } -# Just in case -sleep 1 -echo timestamp > conftest.file -# Do `set' in a subshell so we don't clobber the current shell's -# arguments. Must try -L first in case configure is actually a -# symlink; some systems play weird games with the mod time of symlinks -# (eg FreeBSD returns the mod time of the symlink's containing -# directory). -if ( - set X `ls -Lt $srcdir/configure conftest.file 2> /dev/null` - if test "$*" = "X"; then - # -L didn't work. - set X `ls -t $srcdir/configure conftest.file` - fi - rm -f conftest.file - if test "$*" != "X $srcdir/configure conftest.file" \ - && test "$*" != "X conftest.file $srcdir/configure"; then - - # If neither matched, then we have a broken ls. This can happen - # if, for instance, CONFIG_SHELL is bash and it inherits a - # broken ls alias from the environment. This has actually - # happened. Such a system could not be considered "sane". - { { echo "$as_me:$LINENO: error: ls -t appears to fail. Make sure there is not a broken -alias in your environment" >&5 -echo "$as_me: error: ls -t appears to fail. Make sure there is not a broken -alias in your environment" >&2;} - { (exit 1); exit 1; }; } - fi - - test "$2" = conftest.file - ) -then - # Ok. - : -else - { { echo "$as_me:$LINENO: error: newly created file is older than distributed files! -Check your system clock" >&5 -echo "$as_me: error: newly created file is older than distributed files! -Check your system clock" >&2;} - { (exit 1); exit 1; }; } -fi -{ echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } -test "$program_prefix" != NONE && - program_transform_name="s&^&$program_prefix&;$program_transform_name" -# Use a double $ so make ignores it. -test "$program_suffix" != NONE && - program_transform_name="s&\$&$program_suffix&;$program_transform_name" -# Double any \ or $. echo might interpret backslashes. -# By default was `s,x,x', remove it if useless. 
-cat <<\_ACEOF >conftest.sed -s/[\\$]/&&/g;s/;s,x,x,$// -_ACEOF -program_transform_name=`echo $program_transform_name | sed -f conftest.sed` -rm -f conftest.sed - -# expand $ac_aux_dir to an absolute path -am_aux_dir=`cd $ac_aux_dir && pwd` - -test x"${MISSING+set}" = xset || MISSING="\${SHELL} $am_aux_dir/missing" -# Use eval to expand $SHELL -if eval "$MISSING --run true"; then - am_missing_run="$MISSING --run " -else - am_missing_run= - { echo "$as_me:$LINENO: WARNING: \`missing' script is too old or missing" >&5 -echo "$as_me: WARNING: \`missing' script is too old or missing" >&2;} -fi - -{ echo "$as_me:$LINENO: checking for a thread-safe mkdir -p" >&5 -echo $ECHO_N "checking for a thread-safe mkdir -p... $ECHO_C" >&6; } -if test -z "$MKDIR_P"; then - if test "${ac_cv_path_mkdir+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/opt/sfw/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in mkdir gmkdir; do - for ac_exec_ext in '' $ac_executable_extensions; do - { test -f "$as_dir/$ac_prog$ac_exec_ext" && $as_test_x "$as_dir/$ac_prog$ac_exec_ext"; } || continue - case `"$as_dir/$ac_prog$ac_exec_ext" --version 2>&1` in #( - 'mkdir (GNU coreutils) '* | \ - 'mkdir (coreutils) '* | \ - 'mkdir (fileutils) '4.1*) - ac_cv_path_mkdir=$as_dir/$ac_prog$ac_exec_ext - break 3;; - esac - done - done -done -IFS=$as_save_IFS - -fi - - if test "${ac_cv_path_mkdir+set}" = set; then - MKDIR_P="$ac_cv_path_mkdir -p" - else - # As a last resort, use the slow shell script. Don't cache a - # value for MKDIR_P within a source directory, because that will - # break other packages using the cache if that directory is - # removed, or if the value is a relative name. - test -d ./--version && rmdir ./--version - MKDIR_P="$ac_install_sh -d" - fi -fi -{ echo "$as_me:$LINENO: result: $MKDIR_P" >&5 -echo "${ECHO_T}$MKDIR_P" >&6; } - -mkdir_p="$MKDIR_P" -case $mkdir_p in - [\\/$]* | ?:[\\/]*) ;; - */*) mkdir_p="\$(top_builddir)/$mkdir_p" ;; -esac - -for ac_prog in gawk mawk nawk awk -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_AWK+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$AWK"; then - ac_cv_prog_AWK="$AWK" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_AWK="$ac_prog" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -AWK=$ac_cv_prog_AWK -if test -n "$AWK"; then - { echo "$as_me:$LINENO: result: $AWK" >&5 -echo "${ECHO_T}$AWK" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - test -n "$AWK" && break -done - -{ echo "$as_me:$LINENO: checking whether ${MAKE-make} sets \$(MAKE)" >&5 -echo $ECHO_N "checking whether ${MAKE-make} sets \$(MAKE)... 
$ECHO_C" >&6; } -set x ${MAKE-make}; ac_make=`echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` -if { as_var=ac_cv_prog_make_${ac_make}_set; eval "test \"\${$as_var+set}\" = set"; }; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.make <<\_ACEOF -SHELL = /bin/sh -all: - @echo '@@@%%%=$(MAKE)=@@@%%%' -_ACEOF -# GNU make sometimes prints "make[1]: Entering...", which would confuse us. -case `${MAKE-make} -f conftest.make 2>/dev/null` in - *@@@%%%=?*=@@@%%%*) - eval ac_cv_prog_make_${ac_make}_set=yes;; - *) - eval ac_cv_prog_make_${ac_make}_set=no;; -esac -rm -f conftest.make -fi -if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - SET_MAKE= -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - SET_MAKE="MAKE=${MAKE-make}" -fi - -rm -rf .tst 2>/dev/null -mkdir .tst 2>/dev/null -if test -d .tst; then - am__leading_dot=. -else - am__leading_dot=_ -fi -rmdir .tst 2>/dev/null - -if test "`cd $srcdir && pwd`" != "`pwd`"; then - # Use -I$(srcdir) only when $(srcdir) != ., so that make's output - # is not polluted with repeated "-I." - am__isrc=' -I$(srcdir)' - # test to see if srcdir already configured - if test -f $srcdir/config.status; then - { { echo "$as_me:$LINENO: error: source directory already configured; run \"make distclean\" there first" >&5 -echo "$as_me: error: source directory already configured; run \"make distclean\" there first" >&2;} - { (exit 1); exit 1; }; } - fi -fi - -# test whether we have cygpath -if test -z "$CYGPATH_W"; then - if (cygpath --version) >/dev/null 2>/dev/null; then - CYGPATH_W='cygpath -w' - else - CYGPATH_W=echo - fi -fi - - -# Define the identity of the package. - PACKAGE='threadpool' - VERSION='1.1d' - - -# Some tools Automake needs. - -ACLOCAL=${ACLOCAL-"${am_missing_run}aclocal-${am__api_version}"} - - -AUTOCONF=${AUTOCONF-"${am_missing_run}autoconf"} - - -AUTOMAKE=${AUTOMAKE-"${am_missing_run}automake-${am__api_version}"} - - -AUTOHEADER=${AUTOHEADER-"${am_missing_run}autoheader"} - - -MAKEINFO=${MAKEINFO-"${am_missing_run}makeinfo"} - -install_sh=${install_sh-"\$(SHELL) $am_aux_dir/install-sh"} - -# Installed binaries are usually stripped using `strip' when the user -# run `make install-strip'. However `strip' might not be the right -# tool to use in cross-compilation environments, therefore Automake -# will honor the `STRIP' environment variable to overrule this program. -if test "$cross_compiling" != no; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}strip", so it can be a program name with args. -set dummy ${ac_tool_prefix}strip; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_STRIP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$STRIP"; then - ac_cv_prog_STRIP="$STRIP" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_STRIP="${ac_tool_prefix}strip" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -STRIP=$ac_cv_prog_STRIP -if test -n "$STRIP"; then - { echo "$as_me:$LINENO: result: $STRIP" >&5 -echo "${ECHO_T}$STRIP" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_STRIP"; then - ac_ct_STRIP=$STRIP - # Extract the first word of "strip", so it can be a program name with args. -set dummy strip; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_ac_ct_STRIP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$ac_ct_STRIP"; then - ac_cv_prog_ac_ct_STRIP="$ac_ct_STRIP" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_STRIP="strip" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_STRIP=$ac_cv_prog_ac_ct_STRIP -if test -n "$ac_ct_STRIP"; then - { echo "$as_me:$LINENO: result: $ac_ct_STRIP" >&5 -echo "${ECHO_T}$ac_ct_STRIP" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - if test "x$ac_ct_STRIP" = x; then - STRIP=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&5 -echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&2;} -ac_tool_warned=yes ;; -esac - STRIP=$ac_ct_STRIP - fi -else - STRIP="$ac_cv_prog_STRIP" -fi - -fi -INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s" - -# We need awk for the "check" target. The system "awk" is bad on -# some platforms. -# Always define AMTAR for backward compatibility. - -AMTAR=${AMTAR-"${am_missing_run}tar"} - - -{ echo "$as_me:$LINENO: checking how to create a ustar tar archive" >&5 -echo $ECHO_N "checking how to create a ustar tar archive... $ECHO_C" >&6; } -# Loop over all known methods to create a tar archive until one works. -_am_tools='gnutar plaintar pax cpio none' -_am_tools=${am_cv_prog_tar_ustar-$_am_tools} -# Do not fold the above two line into one, because Tru64 sh and -# Solaris sh will not grok spaces in the rhs of `-'. -for _am_tool in $_am_tools -do - case $_am_tool in - gnutar) - for _am_tar in tar gnutar gtar; - do - { echo "$as_me:$LINENO: $_am_tar --version" >&5 - ($_am_tar --version) >&5 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && break - done - am__tar="$_am_tar --format=ustar -chf - "'"$$tardir"' - am__tar_="$_am_tar --format=ustar -chf - "'"$tardir"' - am__untar="$_am_tar -xf -" - ;; - plaintar) - # Must skip GNU tar: if it does not support --format= it doesn't create - # ustar tarball either. 
- (tar --version) >/dev/null 2>&1 && continue - am__tar='tar chf - "$$tardir"' - am__tar_='tar chf - "$tardir"' - am__untar='tar xf -' - ;; - pax) - am__tar='pax -L -x ustar -w "$$tardir"' - am__tar_='pax -L -x ustar -w "$tardir"' - am__untar='pax -r' - ;; - cpio) - am__tar='find "$$tardir" -print | cpio -o -H ustar -L' - am__tar_='find "$tardir" -print | cpio -o -H ustar -L' - am__untar='cpio -i -H ustar -d' - ;; - none) - am__tar=false - am__tar_=false - am__untar=false - ;; - esac - - # If the value was cached, stop now. We just wanted to have am__tar - # and am__untar set. - test -n "${am_cv_prog_tar_ustar}" && break - - # tar/untar a dummy directory, and stop if the command works - rm -rf conftest.dir - mkdir conftest.dir - echo GrepMe > conftest.dir/file - { echo "$as_me:$LINENO: tardir=conftest.dir && eval $am__tar_ >conftest.tar" >&5 - (tardir=conftest.dir && eval $am__tar_ >conftest.tar) >&5 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } - rm -rf conftest.dir - if test -s conftest.tar; then - { echo "$as_me:$LINENO: $am__untar &5 - ($am__untar &5 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } - grep GrepMe conftest.dir/file >/dev/null 2>&1 && break - fi -done -rm -rf conftest.dir - -if test "${am_cv_prog_tar_ustar+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - am_cv_prog_tar_ustar=$_am_tool -fi - -{ echo "$as_me:$LINENO: result: $am_cv_prog_tar_ustar" >&5 -echo "${ECHO_T}$am_cv_prog_tar_ustar" >&6; } - - - - - - -# Specify required version of autoconf. - - - -# ------------------------------------------------------------------------ -# Check to see if MPI enabled and if any special configuration done -# ------------------------------------------------------------------------ - - - -# Check whether --enable-mpi was given. -if test "${enable_mpi+set}" = set; then - enableval=$enable_mpi; HAVE_PKG_MPI=$enableval -else - HAVE_PKG_MPI=no - -fi - - - -# Check whether --with-mpi-compilers was given. -if test "${with_mpi_compilers+set}" = set; then - withval=$with_mpi_compilers; - if test X${withval} != Xno; then - HAVE_PKG_MPI=yes - if test X${withval} = Xyes; then - # Check for mpicxx, if it does not exist, check for mpic++, if it does - # not exist, use mpiCC instead. - # Extract the first word of "mpicxx", so it can be a program name with args. -set dummy mpicxx; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_MPI_TEMP_CXX+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$MPI_TEMP_CXX"; then - ac_cv_prog_MPI_TEMP_CXX="$MPI_TEMP_CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_MPI_TEMP_CXX="mpicxx" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_MPI_TEMP_CXX" && ac_cv_prog_MPI_TEMP_CXX="no" -fi -fi -MPI_TEMP_CXX=$ac_cv_prog_MPI_TEMP_CXX -if test -n "$MPI_TEMP_CXX"; then - { echo "$as_me:$LINENO: result: $MPI_TEMP_CXX" >&5 -echo "${ECHO_T}$MPI_TEMP_CXX" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - if test X${MPI_TEMP_CXX} = Xno; then - # Extract the first word of "mpic++", so it can be a program name with args. -set dummy mpic++; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_MPI_CXX+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$MPI_CXX"; then - ac_cv_prog_MPI_CXX="$MPI_CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_MPI_CXX="mpic++" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_MPI_CXX" && ac_cv_prog_MPI_CXX="mpiCC" -fi -fi -MPI_CXX=$ac_cv_prog_MPI_CXX -if test -n "$MPI_CXX"; then - { echo "$as_me:$LINENO: result: $MPI_CXX" >&5 -echo "${ECHO_T}$MPI_CXX" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - else - MPI_CXX=${MPI_TEMP_CXX} - fi - MPI_CC=mpicc - MPI_F77=mpif77 - else - if test -f ${withval}/mpicxx; then - MPI_CXX=${withval}/mpicxx - elif test -f ${withval}/mpic++; then - MPI_CXX=${withval}/mpic++ - else - MPI_CXX=${withval}/mpiCC - fi - MPI_CC=${withval}/mpicc - MPI_F77=${withval}/mpif77 - fi - fi - - -fi - - - -# Check whether --with-mpi was given. -if test "${with_mpi+set}" = set; then - withval=$with_mpi; - HAVE_PKG_MPI=yes - MPI_DIR=${withval} - { echo "$as_me:$LINENO: checking MPI directory" >&5 -echo $ECHO_N "checking MPI directory... $ECHO_C" >&6; } - { echo "$as_me:$LINENO: result: ${MPI_DIR}" >&5 -echo "${ECHO_T}${MPI_DIR}" >&6; } - - -fi - - -#AC_ARG_WITH(mpi-include, -#[AC_HELP_STRING([--with-mpi-include],[Obsolete. Use --with-mpi-incdir=DIR instead. Do not prefix DIR with '-I'.])], -#[AC_MSG_ERROR([--with-mpi-include is an obsolte option. Use --with-mpi-incdir=DIR instead. Do not prefix DIR with '-I'. For example '--with-mpi-incdir=/usr/lam_path/include'.])] -#) - - -# Check whether --with-mpi-libs was given. -if test "${with_mpi_libs+set}" = set; then - withval=$with_mpi_libs; - MPI_LIBS=${withval} - { echo "$as_me:$LINENO: checking user-defined MPI libraries" >&5 -echo $ECHO_N "checking user-defined MPI libraries... $ECHO_C" >&6; } - { echo "$as_me:$LINENO: result: ${MPI_LIBS}" >&5 -echo "${ECHO_T}${MPI_LIBS}" >&6; } - - -fi - - - -# Check whether --with-mpi-incdir was given. -if test "${with_mpi_incdir+set}" = set; then - withval=$with_mpi_incdir; - MPI_INC=${withval} - { echo "$as_me:$LINENO: checking user-defined MPI includes" >&5 -echo $ECHO_N "checking user-defined MPI includes... 
$ECHO_C" >&6; } - { echo "$as_me:$LINENO: result: ${MPI_INC}" >&5 -echo "${ECHO_T}${MPI_INC}" >&6; } - - -fi - - - -# Check whether --with-mpi-libdir was given. -if test "${with_mpi_libdir+set}" = set; then - withval=$with_mpi_libdir; - MPI_LIBDIR=${withval} - { echo "$as_me:$LINENO: checking user-defined MPI library directory" >&5 -echo $ECHO_N "checking user-defined MPI library directory... $ECHO_C" >&6; } - { echo "$as_me:$LINENO: result: ${MPI_LIBDIR}" >&5 -echo "${ECHO_T}${MPI_LIBDIR}" >&6; } - - -fi - - -{ echo "$as_me:$LINENO: checking whether we are using MPI" >&5 -echo $ECHO_N "checking whether we are using MPI... $ECHO_C" >&6; } -{ echo "$as_me:$LINENO: result: ${HAVE_PKG_MPI}" >&5 -echo "${ECHO_T}${HAVE_PKG_MPI}" >&6; } - -if test "X${HAVE_PKG_MPI}" = "Xyes"; then - -cat >>confdefs.h <<\_ACEOF -#define HAVE_MPI -_ACEOF - -fi - - - if test "X${HAVE_PKG_MPI}" = "Xyes"; then - HAVE_MPI_TRUE= - HAVE_MPI_FALSE='#' -else - HAVE_MPI_TRUE='#' - HAVE_MPI_FALSE= -fi - - - - -if test -n "${MPI_CXX}"; then - if test -f ${MPI_CXX}; then - MPI_CXX_EXISTS=yes - else - # Extract the first word of "${MPI_CXX}", so it can be a program name with args. -set dummy ${MPI_CXX}; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_MPI_CXX_EXISTS+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$MPI_CXX_EXISTS"; then - ac_cv_prog_MPI_CXX_EXISTS="$MPI_CXX_EXISTS" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_MPI_CXX_EXISTS="yes" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_MPI_CXX_EXISTS" && ac_cv_prog_MPI_CXX_EXISTS="no" -fi -fi -MPI_CXX_EXISTS=$ac_cv_prog_MPI_CXX_EXISTS -if test -n "$MPI_CXX_EXISTS"; then - { echo "$as_me:$LINENO: result: $MPI_CXX_EXISTS" >&5 -echo "${ECHO_T}$MPI_CXX_EXISTS" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - fi - - if test "X${MPI_CXX_EXISTS}" = "Xyes"; then - CXX=${MPI_CXX} - else - echo "-----" - echo "Cannot find MPI C++ compiler ${MPI_CXX}." - echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a C++ compiler using CXX=" - echo "Do not use --with-mpi-compilers if using CXX=" - echo "-----" - { { echo "$as_me:$LINENO: error: MPI C++ compiler (${MPI_CXX}) not found." >&5 -echo "$as_me: error: MPI C++ compiler (${MPI_CXX}) not found." >&2;} - { (exit 1); exit 1; }; } - fi -fi - -if test -n "${MPI_CC}"; then - if test -f ${MPI_CC}; then - MPI_CC_EXISTS=yes - else - # Extract the first word of "${MPI_CC}", so it can be a program name with args. -set dummy ${MPI_CC}; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_MPI_CC_EXISTS+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$MPI_CC_EXISTS"; then - ac_cv_prog_MPI_CC_EXISTS="$MPI_CC_EXISTS" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_MPI_CC_EXISTS="yes" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_MPI_CC_EXISTS" && ac_cv_prog_MPI_CC_EXISTS="no" -fi -fi -MPI_CC_EXISTS=$ac_cv_prog_MPI_CC_EXISTS -if test -n "$MPI_CC_EXISTS"; then - { echo "$as_me:$LINENO: result: $MPI_CC_EXISTS" >&5 -echo "${ECHO_T}$MPI_CC_EXISTS" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - fi - - if test "X${MPI_CC_EXISTS}" = "Xyes"; then - CC=${MPI_CC} - else - echo "-----" - echo "Cannot find MPI C compiler ${MPI_CC}." - echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a C compiler using CC=" - echo "Do not use --with-mpi-compilers if using CC=" - echo "-----" - { { echo "$as_me:$LINENO: error: MPI C compiler (${MPI_CC}) not found." >&5 -echo "$as_me: error: MPI C compiler (${MPI_CC}) not found." >&2;} - { (exit 1); exit 1; }; } - fi -fi - -if test "X$ac_cv_use_fortran" = "Xyes"; then - -if test -n "${MPI_F77}"; then - if test -f ${MPI_F77}; then - MPI_F77_EXISTS=yes - else - # Extract the first word of "${MPI_F77}", so it can be a program name with args. -set dummy ${MPI_F77}; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_MPI_F77_EXISTS+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$MPI_F77_EXISTS"; then - ac_cv_prog_MPI_F77_EXISTS="$MPI_F77_EXISTS" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_MPI_F77_EXISTS="yes" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_MPI_F77_EXISTS" && ac_cv_prog_MPI_F77_EXISTS="no" -fi -fi -MPI_F77_EXISTS=$ac_cv_prog_MPI_F77_EXISTS -if test -n "$MPI_F77_EXISTS"; then - { echo "$as_me:$LINENO: result: $MPI_F77_EXISTS" >&5 -echo "${ECHO_T}$MPI_F77_EXISTS" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - fi - - if test "X${MPI_F77_EXISTS}" = "Xyes"; then - F77=${MPI_F77} - else - echo "-----" - echo "Cannot find MPI Fortran compiler ${MPI_F77}." - echo "Specify a path to all mpi compilers with --with-mpi-compilers=PATH" - echo "or specify a Fortran 77 compiler using F77=" - echo "Do not use --with-mpi-compilers if using F77=" - echo "-----" - { { echo "$as_me:$LINENO: error: MPI Fortran 77 compiler (${MPI_F77}) not found." >&5 -echo "$as_me: error: MPI Fortran 77 compiler (${MPI_F77}) not found." >&2;} - { (exit 1); exit 1; }; } - fi -fi - -fi - -# #np# - can eliminate compiler checks below if your package does not use the -# language corresponding to the check. Please note that if you use -# F77_FUNC to determine Fortran name mangling, you should not remove -# the Fortran compiler check or the check for Fortran flags. Doing -# so will prevent the detection of the proper name mangling in some -# cases. 
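For reference, the MPI handling above is driven entirely by the configure options it parses (--enable-mpi, --with-mpi-compilers, --with-mpi, --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir) together with the CXX/CC/F77 variables named in its error messages. A minimal sketch of how those options would typically be combined when invoking the generated script is shown below; the installation paths and library name are hypothetical placeholders, not values taken from this diff.

    # Hypothetical invocations of the generated configure script (paths are examples only).
    ./configure --enable-mpi --with-mpi-compilers=yes           # search PATH for mpicxx (else mpic++, else mpiCC), plus mpicc and mpif77
    ./configure --enable-mpi --with-mpi-compilers=/opt/mpi/bin  # take the compiler wrappers from one directory
    ./configure --enable-mpi CXX=mpicxx CC=mpicc                # name the compilers directly; do not also pass --with-mpi-compilers
    ./configure --with-mpi=/opt/mpi \
                --with-mpi-incdir=/opt/mpi/include \
                --with-mpi-libdir=/opt/mpi/lib \
                --with-mpi-libs=-lmpi                           # point at an MPI installation explicitly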
-# ------------------------------------------------------------------------ -# Checks for programs -# ------------------------------------------------------------------------ - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -if test -n "$ac_tool_prefix"; then - for ac_prog in cc gcc - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_CC+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="$ac_tool_prefix$ac_prog" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { echo "$as_me:$LINENO: result: $CC" >&5 -echo "${ECHO_T}$CC" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - test -n "$CC" && break - done -fi -if test -z "$CC"; then - ac_ct_CC=$CC - for ac_prog in cc gcc -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_ac_ct_CC+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$ac_ct_CC"; then - ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CC="$ac_prog" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_CC=$ac_cv_prog_ac_ct_CC -if test -n "$ac_ct_CC"; then - { echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 -echo "${ECHO_T}$ac_ct_CC" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - test -n "$ac_ct_CC" && break -done - - if test "x$ac_ct_CC" = x; then - CC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&5 -echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&2;} -ac_tool_warned=yes ;; -esac - CC=$ac_ct_CC - fi -fi - - -test -z "$CC" && { { echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH -See \`config.log' for more details." 
>&5 -echo "$as_me: error: no acceptable C compiler found in \$PATH -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; } - -# Provide some information about the compiler. -echo "$as_me:$LINENO: checking for C compiler version" >&5 -ac_compiler=`set X $ac_compile; echo $2` -{ (ac_try="$ac_compiler --version >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler --version >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -v >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler -v >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -V >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler -V >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } - -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files a.out a.exe b.out" -# Try to create an executable without -o first, disregard a.out. -# It will help us diagnose broken compilers, and finding out an intuition -# of exeext. -{ echo "$as_me:$LINENO: checking for C compiler default output file name" >&5 -echo $ECHO_N "checking for C compiler default output file name... $ECHO_C" >&6; } -ac_link_default=`echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` -# -# List of possible output files, starting from the most likely. -# The algorithm is not robust to junk in `.', hence go to wildcards (a.*) -# only as a last resort. b.out is created by i960 compilers. -ac_files='a_out.exe a.exe conftest.exe a.out conftest a.* conftest.* b.out' -# -# The IRIX 6 linker writes into existing files which may not be -# executable, retaining their permissions. Remove them first so a -# subsequent execution test works. -ac_rmfiles= -for ac_file in $ac_files -do - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj ) ;; - * ) ac_rmfiles="$ac_rmfiles $ac_file";; - esac -done -rm -f $ac_rmfiles - -if { (ac_try="$ac_link_default" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link_default") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. -# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' -# in a Makefile. We should not override ac_cv_exeext if it was cached, -# so that the user can short-circuit this test for compilers unknown to -# Autoconf. -for ac_file in $ac_files '' -do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj ) - ;; - [ab].out ) - # We found the default executable, but exeext='' is most - # certainly right. 
- break;; - *.* ) - if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; - then :; else - ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - fi - # We set ac_cv_exeext here because the later test for it is not - # safe: cross compilers may not add the suffix if given an `-o' - # argument, so we may need to know it at that point already. - # Even if this section looks crufty: it has the advantage of - # actually working. - break;; - * ) - break;; - esac -done -test "$ac_cv_exeext" = no && ac_cv_exeext= - -else - ac_file='' -fi - -{ echo "$as_me:$LINENO: result: $ac_file" >&5 -echo "${ECHO_T}$ac_file" >&6; } -if test -z "$ac_file"; then - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { echo "$as_me:$LINENO: error: C compiler cannot create executables -See \`config.log' for more details." >&5 -echo "$as_me: error: C compiler cannot create executables -See \`config.log' for more details." >&2;} - { (exit 77); exit 77; }; } -fi - -ac_exeext=$ac_cv_exeext - -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ echo "$as_me:$LINENO: checking whether the C compiler works" >&5 -echo $ECHO_N "checking whether the C compiler works... $ECHO_C" >&6; } -# FIXME: These cross compiler hacks should be removed for Autoconf 3.0 -# If not cross compiling, check that we can run a simple program. -if test "$cross_compiling" != yes; then - if { ac_try='./$ac_file' - { (case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; }; then - cross_compiling=no - else - if test "$cross_compiling" = maybe; then - cross_compiling=yes - else - { { echo "$as_me:$LINENO: error: cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details." >&5 -echo "$as_me: error: cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; } - fi - fi -fi -{ echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -rm -f a.out a.exe conftest$ac_cv_exeext b.out -ac_clean_files=$ac_clean_files_save -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ echo "$as_me:$LINENO: checking whether we are cross compiling" >&5 -echo $ECHO_N "checking whether we are cross compiling... $ECHO_C" >&6; } -{ echo "$as_me:$LINENO: result: $cross_compiling" >&5 -echo "${ECHO_T}$cross_compiling" >&6; } - -{ echo "$as_me:$LINENO: checking for suffix of executables" >&5 -echo $ECHO_N "checking for suffix of executables... $ECHO_C" >&6; } -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - # If both `conftest.exe' and `conftest' are `present' (well, observable) -# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will -# work properly (i.e., refer to `conftest.exe'), while it won't with -# `rm'. 
-for ac_file in conftest.exe conftest conftest.*; do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.o | *.obj ) ;; - *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - break;; - * ) break;; - esac -done -else - { { echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link -See \`config.log' for more details." >&5 -echo "$as_me: error: cannot compute suffix of executables: cannot compile and link -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; } -fi - -rm -f conftest$ac_cv_exeext -{ echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5 -echo "${ECHO_T}$ac_cv_exeext" >&6; } - -rm -f conftest.$ac_ext -EXEEXT=$ac_cv_exeext -ac_exeext=$EXEEXT -{ echo "$as_me:$LINENO: checking for suffix of object files" >&5 -echo $ECHO_N "checking for suffix of object files... $ECHO_C" >&6; } -if test "${ac_cv_objext+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.o conftest.obj -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - for ac_file in conftest.o conftest.obj conftest.*; do - test -f "$ac_file" || continue; - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf ) ;; - *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` - break;; - esac -done -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile -See \`config.log' for more details." >&5 -echo "$as_me: error: cannot compute suffix of object files: cannot compile -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; } -fi - -rm -f conftest.$ac_cv_objext conftest.$ac_ext -fi -{ echo "$as_me:$LINENO: result: $ac_cv_objext" >&5 -echo "${ECHO_T}$ac_cv_objext" >&6; } -OBJEXT=$ac_cv_objext -ac_objext=$OBJEXT -{ echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5 -echo $ECHO_N "checking whether we are using the GNU C compiler... $ECHO_C" >&6; } -if test "${ac_cv_c_compiler_gnu+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ -#ifndef __GNUC__ - choke me -#endif - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest.$ac_objext; then - ac_compiler_gnu=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_compiler_gnu=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_c_compiler_gnu=$ac_compiler_gnu - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5 -echo "${ECHO_T}$ac_cv_c_compiler_gnu" >&6; } -GCC=`test $ac_compiler_gnu = yes && echo yes` -ac_test_CFLAGS=${CFLAGS+set} -ac_save_CFLAGS=$CFLAGS -{ echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5 -echo $ECHO_N "checking whether $CC accepts -g... $ECHO_C" >&6; } -if test "${ac_cv_prog_cc_g+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_save_c_werror_flag=$ac_c_werror_flag - ac_c_werror_flag=yes - ac_cv_prog_cc_g=no - CFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_g=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - CFLAGS="" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - : -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_c_werror_flag=$ac_save_c_werror_flag - CFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_g=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_c_werror_flag=$ac_save_c_werror_flag -fi -{ echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5 -echo "${ECHO_T}$ac_cv_prog_cc_g" >&6; } -if test "$ac_test_CFLAGS" = set; then - CFLAGS=$ac_save_CFLAGS -elif test $ac_cv_prog_cc_g = yes; then - if test "$GCC" = yes; then - CFLAGS="-g -O2" - else - CFLAGS="-g" - fi -else - if test "$GCC" = yes; then - CFLAGS="-O2" - else - CFLAGS= - fi -fi -{ echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5 -echo $ECHO_N "checking for $CC option to accept ISO C89... $ECHO_C" >&6; } -if test "${ac_cv_prog_cc_c89+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_cv_prog_cc_c89=no -ac_save_CC=$CC -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -#include -#include -#include -/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ -struct buf { int x; }; -FILE * (*rcsopen) (struct buf *, struct stat *, int); -static char *e (p, i) - char **p; - int i; -{ - return p[i]; -} -static char *f (char * (*g) (char **, int), char **p, ...) -{ - char *s; - va_list v; - va_start (v,p); - s = g (p, va_arg (v,int)); - va_end (v); - return s; -} - -/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has - function prototypes and stuff, but not '\xHH' hex character constants. - These don't provoke an error unfortunately, instead are silently treated - as 'x'. The following induces an error, until -std is added to get - proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an - array size at least. It's necessary to write '\x00'==0 to get something - that's true only with -std. */ -int osf4_cc_array ['\x00' == 0 ? 1 : -1]; - -/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters - inside strings and character constants. */ -#define FOO(x) 'x' -int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; - -int test (int i, double x); -struct s1 {int (*f) (int a);}; -struct s2 {int (*f) (double a);}; -int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); -int argc; -char **argv; -int -main () -{ -return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; - ; - return 0; -} -_ACEOF -for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ - -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" -do - CC="$ac_save_CC $ac_arg" - rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_c89=$ac_arg -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c89" != "xno" && break -done -rm -f conftest.$ac_ext -CC=$ac_save_CC - -fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c89" in - x) - { echo "$as_me:$LINENO: result: none needed" >&5 -echo "${ECHO_T}none needed" >&6; } ;; - xno) - { echo "$as_me:$LINENO: result: unsupported" >&5 -echo "${ECHO_T}unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c89" - { echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5 -echo "${ECHO_T}$ac_cv_prog_cc_c89" >&6; } ;; -esac - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -DEPDIR="${am__leading_dot}deps" - -ac_config_commands="$ac_config_commands depfiles" - - -am_make=${MAKE-make} -cat > confinc << 'END' -am__doit: - @echo done -.PHONY: am__doit -END -# If we don't find an include directive, just comment out the code. -{ echo "$as_me:$LINENO: checking for style of include used by $am_make" >&5 -echo $ECHO_N "checking for style of include used by $am_make... $ECHO_C" >&6; } -am__include="#" -am__quote= -_am_result=none -# First try GNU make style include. -echo "include confinc" > confmf -# We grep out `Entering directory' and `Leaving directory' -# messages which can occur if `w' ends up in MAKEFLAGS. -# In particular we don't look at `^make:' because GNU make might -# be invoked under some other name (usually "gmake"), in which -# case it prints its new name instead of `make'. -if test "`$am_make -s -f confmf 2> /dev/null | grep -v 'ing directory'`" = "done"; then - am__include=include - am__quote= - _am_result=GNU -fi -# Now try BSD make style include. -if test "$am__include" = "#"; then - echo '.include "confinc"' > confmf - if test "`$am_make -s -f confmf 2> /dev/null`" = "done"; then - am__include=.include - am__quote="\"" - _am_result=BSD - fi -fi - - -{ echo "$as_me:$LINENO: result: $_am_result" >&5 -echo "${ECHO_T}$_am_result" >&6; } -rm -f confinc confmf - -# Check whether --enable-dependency-tracking was given. -if test "${enable_dependency_tracking+set}" = set; then - enableval=$enable_dependency_tracking; -fi - -if test "x$enable_dependency_tracking" != xno; then - am_depcomp="$ac_aux_dir/depcomp" - AMDEPBACKSLASH='\' -fi - if test "x$enable_dependency_tracking" != xno; then - AMDEP_TRUE= - AMDEP_FALSE='#' -else - AMDEP_TRUE='#' - AMDEP_FALSE= -fi - - - -depcc="$CC" am_compiler_list= - -{ echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 -echo $ECHO_N "checking dependency style of $depcc... $ECHO_C" >&6; } -if test "${am_cv_CC_dependencies_compiler_type+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. 
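The include-style probe recorded just above (writing `confinc`/`confmf` and seeing which `include` syntax this make understands) can be read more easily as a stand-alone sketch. This is only an illustration under the assumption of a POSIX sh and a `make` in PATH; the file names `confinc` and `confmf` mirror the generated script above.

```sh
#!/bin/sh
# Sketch of the include-style probe: write a tiny makefile fragment,
# then see whether this make understands GNU "include" or BSD ".include".
am_make=${MAKE-make}
printf 'am__doit:\n\t@echo done\n.PHONY: am__doit\n' > confinc
style=none
echo 'include confinc' > confmf
if test "`$am_make -s -f confmf 2>/dev/null | grep -v 'ing directory'`" = done; then
  style=GNU
else
  echo '.include "confinc"' > confmf
  if test "`$am_make -s -f confmf 2>/dev/null`" = done; then
    style=BSD
  fi
fi
echo "style of include used by $am_make: $style"
rm -f confinc confmf
```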
- cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_CC_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - case $depmode in - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - none) break ;; - esac - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. - if depmode=$depmode \ - source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CC_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_CC_dependencies_compiler_type=none -fi - -fi -{ echo "$as_me:$LINENO: result: $am_cv_CC_dependencies_compiler_type" >&5 -echo "${ECHO_T}$am_cv_CC_dependencies_compiler_type" >&6; } -CCDEPMODE=depmode=$am_cv_CC_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CC_dependencies_compiler_type" = gcc3; then - am__fastdepCC_TRUE= - am__fastdepCC_FALSE='#' -else - am__fastdepCC_TRUE='#' - am__fastdepCC_FALSE= -fi - - -ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu -if test -z "$CXX"; then - if test -n "$CCC"; then - CXX=$CCC - else - if test -n "$ac_tool_prefix"; then - for ac_prog in CC g++ c++ cxx - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_CXX+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$CXX"; then - ac_cv_prog_CXX="$CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -CXX=$ac_cv_prog_CXX -if test -n "$CXX"; then - { echo "$as_me:$LINENO: result: $CXX" >&5 -echo "${ECHO_T}$CXX" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - test -n "$CXX" && break - done -fi -if test -z "$CXX"; then - ac_ct_CXX=$CXX - for ac_prog in CC g++ c++ cxx -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_ac_ct_CXX+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$ac_ct_CXX"; then - ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CXX="$ac_prog" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_CXX=$ac_cv_prog_ac_ct_CXX -if test -n "$ac_ct_CXX"; then - { echo "$as_me:$LINENO: result: $ac_ct_CXX" >&5 -echo "${ECHO_T}$ac_ct_CXX" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - test -n "$ac_ct_CXX" && break -done - - if test "x$ac_ct_CXX" = x; then - CXX="g++" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." 
>&5 -echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&2;} -ac_tool_warned=yes ;; -esac - CXX=$ac_ct_CXX - fi -fi - - fi -fi -# Provide some information about the compiler. -echo "$as_me:$LINENO: checking for C++ compiler version" >&5 -ac_compiler=`set X $ac_compile; echo $2` -{ (ac_try="$ac_compiler --version >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler --version >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -v >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler -v >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -V >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compiler -V >&5") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } - -{ echo "$as_me:$LINENO: checking whether we are using the GNU C++ compiler" >&5 -echo $ECHO_N "checking whether we are using the GNU C++ compiler... $ECHO_C" >&6; } -if test "${ac_cv_cxx_compiler_gnu+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ -#ifndef __GNUC__ - choke me -#endif - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_compiler_gnu=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_compiler_gnu=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_cxx_compiler_gnu=$ac_compiler_gnu - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_cxx_compiler_gnu" >&5 -echo "${ECHO_T}$ac_cv_cxx_compiler_gnu" >&6; } -GXX=`test $ac_compiler_gnu = yes && echo yes` -ac_test_CXXFLAGS=${CXXFLAGS+set} -ac_save_CXXFLAGS=$CXXFLAGS -{ echo "$as_me:$LINENO: checking whether $CXX accepts -g" >&5 -echo $ECHO_N "checking whether $CXX accepts -g... $ECHO_C" >&6; } -if test "${ac_cv_prog_cxx_g+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_save_cxx_werror_flag=$ac_cxx_werror_flag - ac_cxx_werror_flag=yes - ac_cv_prog_cxx_g=no - CXXFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. 
*/ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cxx_g=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - CXXFLAGS="" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - : -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_cxx_werror_flag=$ac_save_cxx_werror_flag - CXXFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cxx_g=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_cxx_werror_flag=$ac_save_cxx_werror_flag -fi -{ echo "$as_me:$LINENO: result: $ac_cv_prog_cxx_g" >&5 -echo "${ECHO_T}$ac_cv_prog_cxx_g" >&6; } -if test "$ac_test_CXXFLAGS" = set; then - CXXFLAGS=$ac_save_CXXFLAGS -elif test $ac_cv_prog_cxx_g = yes; then - if test "$GXX" = yes; then - CXXFLAGS="-g -O2" - else - CXXFLAGS="-g" - fi -else - if test "$GXX" = yes; then - CXXFLAGS="-O2" - else - CXXFLAGS= - fi -fi -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - -depcc="$CXX" am_compiler_list= - -{ echo "$as_me:$LINENO: checking dependency style of $depcc" >&5 -echo $ECHO_N "checking dependency style of $depcc... 
$ECHO_C" >&6; } -if test "${am_cv_CXX_dependencies_compiler_type+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -z "$AMDEP_TRUE" && test -f "$am_depcomp"; then - # We make a subdir and do the tests there. Otherwise we can end up - # making bogus files that we don't know about and never remove. For - # instance it was reported that on HP-UX the gcc test will end up - # making a dummy file named `D' -- because `-MD' means `put the output - # in D'. - mkdir conftest.dir - # Copy depcomp to subdir because otherwise we won't find it if we're - # using a relative directory. - cp "$am_depcomp" conftest.dir - cd conftest.dir - # We will build objects and dependencies in a subdirectory because - # it helps to detect inapplicable dependency modes. For instance - # both Tru64's cc and ICC support -MD to output dependencies as a - # side effect of compilation, but ICC will put the dependencies in - # the current directory while Tru64 will put them in the object - # directory. - mkdir sub - - am_cv_CXX_dependencies_compiler_type=none - if test "$am_compiler_list" = ""; then - am_compiler_list=`sed -n 's/^#*\([a-zA-Z0-9]*\))$/\1/p' < ./depcomp` - fi - for depmode in $am_compiler_list; do - # Setup a source with many dependencies, because some compilers - # like to wrap large dependency lists on column 80 (with \), and - # we should not choose a depcomp mode which is confused by this. - # - # We need to recreate these files for each test, as the compiler may - # overwrite some of them when testing with obscure command lines. - # This happens at least with the AIX C compiler. - : > sub/conftest.c - for i in 1 2 3 4 5 6; do - echo '#include "conftst'$i'.h"' >> sub/conftest.c - # Using `: > sub/conftst$i.h' creates only sub/conftst1.h with - # Solaris 8's {/usr,}/bin/sh. - touch sub/conftst$i.h - done - echo "${am__include} ${am__quote}sub/conftest.Po${am__quote}" > confmf - - case $depmode in - nosideeffect) - # after this tag, mechanisms are not by side-effect, so they'll - # only be used when explicitly requested - if test "x$enable_dependency_tracking" = xyes; then - continue - else - break - fi - ;; - none) break ;; - esac - # We check with `-c' and `-o' for the sake of the "dashmstdout" - # mode. It turns out that the SunPro C++ compiler does not properly - # handle `-M -o', and we need to detect this. - if depmode=$depmode \ - source=sub/conftest.c object=sub/conftest.${OBJEXT-o} \ - depfile=sub/conftest.Po tmpdepfile=sub/conftest.TPo \ - $SHELL ./depcomp $depcc -c -o sub/conftest.${OBJEXT-o} sub/conftest.c \ - >/dev/null 2>conftest.err && - grep sub/conftst1.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftst6.h sub/conftest.Po > /dev/null 2>&1 && - grep sub/conftest.${OBJEXT-o} sub/conftest.Po > /dev/null 2>&1 && - ${MAKE-make} -s -f confmf > /dev/null 2>&1; then - # icc doesn't choke on unknown options, it will just issue warnings - # or remarks (even with -Werror). So we grep stderr for any message - # that says an option was ignored or not supported. - # When given -MP, icc 7.0 and 7.1 complain thusly: - # icc: Command line warning: ignoring option '-M'; no argument required - # The diagnosis changed in icc 8.0: - # icc: Command line remark: option '-MP' not supported - if (grep 'ignoring option' conftest.err || - grep 'not supported' conftest.err) >/dev/null 2>&1; then :; else - am_cv_CXX_dependencies_compiler_type=$depmode - break - fi - fi - done - - cd .. 
- rm -rf conftest.dir -else - am_cv_CXX_dependencies_compiler_type=none -fi - -fi -{ echo "$as_me:$LINENO: result: $am_cv_CXX_dependencies_compiler_type" >&5 -echo "${ECHO_T}$am_cv_CXX_dependencies_compiler_type" >&6; } -CXXDEPMODE=depmode=$am_cv_CXX_dependencies_compiler_type - - if - test "x$enable_dependency_tracking" != xno \ - && test "$am_cv_CXX_dependencies_compiler_type" = gcc3; then - am__fastdepCXX_TRUE= - am__fastdepCXX_FALSE='#' -else - am__fastdepCXX_TRUE='#' - am__fastdepCXX_FALSE= -fi - - -#AC_PROG_F77(f77 g77 gfortran f90 xlf90 f95) -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. -set dummy ${ac_tool_prefix}ranlib; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_RANLIB+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$RANLIB"; then - ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -RANLIB=$ac_cv_prog_RANLIB -if test -n "$RANLIB"; then - { echo "$as_me:$LINENO: result: $RANLIB" >&5 -echo "${ECHO_T}$RANLIB" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_RANLIB"; then - ac_ct_RANLIB=$RANLIB - # Extract the first word of "ranlib", so it can be a program name with args. -set dummy ranlib; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_ac_ct_RANLIB+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$ac_ct_RANLIB"; then - ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_RANLIB="ranlib" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB -if test -n "$ac_ct_RANLIB"; then - { echo "$as_me:$LINENO: result: $ac_ct_RANLIB" >&5 -echo "${ECHO_T}$ac_ct_RANLIB" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - if test "x$ac_ct_RANLIB" = x; then - RANLIB=":" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ echo "$as_me:$LINENO: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." >&5 -echo "$as_me: WARNING: In the future, Autoconf will not detect cross-tools -whose name does not start with the host triplet. If you think this -configuration is useful to you, please write to autoconf@gnu.org." 
>&2;} -ac_tool_warned=yes ;; -esac - RANLIB=$ac_ct_RANLIB - fi -else - RANLIB="$ac_cv_prog_RANLIB" -fi - - -# Check if --with-flags present, prepend any specs to FLAGS - - -{ echo "$as_me:$LINENO: checking whether additional CCFLAGS flags should be added" >&5 -echo $ECHO_N "checking whether additional CCFLAGS flags should be added... $ECHO_C" >&6; } - -# Check whether --with-ccflags was given. -if test "${with_ccflags+set}" = set; then - withval=$with_ccflags; -CCFLAGS="${withval} ${CCFLAGS}" -{ echo "$as_me:$LINENO: result: CCFLAGS = ${CCFLAGS}" >&5 -echo "${ECHO_T}CCFLAGS = ${CCFLAGS}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - - -{ echo "$as_me:$LINENO: checking whether additional CXXFLAGS flags should be added" >&5 -echo $ECHO_N "checking whether additional CXXFLAGS flags should be added... $ECHO_C" >&6; } - -# Check whether --with-cxxflags was given. -if test "${with_cxxflags+set}" = set; then - withval=$with_cxxflags; -CXXFLAGS="${withval} ${CXXFLAGS}" -{ echo "$as_me:$LINENO: result: CXXFLAGS = ${CXXFLAGS}" >&5 -echo "${ECHO_T}CXXFLAGS = ${CXXFLAGS}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - - -{ echo "$as_me:$LINENO: checking whether additional CFLAGS flags should be added" >&5 -echo $ECHO_N "checking whether additional CFLAGS flags should be added... $ECHO_C" >&6; } - -# Check whether --with-cflags was given. -if test "${with_cflags+set}" = set; then - withval=$with_cflags; -CFLAGS="${withval} ${CFLAGS}" -{ echo "$as_me:$LINENO: result: CFLAGS = ${CFLAGS}" >&5 -echo "${ECHO_T}CFLAGS = ${CFLAGS}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - -#TAC_ARG_WITH_FLAGS(fflags, FFLAGS) - -{ echo "$as_me:$LINENO: checking whether additional libraries are needed" >&5 -echo $ECHO_N "checking whether additional libraries are needed... $ECHO_C" >&6; } - -# Check whether --with-libs was given. -if test "${with_libs+set}" = set; then - withval=$with_libs; -LIBS="${withval} ${LIBS}" -{ echo "$as_me:$LINENO: result: LIBS = ${LIBS}" >&5 -echo "${ECHO_T}LIBS = ${LIBS}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - - - -{ echo "$as_me:$LINENO: checking whether additional LDFLAGS flags should be added" >&5 -echo $ECHO_N "checking whether additional LDFLAGS flags should be added... $ECHO_C" >&6; } - -# Check whether --with-ldflags was given. -if test "${with_ldflags+set}" = set; then - withval=$with_ldflags; -LDFLAGS="${withval} ${LDFLAGS}" -{ echo "$as_me:$LINENO: result: LDFLAGS = ${LDFLAGS}" >&5 -echo "${ECHO_T}LDFLAGS = ${LDFLAGS}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - - -# ------------------------------------------------------------------------ -# Alternate archiver -# ------------------------------------------------------------------------ - - - -# Check whether --with-ar was given. -if test "${with_ar+set}" = set; then - withval=$with_ar; -{ echo "$as_me:$LINENO: checking user-defined archiver" >&5 -echo $ECHO_N "checking user-defined archiver... 
$ECHO_C" >&6; } -{ echo "$as_me:$LINENO: result: ${withval}" >&5 -echo "${ECHO_T}${withval}" >&6; } -USE_ALTERNATE_AR=yes -ALTERNATE_AR="${withval}" - - -fi - - -if test -n "${SPECIAL_AR}" && test "X${USE_ALTERNATE_AR}" != "Xyes"; -then - USE_ALTERNATE_AR=yes - ALTERNATE_AR="${SPECIAL_AR}" -fi - -{ echo "$as_me:$LINENO: checking for special archiver command" >&5 -echo $ECHO_N "checking for special archiver command... $ECHO_C" >&6; } -if test "X${USE_ALTERNATE_AR}" = "Xyes"; then - { echo "$as_me:$LINENO: result: ${ALTERNATE_AR}" >&5 -echo "${ECHO_T}${ALTERNATE_AR}" >&6; } - if true; then - USE_ALTERNATE_AR_TRUE= - USE_ALTERNATE_AR_FALSE='#' -else - USE_ALTERNATE_AR_TRUE='#' - USE_ALTERNATE_AR_FALSE= -fi - -else - { echo "$as_me:$LINENO: result: none" >&5 -echo "${ECHO_T}none" >&6; } - if false; then - USE_ALTERNATE_AR_TRUE= - USE_ALTERNATE_AR_FALSE='#' -else - USE_ALTERNATE_AR_TRUE='#' - USE_ALTERNATE_AR_FALSE= -fi - -fi - - - -# ------------------------------------------------------------------------ -# MPI link check -# ------------------------------------------------------------------------ - -ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu -{ echo "$as_me:$LINENO: checking how to run the C++ preprocessor" >&5 -echo $ECHO_N "checking how to run the C++ preprocessor... $ECHO_C" >&6; } -if test -z "$CXXCPP"; then - if test "${ac_cv_prog_CXXCPP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - # Double quotes because CXXCPP needs to be expanded - for CXXCPP in "$CXX -E" "/lib/cpp" - do - ac_preproc_ok=false -for ac_cxx_preproc_warn_flag in '' yes -do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. - # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif - Syntax error -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - : -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - # Broken: fails on valid input. -continue -fi - -rm -f conftest.err conftest.$ac_ext - - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. 
*/ -#include -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - # Broken: success on invalid input. -continue -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - # Passes both tests. -ac_preproc_ok=: -break -fi - -rm -f conftest.err conftest.$ac_ext - -done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. -rm -f conftest.err conftest.$ac_ext -if $ac_preproc_ok; then - break -fi - - done - ac_cv_prog_CXXCPP=$CXXCPP - -fi - CXXCPP=$ac_cv_prog_CXXCPP -else - ac_cv_prog_CXXCPP=$CXXCPP -fi -{ echo "$as_me:$LINENO: result: $CXXCPP" >&5 -echo "${ECHO_T}$CXXCPP" >&6; } -ac_preproc_ok=false -for ac_cxx_preproc_warn_flag in '' yes -do - # Use a header file that comes with gcc, so configuring glibc - # with a fresh cross-compiler works. - # Prefer to if __STDC__ is defined, since - # exists even on freestanding compilers. - # On the NeXT, cc -E runs the code through the compiler's parser, - # not just through cpp. "Syntax error" is here to catch this case. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#ifdef __STDC__ -# include -#else -# include -#endif - Syntax error -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - : -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - # Broken: fails on valid input. -continue -fi - -rm -f conftest.err conftest.$ac_ext - - # OK, works on sane cases. Now check whether nonexistent headers - # can be detected and how. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - # Broken: success on invalid input. -continue -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - # Passes both tests. -ac_preproc_ok=: -break -fi - -rm -f conftest.err conftest.$ac_ext - -done -# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. 
-rm -f conftest.err conftest.$ac_ext -if $ac_preproc_ok; then - : -else - { { echo "$as_me:$LINENO: error: C++ preprocessor \"$CXXCPP\" fails sanity check -See \`config.log' for more details." >&5 -echo "$as_me: error: C++ preprocessor \"$CXXCPP\" fails sanity check -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; } -fi - -ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - - - - -if test "X${HAVE_PKG_MPI}" = "Xyes"; then - - if test -n "${MPI_DIR}" && test -z "${MPI_INC}"; then - MPI_INC="${MPI_DIR}/include" - fi - - if test -n "${MPI_INC}"; then - CPPFLAGS="${CPPFLAGS} -I${MPI_INC}" - fi - - ac_ext=cpp -ac_cpp='$CXXCPP $CPPFLAGS' -ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_cxx_compiler_gnu - - { echo "$as_me:$LINENO: checking for mpi.h" >&5 -echo $ECHO_N "checking for mpi.h... $ECHO_C" >&6; } - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include "mpi.h" -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - echo "-----" - echo "Cannot link simple MPI program." - echo "Try --with-mpi-compilers to specify MPI compilers." - echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir" - echo "to specify all the specific MPI compile options." - echo "-----" - { { echo "$as_me:$LINENO: error: MPI cannot link" >&5 -echo "$as_me: error: MPI cannot link" >&2;} - { (exit 1); exit 1; }; } - -fi - -rm -f conftest.err conftest.$ac_ext - - if test -n "${MPI_DIR}" && test -z "${MPI_LIBDIR}"; then - MPI_LIBDIR="${MPI_DIR}/lib" - fi - - if test -n "${MPI_LIBDIR}"; then - LDFLAGS="${LDFLAGS} -L${MPI_LIBDIR}" - fi - - if test -z "${MPI_LIBS}" && test -n "${MPI_LIBDIR}"; then - MPI_LIBS="-lmpi" - fi - - if test -n "${MPI_LIBS}"; then - LIBS="${MPI_LIBS} ${LIBS}" - fi - -# AC_LANG_CPLUSPLUS -# AC_MSG_CHECKING(whether MPI will link using C++ compiler) -# AC_TRY_LINK([#include ], -# [int c; char** v; MPI_Init(&c,&v);], -# [AC_MSG_RESULT(yes)], -# [AC_MSG_RESULT(no) -# echo "-----" -# echo "Cannot link simple MPI program." -# echo "Try --with-mpi-cxx to specify MPI C++ compile script." -# echo "Or try --with-mpi-libs, --with-mpi-incdir, --with-mpi-libdir" -# echo "to specify all the specific MPI compile options." 
-# echo "-----" -# AC_MSG_ERROR(MPI cannot link)] -# ) - -fi - - -# ------------------------------------------------------------------------ -# Checks for Makefile.export related systems -# ------------------------------------------------------------------------ - -# Check whether --enable-export-makefiles was given. -if test "${enable_export_makefiles+set}" = set; then - enableval=$enable_export_makefiles; ac_cv_use_export_makefiles=$enableval -else - ac_cv_use_export_makefiles=yes -fi - - -{ echo "$as_me:$LINENO: checking whether to build export makefiles" >&5 -echo $ECHO_N "checking whether to build export makefiles... $ECHO_C" >&6; } - -if test "X$ac_cv_use_export_makefiles" != "Xno"; then - - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -cat >>confdefs.h <<\_ACEOF -#define HAVE_EXPORT_MAKEFILES -_ACEOF - - -else - - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - if test X${ac_cv_use_export_makefiles} = Xyes; then - USING_EXPORT_MAKEFILES_TRUE= - USING_EXPORT_MAKEFILES_FALSE='#' -else - USING_EXPORT_MAKEFILES_TRUE='#' - USING_EXPORT_MAKEFILES_FALSE= -fi - - -# Check for perl to run scripts (Required dependency) - - - -# Check whether --with-perl was given. -if test "${with_perl+set}" = set; then - withval=$with_perl; -{ echo "$as_me:$LINENO: checking for user supplied perl executable" >&5 -echo $ECHO_N "checking for user supplied perl executable... $ECHO_C" >&6; } -{ echo "$as_me:$LINENO: result: ${withval}" >&5 -echo "${ECHO_T}${withval}" >&6; } -USER_SPECIFIED_PERL=yes -PERL_EXE="${withval}" - -else - -USER_SPECIFIED_PERL=no - -fi - - -if test "X${USER_SPECIFIED_PERL}" = "Xyes"; then - as_ac_File=`echo "ac_cv_file_${PERL_EXE}" | $as_tr_sh` -{ echo "$as_me:$LINENO: checking for ${PERL_EXE}" >&5 -echo $ECHO_N "checking for ${PERL_EXE}... $ECHO_C" >&6; } -if { as_var=$as_ac_File; eval "test \"\${$as_var+set}\" = set"; }; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - test "$cross_compiling" = yes && - { { echo "$as_me:$LINENO: error: cannot check for file existence when cross compiling" >&5 -echo "$as_me: error: cannot check for file existence when cross compiling" >&2;} - { (exit 1); exit 1; }; } -if test -r "${PERL_EXE}"; then - eval "$as_ac_File=yes" -else - eval "$as_ac_File=no" -fi -fi -ac_res=`eval echo '${'$as_ac_File'}'` - { echo "$as_me:$LINENO: result: $ac_res" >&5 -echo "${ECHO_T}$ac_res" >&6; } -if test `eval echo '${'$as_ac_File'}'` = yes; then - HAVE_PERL=yes -else - HAVE_PERL=no -fi - - PERL_EXE=${PERL_EXE} - -else - # Extract the first word of "perl", so it can be a program name with args. -set dummy perl; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_HAVE_PERL+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$HAVE_PERL"; then - ac_cv_prog_HAVE_PERL="$HAVE_PERL" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_HAVE_PERL="yes" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_HAVE_PERL" && ac_cv_prog_HAVE_PERL="no" -fi -fi -HAVE_PERL=$ac_cv_prog_HAVE_PERL -if test -n "$HAVE_PERL"; then - { echo "$as_me:$LINENO: result: $HAVE_PERL" >&5 -echo "${ECHO_T}$HAVE_PERL" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - - PERL_EXE=perl - -fi - if test X${HAVE_PERL} = Xyes; then - USING_PERL_TRUE= - USING_PERL_FALSE='#' -else - USING_PERL_TRUE='#' - USING_PERL_FALSE= -fi - - - -if test "X$HAVE_PERL" != "Xyes" && - test "X$ac_cv_use_export_makefiles" != "Xno"; then - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - { { echo "$as_me:$LINENO: error: Failed to find the perl executable. The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=. If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles." >&5 -echo "$as_me: error: Failed to find the perl executable. The flag --enable-export-makefiles requires perl to be either in your path or explicitly defined by the flag --with-perl=. If you do not require the export makefiles to be installed via 'make install', you can disable the export makefiles with --disable-export-makefiles." >&2;} - { (exit 1); exit 1; }; } -fi - -# Check for using gnumake to clean up link lines via -# gnumake's "shell" command. Optional dependency. - - - - -# Check whether --with-gnumake was given. -if test "${with_gnumake+set}" = set; then - withval=$with_gnumake; ac_cv_use_gnumake=$withval -else - ac_cv_use_gnumake=no -fi - - -{ echo "$as_me:$LINENO: checking whether gnumake specific code should be enabled" >&5 -echo $ECHO_N "checking whether gnumake specific code should be enabled... $ECHO_C" >&6; } - -if test "X$ac_cv_use_gnumake" != "Xno"; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -cat >>confdefs.h <<\_ACEOF -#define HAVE_GNUMAKE -_ACEOF - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - if test "X$ac_cv_use_gnumake" = "Xyes"; then - USING_GNUMAKE_TRUE= - USING_GNUMAKE_FALSE='#' -else - USING_GNUMAKE_TRUE='#' - USING_GNUMAKE_FALSE= -fi - - - -if test "X$HAVE_PERL" != "Xyes" && - test "X$ac_cv_use_gnumake" != "Xno"; then - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - { { echo "$as_me:$LINENO: error: The flag --with-gnumake requires perl to be in your path. The perl executable can alternatively be explicitly defined by the flag --with-perl=." >&5 -echo "$as_me: error: The flag --with-gnumake requires perl to be in your path. The perl executable can alternatively be explicitly defined by the flag --with-perl=." >&2;} - { (exit 1); exit 1; }; } -fi - - - -# ------------------------------------------------------------------------ -# Checks if tests and examples should be built -# ------------------------------------------------------------------------ - -# #np# - These options can disable the tests and examples of a package. -# #np# - Packages that do not have tests or examples should #-out the -# #np# - option(s) that does (do) not apply. - - -# Check whether --enable-tests was given. 
-if test "${enable_tests+set}" = set; then - enableval=$enable_tests; ac_cv_use_tests=$enableval -else - ac_cv_use_tests=yes -fi - - -{ echo "$as_me:$LINENO: checking whether to use tests" >&5 -echo $ECHO_N "checking whether to use tests... $ECHO_C" >&6; } - -if test "X$ac_cv_use_tests" != "Xno"; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -cat >>confdefs.h <<\_ACEOF -#define HAVE_TESTS -_ACEOF - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - -# Check whether --enable-tests was given. -if test "${enable_tests+set}" = set; then - enableval=$enable_tests; ac_cv_use_tests=$enableval -else - ac_cv_use_tests=yes -fi - - -# Check whether --enable-threadpool-tests was given. -if test "${enable_threadpool_tests+set}" = set; then - enableval=$enable_threadpool_tests; ac_cv_use_threadpool_tests=$enableval -else - ac_cv_use_threadpool_tests=${ac_cv_use_tests} -fi - - -{ echo "$as_me:$LINENO: checking whether to use threadpool-tests" >&5 -echo $ECHO_N "checking whether to use threadpool-tests... $ECHO_C" >&6; } - -if test "X$ac_cv_use_threadpool_tests" != "Xno"; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -cat >>confdefs.h <<\_ACEOF -#define HAVE_NEW_PACKAGE_TESTS -_ACEOF - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - if test "X$ac_cv_use_threadpool_tests" != "Xno"; then - BUILD_TESTS_TRUE= - BUILD_TESTS_FALSE='#' -else - BUILD_TESTS_TRUE='#' - BUILD_TESTS_FALSE= -fi - - -#TAC_ARG_ENABLE_FEATURE(examples, [Make examples for all Trilinos packages buildable with 'make examples'], EXAMPLES, yes) -#TAC_ARG_ENABLE_FEATURE_SUB_CHECK( new_package, examples, [Make New_Package examples buildable with 'make examples'], NEW_PACKAGE_EXAMPLES) -#AM_CONDITIONAL(BUILD_EXAMPLES, test "X$ac_cv_use_new_package_examples" != "Xno") - -#We now build tests and examples through separate make targets, rather than -#during "make". We still need to conditionally include the test and example -#in SUBDIRS, even though SUB_TEST and SUB_EXAMPLE will never be -#defined, so that the tests and examples are included in the distribution -#tarball. - if test "X$ac_cv_use_sub_test" = "Xyes"; then - SUB_TEST_TRUE= - SUB_TEST_FALSE='#' -else - SUB_TEST_TRUE='#' - SUB_TEST_FALSE= -fi - -#AM_CONDITIONAL(SUB_EXAMPLE, test "X$ac_cv_use_sub_example" = "Xyes") - - -# Check whether --enable-libcheck was given. -if test "${enable_libcheck+set}" = set; then - enableval=$enable_libcheck; ac_cv_use_libcheck=$enableval -else - ac_cv_use_libcheck=yes -fi - - -{ echo "$as_me:$LINENO: checking whether to use libcheck" >&5 -echo $ECHO_N "checking whether to use libcheck... $ECHO_C" >&6; } - -if test "X$ac_cv_use_libcheck" != "Xno"; then - { echo "$as_me:$LINENO: result: yes" >&5 -echo "${ECHO_T}yes" >&6; } - -cat >>confdefs.h <<\_ACEOF -#define HAVE_LIBCHECK -_ACEOF - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - -# ------------------------------------------------------------------------ -# Specify other directories -# ------------------------------------------------------------------------ - -# enable use of --with-libdirs="-Llibdir1 -Llibdir2 ..." to prepend to LDFLAGS - -{ echo "$as_me:$LINENO: checking whether additional library search paths defined" >&5 -echo $ECHO_N "checking whether additional library search paths defined... $ECHO_C" >&6; } - -# Check whether --with-libdirs was given. 
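The `--enable-tests`, `--enable-threadpool-tests`, and `--enable-libcheck` blocks above all follow the same pattern: an option sets a cache variable, which then drives a `confdefs.h` define and an automake-style conditional. A minimal stand-alone sketch of that pattern, assuming a POSIX sh (variable names mirror the generated script but the script itself is hypothetical):

```sh
#!/bin/sh
# Sketch of the --enable-X pattern: option -> cache variable -> define + conditional.
ac_cv_use_tests=yes
for arg in "$@"; do
  case $arg in
    --enable-tests)  ac_cv_use_tests=yes ;;
    --disable-tests) ac_cv_use_tests=no ;;
  esac
done
if test "X$ac_cv_use_tests" != "Xno"; then
  echo '#define HAVE_TESTS' >> confdefs.h
  BUILD_TESTS_TRUE= ; BUILD_TESTS_FALSE='#'
else
  BUILD_TESTS_TRUE='#' ; BUILD_TESTS_FALSE=
fi
echo "checking whether to use tests... $ac_cv_use_tests"
```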
-if test "${with_libdirs+set}" = set; then - withval=$with_libdirs; -LDFLAGS="${withval} ${LDFLAGS}" -{ echo "$as_me:$LINENO: result: ${withval}" >&5 -echo "${ECHO_T}${withval}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - -# enable use of --with-incdirs="-Lincdir1 -Lincdir2 ..." to prepend to CPPFLAGS - -{ echo "$as_me:$LINENO: checking whether additional include search paths defined" >&5 -echo $ECHO_N "checking whether additional include search paths defined... $ECHO_C" >&6; } - -# Check whether --with-incdirs was given. -if test "${with_incdirs+set}" = set; then - withval=$with_incdirs; -CPPFLAGS="${withval} ${CPPFLAGS}" -{ echo "$as_me:$LINENO: result: ${withval}" >&5 -echo "${ECHO_T}${withval}" >&6; } - -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } - -fi - - - -# #np# - Yet another opportunity to remove code if you aren't -# using Fortran -# Define F77_FUNC that will be used to link with Fortran subroutines. - trash WORKGXX -#AC_F77_WRAPPERS - -# ------------------------------------------------------------------------ -# Checks for libraries -# ------------------------------------------------------------------------ - -# If tests, examples and libcheck are disabled, we don't have to check -# for these libraries. - -# #np# - -# If a package does not have tests or examples, the corresponding check(s) -# should be pulled out of the "if" statement below. -#if test "X$ac_cv_use_new_package_examples" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then -if test "X$ac_cv_use_threadpool_tests" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then - -{ echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5 -echo $ECHO_N "checking for grep that handles long lines and -e... $ECHO_C" >&6; } -if test "${ac_cv_path_GREP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - # Extract the first word of "grep ggrep" to use in msg output -if test -z "$GREP"; then -set dummy grep ggrep; ac_prog_name=$2 -if test "${ac_cv_path_GREP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_path_GREP_found=false -# Loop through the user's path and test for each of PROGNAME-LIST -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in grep ggrep; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue - # Check for GNU ac_path_GREP and select it if it is found. 
- # Check for GNU $ac_path_GREP -case `"$ac_path_GREP" --version 2>&1` in -*GNU*) - ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; -*) - ac_count=0 - echo $ECHO_N "0123456789$ECHO_C" >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - echo 'GREP' >> "conftest.nl" - "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - ac_count=`expr $ac_count + 1` - if test $ac_count -gt ${ac_path_GREP_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_GREP="$ac_path_GREP" - ac_path_GREP_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - - $ac_path_GREP_found && break 3 - done -done - -done -IFS=$as_save_IFS - - -fi - -GREP="$ac_cv_path_GREP" -if test -z "$GREP"; then - { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5 -echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;} - { (exit 1); exit 1; }; } -fi - -else - ac_cv_path_GREP=$GREP -fi - - -fi -{ echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5 -echo "${ECHO_T}$ac_cv_path_GREP" >&6; } - GREP="$ac_cv_path_GREP" - - -{ echo "$as_me:$LINENO: checking for egrep" >&5 -echo $ECHO_N "checking for egrep... $ECHO_C" >&6; } -if test "${ac_cv_path_EGREP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 - then ac_cv_path_EGREP="$GREP -E" - else - # Extract the first word of "egrep" to use in msg output -if test -z "$EGREP"; then -set dummy egrep; ac_prog_name=$2 -if test "${ac_cv_path_EGREP+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_path_EGREP_found=false -# Loop through the user's path and test for each of PROGNAME-LIST -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_prog in egrep; do - for ac_exec_ext in '' $ac_executable_extensions; do - ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" - { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue - # Check for GNU ac_path_EGREP and select it if it is found. 
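The egrep detection above starts from a simple fallback rule before benchmarking candidates: use `$GREP -E` if it works, otherwise look for a separate `egrep`. A minimal sketch of that first step, assuming a POSIX sh and a grep in PATH (the candidate-benchmarking loop above is not reproduced here):

```sh
#!/bin/sh
# Sketch of the egrep fallback: prefer "grep -E" when the chosen grep
# supports it, otherwise fall back to a separate egrep binary.
GREP=${GREP-grep}
if echo a | $GREP -E '(a|b)' >/dev/null 2>&1; then
  EGREP="$GREP -E"
else
  EGREP=egrep
fi
echo "checking for egrep... $EGREP"
```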
- # Check for GNU $ac_path_EGREP -case `"$ac_path_EGREP" --version 2>&1` in -*GNU*) - ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; -*) - ac_count=0 - echo $ECHO_N "0123456789$ECHO_C" >"conftest.in" - while : - do - cat "conftest.in" "conftest.in" >"conftest.tmp" - mv "conftest.tmp" "conftest.in" - cp "conftest.in" "conftest.nl" - echo 'EGREP' >> "conftest.nl" - "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break - diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break - ac_count=`expr $ac_count + 1` - if test $ac_count -gt ${ac_path_EGREP_max-0}; then - # Best one so far, save it but keep looking for a better one - ac_cv_path_EGREP="$ac_path_EGREP" - ac_path_EGREP_max=$ac_count - fi - # 10*(2^10) chars as input seems more than enough - test $ac_count -gt 10 && break - done - rm -f conftest.in conftest.tmp conftest.nl conftest.out;; -esac - - - $ac_path_EGREP_found && break 3 - done -done - -done -IFS=$as_save_IFS - - -fi - -EGREP="$ac_cv_path_EGREP" -if test -z "$EGREP"; then - { { echo "$as_me:$LINENO: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5 -echo "$as_me: error: no acceptable $ac_prog_name could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;} - { (exit 1); exit 1; }; } -fi - -else - ac_cv_path_EGREP=$EGREP -fi - - - fi -fi -{ echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5 -echo "${ECHO_T}$ac_cv_path_EGREP" >&6; } - EGREP="$ac_cv_path_EGREP" - - -{ echo "$as_me:$LINENO: checking for ANSI C header files" >&5 -echo $ECHO_N "checking for ANSI C header files... $ECHO_C" >&6; } -if test "${ac_cv_header_stdc+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -#include -#include -#include - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_header_stdc=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_cv_header_stdc=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - -if test $ac_cv_header_stdc = yes; then - # SunOS 4.x string.h does not declare mem*, contrary to ANSI. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "memchr" >/dev/null 2>&1; then - : -else - ac_cv_header_stdc=no -fi -rm -f conftest* - -fi - -if test $ac_cv_header_stdc = yes; then - # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. 
*/ -#include - -_ACEOF -if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | - $EGREP "free" >/dev/null 2>&1; then - : -else - ac_cv_header_stdc=no -fi -rm -f conftest* - -fi - -if test $ac_cv_header_stdc = yes; then - # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. - if test "$cross_compiling" = yes; then - : -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -#include -#if ((' ' & 0x0FF) == 0x020) -# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') -# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) -#else -# define ISLOWER(c) \ - (('a' <= (c) && (c) <= 'i') \ - || ('j' <= (c) && (c) <= 'r') \ - || ('s' <= (c) && (c) <= 'z')) -# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c)) -#endif - -#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) -int -main () -{ - int i; - for (i = 0; i < 256; i++) - if (XOR (islower (i), ISLOWER (i)) - || toupper (i) != TOUPPER (i)) - return 2; - return 0; -} -_ACEOF -rm -f conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { ac_try='./conftest$ac_exeext' - { (case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; }; then - : -else - echo "$as_me: program exited with status $ac_status" >&5 -echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -( exit $ac_status ) -ac_cv_header_stdc=no -fi -rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext -fi - - -fi -fi -{ echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5 -echo "${ECHO_T}$ac_cv_header_stdc" >&6; } -if test $ac_cv_header_stdc = yes; then - -cat >>confdefs.h <<\_ACEOF -#define STDC_HEADERS 1 -_ACEOF - -fi - -# On IRIX 5.3, sys/types and inttypes.h are conflicting. - - - - - - - - - -for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ - inttypes.h stdint.h unistd.h -do -as_ac_Header=`echo "ac_cv_header_$ac_header" | $as_tr_sh` -{ echo "$as_me:$LINENO: checking for $ac_header" >&5 -echo $ECHO_N "checking for $ac_header... $ECHO_C" >&6; } -if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -$ac_includes_default - -#include <$ac_header> -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest.$ac_objext; then - eval "$as_ac_Header=yes" -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - eval "$as_ac_Header=no" -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -ac_res=`eval echo '${'$as_ac_Header'}'` - { echo "$as_me:$LINENO: result: $ac_res" >&5 -echo "${ECHO_T}$ac_res" >&6; } -if test `eval echo '${'$as_ac_Header'}'` = yes; then - cat >>confdefs.h <<_ACEOF -#define `echo "HAVE_$ac_header" | $as_tr_cpp` 1 -_ACEOF - -fi - -done - - - - -acx_pthread_ok=no - -# First, check if the POSIX threads header, pthread.h, is available. -# If it isn't, don't bother looking for the threads libraries. -if test "${ac_cv_header_pthread_h+set}" = set; then - { echo "$as_me:$LINENO: checking for pthread.h" >&5 -echo $ECHO_N "checking for pthread.h... $ECHO_C" >&6; } -if test "${ac_cv_header_pthread_h+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -fi -{ echo "$as_me:$LINENO: result: $ac_cv_header_pthread_h" >&5 -echo "${ECHO_T}$ac_cv_header_pthread_h" >&6; } -else - # Is the header compilable? -{ echo "$as_me:$LINENO: checking pthread.h usability" >&5 -echo $ECHO_N "checking pthread.h usability... $ECHO_C" >&6; } -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -$ac_includes_default -#include -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_header_compiler=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_header_compiler=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5 -echo "${ECHO_T}$ac_header_compiler" >&6; } - -# Is the header present? -{ echo "$as_me:$LINENO: checking pthread.h presence" >&5 -echo $ECHO_N "checking pthread.h presence... $ECHO_C" >&6; } -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -_ACEOF -if { (ac_try="$ac_cpp conftest.$ac_ext" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } >/dev/null && { - test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" || - test ! -s conftest.err - }; then - ac_header_preproc=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_header_preproc=no -fi - -rm -f conftest.err conftest.$ac_ext -{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5 -echo "${ECHO_T}$ac_header_preproc" >&6; } - -# So? What about this header? 
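Each header in the loop above gets a mangled cache variable (ac_cv_header_...) and, when the compile succeeds, a HAVE_* macro appended to confdefs.h. A rough approximation of that name mangling, using plain sed and tr in place of the $as_tr_sh / $as_tr_cpp programs configure builds earlier:

# Derive the cache-variable and preprocessor names AC_CHECK_HEADERS uses.
for hdr in sys/types.h stdint.h unistd.h; do
  cache_var=ac_cv_header_`echo "$hdr" | sed 's%[^_[:alnum:]]%_%g'`
  cpp_sym=`echo "HAVE_$hdr" | tr 'a-z' 'A-Z' | sed 's%[^_[:alnum:]]%_%g'`
  echo "checking for $hdr ... cached as \$$cache_var, defines $cpp_sym"
done
# e.g. sys/types.h -> ac_cv_header_sys_types_h -> #define HAVE_SYS_TYPES_H 1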
-case $ac_header_compiler:$ac_header_preproc:$ac_cxx_preproc_warn_flag in - yes:no: ) - { echo "$as_me:$LINENO: WARNING: pthread.h: accepted by the compiler, rejected by the preprocessor!" >&5 -echo "$as_me: WARNING: pthread.h: accepted by the compiler, rejected by the preprocessor!" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: proceeding with the compiler's result" >&5 -echo "$as_me: WARNING: pthread.h: proceeding with the compiler's result" >&2;} - ac_header_preproc=yes - ;; - no:yes:* ) - { echo "$as_me:$LINENO: WARNING: pthread.h: present but cannot be compiled" >&5 -echo "$as_me: WARNING: pthread.h: present but cannot be compiled" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: check for missing prerequisite headers?" >&5 -echo "$as_me: WARNING: pthread.h: check for missing prerequisite headers?" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: see the Autoconf documentation" >&5 -echo "$as_me: WARNING: pthread.h: see the Autoconf documentation" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: section \"Present But Cannot Be Compiled\"" >&5 -echo "$as_me: WARNING: pthread.h: section \"Present But Cannot Be Compiled\"" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: proceeding with the preprocessor's result" >&5 -echo "$as_me: WARNING: pthread.h: proceeding with the preprocessor's result" >&2;} - { echo "$as_me:$LINENO: WARNING: pthread.h: in the future, the compiler will take precedence" >&5 -echo "$as_me: WARNING: pthread.h: in the future, the compiler will take precedence" >&2;} - ( cat <<\_ASBOX -## --------------------------------- ## -## Report this to hcedwar@sandia.gov ## -## --------------------------------- ## -_ASBOX - ) | sed "s/^/$as_me: WARNING: /" >&2 - ;; -esac -{ echo "$as_me:$LINENO: checking for pthread.h" >&5 -echo $ECHO_N "checking for pthread.h... $ECHO_C" >&6; } -if test "${ac_cv_header_pthread_h+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - ac_cv_header_pthread_h=$ac_header_preproc -fi -{ echo "$as_me:$LINENO: result: $ac_cv_header_pthread_h" >&5 -echo "${ECHO_T}$ac_cv_header_pthread_h" >&6; } - -fi -if test $ac_cv_header_pthread_h = yes; then - : -else - acx_pthread_ok=noheader -fi - - - -# We must check for the threads library under a number of different -# names; the ordering is very important because some systems -# (e.g. DEC) have both -lpthread and -lpthreads, where one of the -# libraries is broken (non-POSIX). - -# First of all, check if the user has set any of the PTHREAD_LIBS, -# etcetera environment variables, and if threads linking works using -# them: -if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - { echo "$as_me:$LINENO: checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS" >&5 -echo $ECHO_N "checking for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS... $ECHO_C" >&6; } - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. 
*/ -#ifdef __cplusplus -extern "C" -#endif -char pthread_join (); -int -main () -{ -return pthread_join (); - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest$ac_exeext && - $as_test_x conftest$ac_exeext; then - acx_pthread_ok=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext - { echo "$as_me:$LINENO: result: $acx_pthread_ok" >&5 -echo "${ECHO_T}$acx_pthread_ok" >&6; } - if test x"$acx_pthread_ok" = xno; then - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" - fi - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" -fi - -# Create a list of thread flags to try. Items starting with a "-" are -# C compiler flags, and other items are library names, except for "none" -# which indicates that we try without any flags at all. - -acx_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt" - -# The ordering *is* (sometimes) important. Some notes on the -# individual items follow: - -# pthreads: AIX (must check this before -lpthread) -# none: in case threads are in libc; should be tried before -Kthread and -# other compiler flags to prevent continual compiler warnings -# -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) -# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) -# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) -# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) -# -pthreads: Solaris/gcc -# -mthreads: Mingw32/gcc, Lynx/gcc -# -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it -# doesn't hurt to check since this sometimes defines pthreads too; -# also defines -D_REENTRANT) -# pthread: Linux, etcetera -# --thread-safe: KAI C++ - -case "${host_cpu}-${host_os}" in - *solaris*) - - # On Solaris (at least, for some versions), libc contains stubbed - # (non-functional) versions of the pthreads routines, so link-based - # tests will erroneously succeed. (We need to link with -pthread or - # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather - # a function called by this macro, so we could check for that, but - # who knows whether they'll stub that too in a future libc.) So, - # we'll just look for -pthreads and -lpthread first: - - acx_pthread_flags="-pthread -pthreads pthread -mt $acx_pthread_flags" - ;; -esac - -if test x"$acx_pthread_ok" = xno; then -for flag in $acx_pthread_flags; do - - case $flag in - none) - { echo "$as_me:$LINENO: checking whether pthreads work without any flags" >&5 -echo $ECHO_N "checking whether pthreads work without any flags... $ECHO_C" >&6; } - ;; - - -*) - { echo "$as_me:$LINENO: checking whether pthreads work with $flag" >&5 -echo $ECHO_N "checking whether pthreads work with $flag... $ECHO_C" >&6; } - PTHREAD_CFLAGS="$flag" - ;; - - *) - { echo "$as_me:$LINENO: checking for the pthreads library -l$flag" >&5 -echo $ECHO_N "checking for the pthreads library -l$flag... 
$ECHO_C" >&6; } - PTHREAD_LIBS="-l$flag" - ;; - esac - - save_LIBS="$LIBS" - save_CFLAGS="$CFLAGS" - LIBS="$PTHREAD_LIBS $LIBS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Check for various functions. We must include pthread.h, - # since some functions may be macros. (On the Sequent, we - # need a special flag -Kthread to make this header compile.) - # We check for pthread_join because it is in -lpthread on IRIX - # while pthread_create is in libc. We check for pthread_attr_init - # due to DEC craziness with -lpthreads. We check for - # pthread_cleanup_push because it is one of the few pthread - # functions on Solaris that doesn't have a non-functional libc stub. - # We try pthread_create on general principles. - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -int -main () -{ -pthread_t th; pthread_join(th, 0); - pthread_attr_init(0); pthread_cleanup_push(0, 0); - pthread_create(0,0,0,0); pthread_cleanup_pop(0); - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest$ac_exeext && - $as_test_x conftest$ac_exeext; then - acx_pthread_ok=yes -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - { echo "$as_me:$LINENO: result: $acx_pthread_ok" >&5 -echo "${ECHO_T}$acx_pthread_ok" >&6; } - if test "x$acx_pthread_ok" = xyes; then - break; - fi - - PTHREAD_LIBS="" - PTHREAD_CFLAGS="" -done -fi - -# Various other checks: -if test "x$acx_pthread_ok" = xyes; then - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - save_CFLAGS="$CFLAGS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - - # Detect AIX lossage: threads are created detached by default - # and the JOINABLE attribute has a nonstandard name (UNDETACHED). - { echo "$as_me:$LINENO: checking for joinable pthread attribute" >&5 -echo $ECHO_N "checking for joinable pthread attribute... $ECHO_C" >&6; } - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -int -main () -{ -int attr=PTHREAD_CREATE_JOINABLE; - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! 
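Stripped of the caching and logging, the flag loop above compiles and links one small program per candidate and keeps the first candidate that links; the probe deliberately touches several entry points because pthread_join lives in -lpthread on IRIX, pthread_attr_init trips up DEC's -lpthreads, and pthread_cleanup_push is one of the few calls Solaris libc does not stub. A condensed sketch, with the candidate list shortened and cc assumed as the compiler:

# Find a working pthread flag by test-linking a small probe program.
cat > probe.c <<'EOF'
#include <pthread.h>
static void *run(void *arg) { return arg; }
int main(void)
{
    pthread_t th;
    pthread_attr_t attr;
    pthread_attr_init(&attr);
    pthread_create(&th, &attr, run, 0);
    pthread_join(th, 0);
    return 0;
}
EOF
found=
for flag in -pthread -pthreads -lpthread -lpthreads -mt; do
  if cc probe.c -o probe $flag 2>/dev/null; then
    echo "pthreads link with: $flag"
    found=$flag
    break
  fi
done
test -n "$found" || echo "no working pthread flag found"
rm -f probe probe.c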
-s conftest.err - } && test -s conftest$ac_exeext && - $as_test_x conftest$ac_exeext; then - ok=PTHREAD_CREATE_JOINABLE -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ok=unknown -fi - -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext - if test x"$ok" = xunknown; then - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include -int -main () -{ -int attr=PTHREAD_CREATE_UNDETACHED; - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_cxx_werror_flag" || - test ! -s conftest.err - } && test -s conftest$ac_exeext && - $as_test_x conftest$ac_exeext; then - ok=PTHREAD_CREATE_UNDETACHED -else - echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ok=unknown -fi - -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext - fi - if test x"$ok" != xPTHREAD_CREATE_JOINABLE; then - -cat >>confdefs.h <<\_ACEOF -#define PTHREAD_CREATE_JOINABLE $ok -_ACEOF - - fi - { echo "$as_me:$LINENO: result: ${ok}" >&5 -echo "${ECHO_T}${ok}" >&6; } - if test x"$ok" = xunknown; then - { echo "$as_me:$LINENO: WARNING: we do not know how to create joinable pthreads" >&5 -echo "$as_me: WARNING: we do not know how to create joinable pthreads" >&2;} - fi - - { echo "$as_me:$LINENO: checking if more special flags are required for pthreads" >&5 -echo $ECHO_N "checking if more special flags are required for pthreads... $ECHO_C" >&6; } - flag=no - case "${host_cpu}-${host_os}" in - *-aix* | *-freebsd*) flag="-D_THREAD_SAFE";; - *solaris* | alpha*-osf*) flag="-D_REENTRANT";; - esac - { echo "$as_me:$LINENO: result: ${flag}" >&5 -echo "${ECHO_T}${flag}" >&6; } - if test "x$flag" != xno; then - PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" - fi - - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" - - # More AIX lossage: must compile with cc_r - # Extract the first word of "cc_r", so it can be a program name with args. -set dummy cc_r; ac_word=$2 -{ echo "$as_me:$LINENO: checking for $ac_word" >&5 -echo $ECHO_N "checking for $ac_word... $ECHO_C" >&6; } -if test "${ac_cv_prog_PTHREAD_CC+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 -else - if test -n "$PTHREAD_CC"; then - ac_cv_prog_PTHREAD_CC="$PTHREAD_CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_PTHREAD_CC="cc_r" - echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - - test -z "$ac_cv_prog_PTHREAD_CC" && ac_cv_prog_PTHREAD_CC="${CC}" -fi -fi -PTHREAD_CC=$ac_cv_prog_PTHREAD_CC -if test -n "$PTHREAD_CC"; then - { echo "$as_me:$LINENO: result: $PTHREAD_CC" >&5 -echo "${ECHO_T}$PTHREAD_CC" >&6; } -else - { echo "$as_me:$LINENO: result: no" >&5 -echo "${ECHO_T}no" >&6; } -fi - - -else - PTHREAD_CC="$CC" -fi - - - - - -# Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$acx_pthread_ok" = xyes; then - -cat >>confdefs.h <<\_ACEOF -#define HAVE_PTHREAD 1 -_ACEOF - - : -else - acx_pthread_ok=no - -fi - - -LIBS="$PTHREAD_LIBS $LIBS" -CFLAGS="$CFLAGS $PTHREAD_CFLAGS" -CC="$PTHREAD_CC" - -fi -# end of the list of libraries that don't need to be checked for if -# tests and examples are disabled. - -# ------------------------------------------------------------------------ -# Checks for linker characteristics -# ------------------------------------------------------------------------ - -# Determine libraries needed for linking with Fortran -#AC_F77_LIBRARY_LDFLAGS - - -# ------------------------------------------------------------------------ -# Perform substitutions in output files -# ------------------------------------------------------------------------ - - - -# ------------------------------------------------------------------------ -# Output files -# ------------------------------------------------------------------------ -## -# You will need to change AC_CONFIG_FILES below and Makefile.am -# to add a new directory. -ac_config_files="$ac_config_files Makefile Makefile.export.threadpool src/Makefile test/Makefile" - - -cat >confcache <<\_ACEOF -# This file is a shell script that caches the results of configure -# tests run on this system so they can be shared between configure -# scripts and configure runs, see configure's option --config-cache. -# It is not useful on other systems. If it contains results you don't -# want to keep, you may remove or edit it. -# -# config.status only pays attention to the cache file if you give it -# the --recheck option to rerun configure. -# -# `ac_cv_env_foo' variables (set or unset) will be overridden when -# loading this file, other *unset* `ac_cv_foo' will be assigned the -# following values. - -_ACEOF - -# The following way of writing the cache mishandles newlines in values, -# but we know of no workaround that is simple, portable, and efficient. -# So, we kill variables containing newlines. -# Ultrix sh set writes to stderr and can't be redirected directly, -# and sets the high bit in the cache file unless we assign to the vars. -( - for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { echo "$as_me:$LINENO: WARNING: Cache variable $ac_var contains a newline." >&5 -echo "$as_me: WARNING: Cache variable $ac_var contains a newline." >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - *) $as_unset $ac_var ;; - esac ;; - esac - done - - (set) 2>&1 | - case $as_nl`(ac_space=' '; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - # `set' does not quote correctly, so add quotes (double-quote - # substitution turns \\\\ into \\, and sed turns \\ into \). 
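After a successful probe the macro substitutes PTHREAD_CC, PTHREAD_CFLAGS and PTHREAD_LIBS and folds them into CC, CFLAGS and LIBS, so later compiles and links in the package are thread-aware. A sketch of how a build rule consumes those results; the file names and values are placeholders:

# Use the probed settings the way the generated Makefiles do.
PTHREAD_CC=cc
PTHREAD_CFLAGS=-pthread
PTHREAD_LIBS=-lpthread
$PTHREAD_CC $PTHREAD_CFLAGS -c worker.c -o worker.o
$PTHREAD_CC $PTHREAD_CFLAGS -o worker_test worker.o $PTHREAD_LIBS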
- sed -n \ - "s/'/'\\\\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" - ;; #( - *) - # `set' quotes correctly as required by POSIX, so do not add quotes. - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) | - sed ' - /^ac_cv_env_/b end - t clear - :clear - s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ - t end - s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ - :end' >>confcache -if diff "$cache_file" confcache >/dev/null 2>&1; then :; else - if test -w "$cache_file"; then - test "x$cache_file" != "x/dev/null" && - { echo "$as_me:$LINENO: updating cache $cache_file" >&5 -echo "$as_me: updating cache $cache_file" >&6;} - cat confcache >$cache_file - else - { echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5 -echo "$as_me: not updating unwritable cache $cache_file" >&6;} - fi -fi -rm -f confcache - -test "x$prefix" = xNONE && prefix=$ac_default_prefix -# Let make expand exec_prefix. -test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' - -DEFS=-DHAVE_CONFIG_H - -ac_libobjs= -ac_ltlibobjs= -for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue - # 1. Remove the extension, and $U if already installed. - ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' - ac_i=`echo "$ac_i" | sed "$ac_script"` - # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR - # will be set to the directory where LIBOBJS objects are built. - ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext" - ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo' -done -LIBOBJS=$ac_libobjs - -LTLIBOBJS=$ac_ltlibobjs - - -if test -z "${MAINTAINER_MODE_TRUE}" && test -z "${MAINTAINER_MODE_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"MAINTAINER_MODE\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"MAINTAINER_MODE\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${HAVE_MPI_TRUE}" && test -z "${HAVE_MPI_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"HAVE_MPI\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"HAVE_MPI\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${AMDEP_TRUE}" && test -z "${AMDEP_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"AMDEP\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"AMDEP\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${am__fastdepCC_TRUE}" && test -z "${am__fastdepCC_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCC\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"am__fastdepCC\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${am__fastdepCXX_TRUE}" && test -z "${am__fastdepCXX_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"am__fastdepCXX\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"am__fastdepCXX\" was never defined. -Usually this means the macro was only invoked conditionally." 
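Each cached result is written to the cache file as VAR=${VAR=value} (the sed above adds the ${...=...} wrapper), so a later run started with --config-cache, or a config.status --recheck, can pre-load earlier answers without clobbering values already set in the environment. A three-line illustration of that shell idiom:

ac_cv_header_pthread_h=yes        # value already present, e.g. from the environment
: ${ac_cv_header_pthread_h=no}    # the form used in the cache file: assign only if unset
echo "$ac_cv_header_pthread_h"    # prints "yes"; the cached default does not override it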
>&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${USE_ALTERNATE_AR_TRUE}" && test -z "${USE_ALTERNATE_AR_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"USE_ALTERNATE_AR\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"USE_ALTERNATE_AR\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${USE_ALTERNATE_AR_TRUE}" && test -z "${USE_ALTERNATE_AR_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"USE_ALTERNATE_AR\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"USE_ALTERNATE_AR\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${USING_EXPORT_MAKEFILES_TRUE}" && test -z "${USING_EXPORT_MAKEFILES_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"USING_EXPORT_MAKEFILES\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"USING_EXPORT_MAKEFILES\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${USING_PERL_TRUE}" && test -z "${USING_PERL_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"USING_PERL\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"USING_PERL\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${USING_GNUMAKE_TRUE}" && test -z "${USING_GNUMAKE_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"USING_GNUMAKE\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"USING_GNUMAKE\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${BUILD_TESTS_TRUE}" && test -z "${BUILD_TESTS_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"BUILD_TESTS\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"BUILD_TESTS\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi -if test -z "${SUB_TEST_TRUE}" && test -z "${SUB_TEST_FALSE}"; then - { { echo "$as_me:$LINENO: error: conditional \"SUB_TEST\" was never defined. -Usually this means the macro was only invoked conditionally." >&5 -echo "$as_me: error: conditional \"SUB_TEST\" was never defined. -Usually this means the macro was only invoked conditionally." >&2;} - { (exit 1); exit 1; }; } -fi - -: ${CONFIG_STATUS=./config.status} -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 -echo "$as_me: creating $CONFIG_STATUS" >&6;} -cat >$CONFIG_STATUS <<_ACEOF -#! $SHELL -# Generated by $as_me. -# Run this file to recreate the current configuration. -# Compiler output produced by configure, useful for debugging -# configure, is in config.log if it exists. - -debug=false -ac_cs_recheck=false -ac_cs_silent=false -SHELL=\${CONFIG_SHELL-$SHELL} -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF -## --------------------- ## -## M4sh Initialization. 
## -## --------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Zsh 3.x and 4.x performs word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - - - -# PATH needs CR -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - -# The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then - echo "#! /bin/sh" >conf$$.sh - echo "exit 0" >>conf$$.sh - chmod +x conf$$.sh - if (PATH="/nonexistent;."; conf$$.sh) >/dev/null 2>&1; then - PATH_SEPARATOR=';' - else - PATH_SEPARATOR=: - fi - rm -f conf$$.sh -fi - -# Support unset when possible. -if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then - as_unset=unset -else - as_unset=false -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -as_nl=' -' -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -case $0 in - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break -done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - { (exit 1); exit 1; } -fi - -# Work around bugs in pre-3.0 UWIN ksh. -for as_var in ENV MAIL MAILPATH -do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -for as_var in \ - LANG LANGUAGE LC_ADDRESS LC_ALL LC_COLLATE LC_CTYPE LC_IDENTIFICATION \ - LC_MEASUREMENT LC_MESSAGES LC_MONETARY LC_NAME LC_NUMERIC LC_PAPER \ - LC_TELEPHONE LC_TIME -do - if (set +x; test -z "`(eval $as_var=C; export $as_var) 2>&1`"); then - eval $as_var=C; export $as_var - else - ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var - fi -done - -# Required to use basename. -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - - -# Name of the executable. -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# CDPATH. 
-$as_unset CDPATH - - - - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { - - # Create $as_me.lineno as a copy of $as_myself, but with $LINENO - # uniformly replaced by the line number. The first 'sed' inserts a - # line-number line after each line using $LINENO; the second 'sed' - # does the real work. The second script uses 'N' to pair each - # line-number line with the line containing $LINENO, and appends - # trailing '-' during substitution so that $LINENO is not a special - # case at line end. - # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the - # scripts with optimization help from Paolo Bonzini. Blame Lee - # E. McMahon (1931-1989) for sed's syntax. :-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 - { (exit 1); exit 1; }; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in --n*) - case `echo 'x\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. - *) ECHO_C='\c';; - esac;; -*) - ECHO_N='-n';; -esac - -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir -fi -echo >conf$$.file -if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' -elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - -if mkdir -p . 2>/dev/null; then - as_mkdir_p=: -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - -exec 6>&1 - -# Save the log message, to keep $[0] and so on meaningful, and to -# report actual input values of CONFIG_FILES etc. instead of their -# values after options handling. 
-ac_log=" -This file was extended by ThreadPool $as_me 1.1d, which was -generated by GNU Autoconf 2.61. Invocation command line was - - CONFIG_FILES = $CONFIG_FILES - CONFIG_HEADERS = $CONFIG_HEADERS - CONFIG_LINKS = $CONFIG_LINKS - CONFIG_COMMANDS = $CONFIG_COMMANDS - $ $0 $@ - -on `(hostname || uname -n) 2>/dev/null | sed 1q` -" - -_ACEOF - -cat >>$CONFIG_STATUS <<_ACEOF -# Files that config.status was made for. -config_files="$ac_config_files" -config_headers="$ac_config_headers" -config_commands="$ac_config_commands" - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF -ac_cs_usage="\ -\`$as_me' instantiates files from templates according to the -current configuration. - -Usage: $0 [OPTIONS] [FILE]... - - -h, --help print this help, then exit - -V, --version print version number and configuration settings, then exit - -q, --quiet do not print progress messages - -d, --debug don't remove temporary files - --recheck update $as_me by reconfiguring in the same conditions - --file=FILE[:TEMPLATE] - instantiate the configuration file FILE - --header=FILE[:TEMPLATE] - instantiate the configuration header FILE - -Configuration files: -$config_files - -Configuration headers: -$config_headers - -Configuration commands: -$config_commands - -Report bugs to ." - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF -ac_cs_version="\\ -ThreadPool config.status 1.1d -configured by $0, generated by GNU Autoconf 2.61, - with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" - -Copyright (C) 2006 Free Software Foundation, Inc. -This config.status script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it." - -ac_pwd='$ac_pwd' -srcdir='$srcdir' -INSTALL='$INSTALL' -MKDIR_P='$MKDIR_P' -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF -# If no file are specified by the user, then we need to provide default -# value. By we need to know if files were specified by the user. -ac_need_defaults=: -while test $# != 0 -do - case $1 in - --*=*) - ac_option=`expr "X$1" : 'X\([^=]*\)='` - ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` - ac_shift=: - ;; - *) - ac_option=$1 - ac_optarg=$2 - ac_shift=shift - ;; - esac - - case $ac_option in - # Handling of the options. - -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) - ac_cs_recheck=: ;; - --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) - echo "$ac_cs_version"; exit ;; - --debug | --debu | --deb | --de | --d | -d ) - debug=: ;; - --file | --fil | --fi | --f ) - $ac_shift - CONFIG_FILES="$CONFIG_FILES $ac_optarg" - ac_need_defaults=false;; - --header | --heade | --head | --hea ) - $ac_shift - CONFIG_HEADERS="$CONFIG_HEADERS $ac_optarg" - ac_need_defaults=false;; - --he | --h) - # Conflict between --help and --header - { echo "$as_me: error: ambiguous option: $1 -Try \`$0 --help' for more information." >&2 - { (exit 1); exit 1; }; };; - --help | --hel | -h ) - echo "$ac_cs_usage"; exit ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil | --si | --s) - ac_cs_silent=: ;; - - # This is an error. - -*) { echo "$as_me: error: unrecognized option: $1 -Try \`$0 --help' for more information." 
>&2 - { (exit 1); exit 1; }; } ;; - - *) ac_config_targets="$ac_config_targets $1" - ac_need_defaults=false ;; - - esac - shift -done - -ac_configure_extra_args= - -if $ac_cs_silent; then - exec 6>/dev/null - ac_configure_extra_args="$ac_configure_extra_args --silent" -fi - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF -if \$ac_cs_recheck; then - echo "running CONFIG_SHELL=$SHELL $SHELL $0 "$ac_configure_args \$ac_configure_extra_args " --no-create --no-recursion" >&6 - CONFIG_SHELL=$SHELL - export CONFIG_SHELL - exec $SHELL "$0"$ac_configure_args \$ac_configure_extra_args --no-create --no-recursion -fi - -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF -exec 5>>config.log -{ - echo - sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX -## Running $as_me. ## -_ASBOX - echo "$ac_log" -} >&5 - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF -# -# INIT-COMMANDS -# -AMDEP_TRUE="$AMDEP_TRUE" ac_aux_dir="$ac_aux_dir" - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF - -# Handling of arguments. -for ac_config_target in $ac_config_targets -do - case $ac_config_target in - "src/ThreadPool_config.h") CONFIG_HEADERS="$CONFIG_HEADERS src/ThreadPool_config.h:src/ThreadPool_config.h.in" ;; - "depfiles") CONFIG_COMMANDS="$CONFIG_COMMANDS depfiles" ;; - "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; - "Makefile.export.threadpool") CONFIG_FILES="$CONFIG_FILES Makefile.export.threadpool" ;; - "src/Makefile") CONFIG_FILES="$CONFIG_FILES src/Makefile" ;; - "test/Makefile") CONFIG_FILES="$CONFIG_FILES test/Makefile" ;; - - *) { { echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 -echo "$as_me: error: invalid argument: $ac_config_target" >&2;} - { (exit 1); exit 1; }; };; - esac -done - - -# If the user did not use the arguments to specify the items to instantiate, -# then the envvar interface is used. Set only those that are not. -# We use the long form for the default assignment because of an extremely -# bizarre bug on SunOS 4.1.3. -if $ac_need_defaults; then - test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files - test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers - test "${CONFIG_COMMANDS+set}" = set || CONFIG_COMMANDS=$config_commands -fi - -# Have a temporary directory for convenience. Make it in the build tree -# simply because there is no reason against having it here, and in addition, -# creating and moving files from /tmp can sometimes cause problems. -# Hook for its removal unless debugging. -# Note that there is a small window in which the directory will not be cleaned: -# after its creation but before its name has been assigned to `$tmp'. -$debug || -{ - tmp= - trap 'exit_status=$? - { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status -' 0 - trap '{ (exit 1); exit 1; }' 1 2 13 15 -} -# Create a (secure) tmp directory for tmp files. - -{ - tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && - test -n "$tmp" && test -d "$tmp" -} || -{ - tmp=./conf$$-$RANDOM - (umask 077 && mkdir "$tmp") -} || -{ - echo "$me: cannot create a temporary directory in ." >&2 - { (exit 1); exit 1; } -} - -# -# Set up the sed scripts for CONFIG_FILES section. -# - -# No need to generate the scripts if there are no CONFIG_FILES. 
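In practice the generated config.status is driven either with no arguments, which instantiates everything in the target list above, or with explicit targets and the options from the usage text. A few typical invocations against this package's targets:

./config.status                              # regenerate all Makefiles and headers
./config.status src/Makefile                 # regenerate a single file
./config.status src/ThreadPool_config.h      # regenerate just the config header
./config.status --recheck                    # rerun configure with the original arguments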
-# This happens for instance when ./config.status config.h -if test -n "$CONFIG_FILES"; then - -_ACEOF - - - -ac_delim='%!_!# ' -for ac_last_try in false false false false false :; do - cat >conf$$subs.sed <<_ACEOF -SHELL!$SHELL$ac_delim -PATH_SEPARATOR!$PATH_SEPARATOR$ac_delim -PACKAGE_NAME!$PACKAGE_NAME$ac_delim -PACKAGE_TARNAME!$PACKAGE_TARNAME$ac_delim -PACKAGE_VERSION!$PACKAGE_VERSION$ac_delim -PACKAGE_STRING!$PACKAGE_STRING$ac_delim -PACKAGE_BUGREPORT!$PACKAGE_BUGREPORT$ac_delim -exec_prefix!$exec_prefix$ac_delim -prefix!$prefix$ac_delim -program_transform_name!$program_transform_name$ac_delim -bindir!$bindir$ac_delim -sbindir!$sbindir$ac_delim -libexecdir!$libexecdir$ac_delim -datarootdir!$datarootdir$ac_delim -datadir!$datadir$ac_delim -sysconfdir!$sysconfdir$ac_delim -sharedstatedir!$sharedstatedir$ac_delim -localstatedir!$localstatedir$ac_delim -includedir!$includedir$ac_delim -oldincludedir!$oldincludedir$ac_delim -docdir!$docdir$ac_delim -infodir!$infodir$ac_delim -htmldir!$htmldir$ac_delim -dvidir!$dvidir$ac_delim -pdfdir!$pdfdir$ac_delim -psdir!$psdir$ac_delim -libdir!$libdir$ac_delim -localedir!$localedir$ac_delim -mandir!$mandir$ac_delim -DEFS!$DEFS$ac_delim -ECHO_C!$ECHO_C$ac_delim -ECHO_N!$ECHO_N$ac_delim -ECHO_T!$ECHO_T$ac_delim -LIBS!$LIBS$ac_delim -build_alias!$build_alias$ac_delim -host_alias!$host_alias$ac_delim -target_alias!$target_alias$ac_delim -MAINTAINER_MODE_TRUE!$MAINTAINER_MODE_TRUE$ac_delim -MAINTAINER_MODE_FALSE!$MAINTAINER_MODE_FALSE$ac_delim -MAINT!$MAINT$ac_delim -build!$build$ac_delim -build_cpu!$build_cpu$ac_delim -build_vendor!$build_vendor$ac_delim -build_os!$build_os$ac_delim -host!$host$ac_delim -host_cpu!$host_cpu$ac_delim -host_vendor!$host_vendor$ac_delim -host_os!$host_os$ac_delim -target!$target$ac_delim -target_cpu!$target_cpu$ac_delim -target_vendor!$target_vendor$ac_delim -target_os!$target_os$ac_delim -INSTALL_PROGRAM!$INSTALL_PROGRAM$ac_delim -INSTALL_SCRIPT!$INSTALL_SCRIPT$ac_delim -INSTALL_DATA!$INSTALL_DATA$ac_delim -am__isrc!$am__isrc$ac_delim -CYGPATH_W!$CYGPATH_W$ac_delim -PACKAGE!$PACKAGE$ac_delim -VERSION!$VERSION$ac_delim -ACLOCAL!$ACLOCAL$ac_delim -AUTOCONF!$AUTOCONF$ac_delim -AUTOMAKE!$AUTOMAKE$ac_delim -AUTOHEADER!$AUTOHEADER$ac_delim -MAKEINFO!$MAKEINFO$ac_delim -install_sh!$install_sh$ac_delim -STRIP!$STRIP$ac_delim -INSTALL_STRIP_PROGRAM!$INSTALL_STRIP_PROGRAM$ac_delim -mkdir_p!$mkdir_p$ac_delim -AWK!$AWK$ac_delim -SET_MAKE!$SET_MAKE$ac_delim -am__leading_dot!$am__leading_dot$ac_delim -AMTAR!$AMTAR$ac_delim -am__tar!$am__tar$ac_delim -am__untar!$am__untar$ac_delim -MPI_TEMP_CXX!$MPI_TEMP_CXX$ac_delim -MPI_CXX!$MPI_CXX$ac_delim -HAVE_MPI_TRUE!$HAVE_MPI_TRUE$ac_delim -HAVE_MPI_FALSE!$HAVE_MPI_FALSE$ac_delim -MPI_CXX_EXISTS!$MPI_CXX_EXISTS$ac_delim -MPI_CC_EXISTS!$MPI_CC_EXISTS$ac_delim -MPI_F77_EXISTS!$MPI_F77_EXISTS$ac_delim -CC!$CC$ac_delim -CFLAGS!$CFLAGS$ac_delim -LDFLAGS!$LDFLAGS$ac_delim -CPPFLAGS!$CPPFLAGS$ac_delim -ac_ct_CC!$ac_ct_CC$ac_delim -EXEEXT!$EXEEXT$ac_delim -OBJEXT!$OBJEXT$ac_delim -DEPDIR!$DEPDIR$ac_delim -am__include!$am__include$ac_delim -am__quote!$am__quote$ac_delim -AMDEP_TRUE!$AMDEP_TRUE$ac_delim -AMDEP_FALSE!$AMDEP_FALSE$ac_delim -AMDEPBACKSLASH!$AMDEPBACKSLASH$ac_delim -CCDEPMODE!$CCDEPMODE$ac_delim -am__fastdepCC_TRUE!$am__fastdepCC_TRUE$ac_delim -am__fastdepCC_FALSE!$am__fastdepCC_FALSE$ac_delim -_ACEOF - - if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then - break - elif $ac_last_try; then - { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 -echo 
"$as_me: error: could not make $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } - else - ac_delim="$ac_delim!$ac_delim _$ac_delim!! " - fi -done - -ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed` -if test -n "$ac_eof"; then - ac_eof=`echo "$ac_eof" | sort -nru | sed 1q` - ac_eof=`expr $ac_eof + 1` -fi - -cat >>$CONFIG_STATUS <<_ACEOF -cat >"\$tmp/subs-1.sed" <<\CEOF$ac_eof -/@[a-zA-Z_][a-zA-Z_0-9]*@/!b -_ACEOF -sed ' -s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g -s/^/s,@/; s/!/@,|#_!!_#|/ -:n -t n -s/'"$ac_delim"'$/,g/; t -s/$/\\/; p -N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n -' >>$CONFIG_STATUS >$CONFIG_STATUS <<_ACEOF -CEOF$ac_eof -_ACEOF - - -ac_delim='%!_!# ' -for ac_last_try in false false false false false :; do - cat >conf$$subs.sed <<_ACEOF -CXX!$CXX$ac_delim -CXXFLAGS!$CXXFLAGS$ac_delim -ac_ct_CXX!$ac_ct_CXX$ac_delim -CXXDEPMODE!$CXXDEPMODE$ac_delim -am__fastdepCXX_TRUE!$am__fastdepCXX_TRUE$ac_delim -am__fastdepCXX_FALSE!$am__fastdepCXX_FALSE$ac_delim -RANLIB!$RANLIB$ac_delim -USE_ALTERNATE_AR_TRUE!$USE_ALTERNATE_AR_TRUE$ac_delim -USE_ALTERNATE_AR_FALSE!$USE_ALTERNATE_AR_FALSE$ac_delim -ALTERNATE_AR!$ALTERNATE_AR$ac_delim -CXXCPP!$CXXCPP$ac_delim -USING_EXPORT_MAKEFILES_TRUE!$USING_EXPORT_MAKEFILES_TRUE$ac_delim -USING_EXPORT_MAKEFILES_FALSE!$USING_EXPORT_MAKEFILES_FALSE$ac_delim -PERL_EXE!$PERL_EXE$ac_delim -HAVE_PERL!$HAVE_PERL$ac_delim -USING_PERL_TRUE!$USING_PERL_TRUE$ac_delim -USING_PERL_FALSE!$USING_PERL_FALSE$ac_delim -USING_GNUMAKE_TRUE!$USING_GNUMAKE_TRUE$ac_delim -USING_GNUMAKE_FALSE!$USING_GNUMAKE_FALSE$ac_delim -BUILD_TESTS_TRUE!$BUILD_TESTS_TRUE$ac_delim -BUILD_TESTS_FALSE!$BUILD_TESTS_FALSE$ac_delim -SUB_TEST_TRUE!$SUB_TEST_TRUE$ac_delim -SUB_TEST_FALSE!$SUB_TEST_FALSE$ac_delim -GREP!$GREP$ac_delim -EGREP!$EGREP$ac_delim -PTHREAD_CC!$PTHREAD_CC$ac_delim -PTHREAD_LIBS!$PTHREAD_LIBS$ac_delim -PTHREAD_CFLAGS!$PTHREAD_CFLAGS$ac_delim -ac_aux_dir!$ac_aux_dir$ac_delim -LIBOBJS!$LIBOBJS$ac_delim -LTLIBOBJS!$LTLIBOBJS$ac_delim -_ACEOF - - if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 31; then - break - elif $ac_last_try; then - { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 -echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } - else - ac_delim="$ac_delim!$ac_delim _$ac_delim!! " - fi -done - -ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed` -if test -n "$ac_eof"; then - ac_eof=`echo "$ac_eof" | sort -nru | sed 1q` - ac_eof=`expr $ac_eof + 1` -fi - -cat >>$CONFIG_STATUS <<_ACEOF -cat >"\$tmp/subs-2.sed" <<\CEOF$ac_eof -/@[a-zA-Z_][a-zA-Z_0-9]*@/!b end -_ACEOF -sed ' -s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g -s/^/s,@/; s/!/@,|#_!!_#|/ -:n -t n -s/'"$ac_delim"'$/,g/; t -s/$/\\/; p -N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n -' >>$CONFIG_STATUS >$CONFIG_STATUS <<_ACEOF -:end -s/|#_!!_#|//g -CEOF$ac_eof -_ACEOF - - -# VPATH may cause trouble with some makes, so we remove $(srcdir), -# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and -# trailing colons and then remove the whole line if VPATH becomes empty -# (actually we leave an empty line to preserve line numbers). 
-if test "x$srcdir" = x.; then - ac_vpsub='/^[ ]*VPATH[ ]*=/{ -s/:*\$(srcdir):*/:/ -s/:*\${srcdir}:*/:/ -s/:*@srcdir@:*/:/ -s/^\([^=]*=[ ]*\):*/\1/ -s/:*$// -s/^[^=]*=[ ]*$// -}' -fi - -cat >>$CONFIG_STATUS <<\_ACEOF -fi # test -n "$CONFIG_FILES" - - -for ac_tag in :F $CONFIG_FILES :H $CONFIG_HEADERS :C $CONFIG_COMMANDS -do - case $ac_tag in - :[FHLC]) ac_mode=$ac_tag; continue;; - esac - case $ac_mode$ac_tag in - :[FHL]*:*);; - :L* | :C*:*) { { echo "$as_me:$LINENO: error: Invalid tag $ac_tag." >&5 -echo "$as_me: error: Invalid tag $ac_tag." >&2;} - { (exit 1); exit 1; }; };; - :[FH]-) ac_tag=-:-;; - :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; - esac - ac_save_IFS=$IFS - IFS=: - set x $ac_tag - IFS=$ac_save_IFS - shift - ac_file=$1 - shift - - case $ac_mode in - :L) ac_source=$1;; - :[FH]) - ac_file_inputs= - for ac_f - do - case $ac_f in - -) ac_f="$tmp/stdin";; - *) # Look for the file first in the build tree, then in the source tree - # (if the path is not absolute). The absolute path cannot be DOS-style, - # because $ac_f cannot contain `:'. - test -f "$ac_f" || - case $ac_f in - [\\/$]*) false;; - *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; - esac || - { { echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5 -echo "$as_me: error: cannot find input file: $ac_f" >&2;} - { (exit 1); exit 1; }; };; - esac - ac_file_inputs="$ac_file_inputs $ac_f" - done - - # Let's still pretend it is `configure' which instantiates (i.e., don't - # use $as_me), people would be surprised to read: - # /* config.h. Generated by config.status. */ - configure_input="Generated from "`IFS=: - echo $* | sed 's|^[^:]*/||;s|:[^:]*/|, |g'`" by configure." - if test x"$ac_file" != x-; then - configure_input="$ac_file. $configure_input" - { echo "$as_me:$LINENO: creating $ac_file" >&5 -echo "$as_me: creating $ac_file" >&6;} - fi - - case $ac_tag in - *:-:* | *:-) cat >"$tmp/stdin";; - esac - ;; - esac - - ac_dir=`$as_dirname -- "$ac_file" || -$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$ac_file" : 'X\(//\)[^/]' \| \ - X"$ac_file" : 'X\(//\)$' \| \ - X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || -echo X"$ac_file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - { as_dir="$ac_dir" - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || { { echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5 -echo "$as_me: error: cannot create directory $as_dir" >&2;} - { (exit 1); exit 1; }; }; } - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`echo "$ac_dir" | sed 's,^\.[\\/],,'` - # A ".." for each directory in $ac_dir_suffix. 
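The substitution tables assembled above are compiled into sed scripts of the form s,@NAME@,value,g (kept in subs-1.sed and subs-2.sed so no single sed script grows too large), and the next step pipes every template through them. A toy version of that pass, with made-up names and values:

# Miniature version of the @VAR@ -> value pass that config.status performs.
prefix=/usr/local
PTHREAD_LIBS=-lpthread
cat > subs.sed <<EOF
s,@prefix@,$prefix,g
s,@PTHREAD_LIBS@,$PTHREAD_LIBS,g
EOF
cat > Makefile.in <<'EOF'
prefix = @prefix@
LIBS   = @PTHREAD_LIBS@ @LIBS@
EOF
sed -f subs.sed Makefile.in      # @LIBS@ survives this pass; a later script handles it
rm -f subs.sed Makefile.in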
- ac_top_builddir_sub=`echo "$ac_dir_suffix" | sed 's,/[^\\/]*,/..,g;s,/,,'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - - case $ac_mode in - :F) - # - # CONFIG_FILE - # - - case $INSTALL in - [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;; - *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;; - esac - ac_MKDIR_P=$MKDIR_P - case $MKDIR_P in - [\\/$]* | ?:[\\/]* ) ;; - */*) ac_MKDIR_P=$ac_top_build_prefix$MKDIR_P ;; - esac -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF -# If the template does not know about datarootdir, expand it. -# FIXME: This hack should be removed a few years after 2.60. -ac_datarootdir_hack=; ac_datarootdir_seen= - -case `sed -n '/datarootdir/ { - p - q -} -/@datadir@/p -/@docdir@/p -/@infodir@/p -/@localedir@/p -/@mandir@/p -' $ac_file_inputs` in -*datarootdir*) ac_datarootdir_seen=yes;; -*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 -echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF - ac_datarootdir_hack=' - s&@datadir@&$datadir&g - s&@docdir@&$docdir&g - s&@infodir@&$infodir&g - s&@localedir@&$localedir&g - s&@mandir@&$mandir&g - s&\\\${datarootdir}&$datarootdir&g' ;; -esac -_ACEOF - -# Neutralize VPATH when `$srcdir' = `.'. -# Shell code in configure.ac might set extrasub. -# FIXME: do we really want to maintain this feature? -cat >>$CONFIG_STATUS <<_ACEOF - sed "$ac_vpsub -$extrasub -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF -:t -/@[a-zA-Z_][a-zA-Z_0-9]*@/!b -s&@configure_input@&$configure_input&;t t -s&@top_builddir@&$ac_top_builddir_sub&;t t -s&@srcdir@&$ac_srcdir&;t t -s&@abs_srcdir@&$ac_abs_srcdir&;t t -s&@top_srcdir@&$ac_top_srcdir&;t t -s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t -s&@builddir@&$ac_builddir&;t t -s&@abs_builddir@&$ac_abs_builddir&;t t -s&@abs_top_builddir@&$ac_abs_top_builddir&;t t -s&@INSTALL@&$ac_INSTALL&;t t -s&@MKDIR_P@&$ac_MKDIR_P&;t t -$ac_datarootdir_hack -" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" >$tmp/out - -test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && - { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } && - { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } && - { echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined." >&5 -echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined." 
>&2;} - - rm -f "$tmp/stdin" - case $ac_file in - -) cat "$tmp/out"; rm -f "$tmp/out";; - *) rm -f "$ac_file"; mv "$tmp/out" $ac_file;; - esac - ;; - :H) - # - # CONFIG_HEADER - # -_ACEOF - -# Transform confdefs.h into a sed script `conftest.defines', that -# substitutes the proper values into config.h.in to produce config.h. -rm -f conftest.defines conftest.tail -# First, append a space to every undef/define line, to ease matching. -echo 's/$/ /' >conftest.defines -# Then, protect against being on the right side of a sed subst, or in -# an unquoted here document, in config.status. If some macros were -# called several times there might be several #defines for the same -# symbol, which is useless. But do not sort them, since the last -# AC_DEFINE must be honored. -ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]* -# These sed commands are passed to sed as "A NAME B PARAMS C VALUE D", where -# NAME is the cpp macro being defined, VALUE is the value it is being given. -# PARAMS is the parameter list in the macro definition--in most cases, it's -# just an empty string. -ac_dA='s,^\\([ #]*\\)[^ ]*\\([ ]*' -ac_dB='\\)[ (].*,\\1define\\2' -ac_dC=' ' -ac_dD=' ,' - -uniq confdefs.h | - sed -n ' - t rset - :rset - s/^[ ]*#[ ]*define[ ][ ]*// - t ok - d - :ok - s/[\\&,]/\\&/g - s/^\('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/ '"$ac_dA"'\1'"$ac_dB"'\2'"${ac_dC}"'\3'"$ac_dD"'/p - s/^\('"$ac_word_re"'\)[ ]*\(.*\)/'"$ac_dA"'\1'"$ac_dB$ac_dC"'\2'"$ac_dD"'/p - ' >>conftest.defines - -# Remove the space that was appended to ease matching. -# Then replace #undef with comments. This is necessary, for -# example, in the case of _POSIX_SOURCE, which is predefined and required -# on some systems where configure will not decide to define it. -# (The regexp can be short, since the line contains either #define or #undef.) -echo 's/ $// -s,^[ #]*u.*,/* & */,' >>conftest.defines - -# Break up conftest.defines: -ac_max_sed_lines=50 - -# First sed command is: sed -f defines.sed $ac_file_inputs >"$tmp/out1" -# Second one is: sed -f defines.sed "$tmp/out1" >"$tmp/out2" -# Third one will be: sed -f defines.sed "$tmp/out2" >"$tmp/out1" -# et cetera. -ac_in='$ac_file_inputs' -ac_out='"$tmp/out1"' -ac_nxt='"$tmp/out2"' - -while : -do - # Write a here document: - cat >>$CONFIG_STATUS <<_ACEOF - # First, check the format of the line: - cat >"\$tmp/defines.sed" <<\\CEOF -/^[ ]*#[ ]*undef[ ][ ]*$ac_word_re[ ]*\$/b def -/^[ ]*#[ ]*define[ ][ ]*$ac_word_re[( ]/b def -b -:def -_ACEOF - sed ${ac_max_sed_lines}q conftest.defines >>$CONFIG_STATUS - echo 'CEOF - sed -f "$tmp/defines.sed"' "$ac_in >$ac_out" >>$CONFIG_STATUS - ac_in=$ac_out; ac_out=$ac_nxt; ac_nxt=$ac_in - sed 1,${ac_max_sed_lines}d conftest.defines >conftest.tail - grep . conftest.tail >/dev/null || break - rm -f conftest.defines - mv conftest.tail conftest.defines -done -rm -f conftest.defines conftest.tail - -echo "ac_result=$ac_in" >>$CONFIG_STATUS -cat >>$CONFIG_STATUS <<\_ACEOF - if test x"$ac_file" != x-; then - echo "/* $configure_input */" >"$tmp/config.h" - cat "$ac_result" >>"$tmp/config.h" - if diff $ac_file "$tmp/config.h" >/dev/null 2>&1; then - { echo "$as_me:$LINENO: $ac_file is unchanged" >&5 -echo "$as_me: $ac_file is unchanged" >&6;} - else - rm -f $ac_file - mv "$tmp/config.h" $ac_file - fi - else - echo "/* $configure_input */" - cat "$ac_result" - fi - rm -f "$tmp/out12" -# Compute $ac_file's index in $config_headers. 
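For CONFIG_HEADER targets the transformation runs the other way: every #define collected in confdefs.h becomes a sed command that rewrites the matching #undef line of the template, and any #undef left untouched is commented out. A compact sketch of that rewrite with illustrative macro values:

# Turn configure's accumulated defines into a config.h, as config.status does.
cat > confdefs.h <<'EOF'
#define HAVE_PTHREAD 1
#define STDC_HEADERS 1
EOF
cat > config.h.in <<'EOF'
#undef HAVE_PTHREAD
#undef STDC_HEADERS
#undef HAVE_UNISTD_H
EOF
defines=`sed 's|^#define \([A-Za-z_][A-Za-z_0-9]*\) \(.*\)|s,^#undef \1$,#define \1 \2,|' confdefs.h`
sed "$defines
s,^#undef .*,/* & */," config.h.in > config.h
cat config.h                      # HAVE_UNISTD_H stays as a commented-out #undef
rm -f confdefs.h config.h.in config.h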
-_am_stamp_count=1 -for _am_header in $config_headers :; do - case $_am_header in - $ac_file | $ac_file:* ) - break ;; - * ) - _am_stamp_count=`expr $_am_stamp_count + 1` ;; - esac -done -echo "timestamp for $ac_file" >`$as_dirname -- $ac_file || -$as_expr X$ac_file : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X$ac_file : 'X\(//\)[^/]' \| \ - X$ac_file : 'X\(//\)$' \| \ - X$ac_file : 'X\(/\)' \| . 2>/dev/null || -echo X$ac_file | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'`/stamp-h$_am_stamp_count - ;; - - :C) { echo "$as_me:$LINENO: executing $ac_file commands" >&5 -echo "$as_me: executing $ac_file commands" >&6;} - ;; - esac - - - case $ac_file$ac_mode in - "depfiles":C) test x"$AMDEP_TRUE" != x"" || for mf in $CONFIG_FILES; do - # Strip MF so we end up with the name of the file. - mf=`echo "$mf" | sed -e 's/:.*$//'` - # Check whether this is an Automake generated Makefile or not. - # We used to match only the files named `Makefile.in', but - # some people rename them; so instead we look at the file content. - # Grep'ing the first line is not enough: some people post-process - # each Makefile.in and add a new line on top of each file to say so. - # Grep'ing the whole file is not good either: AIX grep has a line - # limit of 2048, but all sed's we know have understand at least 4000. - if sed 10q "$mf" | grep '^#.*generated by automake' > /dev/null 2>&1; then - dirpart=`$as_dirname -- "$mf" || -$as_expr X"$mf" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$mf" : 'X\(//\)[^/]' \| \ - X"$mf" : 'X\(//\)$' \| \ - X"$mf" : 'X\(/\)' \| . 2>/dev/null || -echo X"$mf" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - else - continue - fi - # Extract the definition of DEPDIR, am__include, and am__quote - # from the Makefile without running `make'. - DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"` - test -z "$DEPDIR" && continue - am__include=`sed -n 's/^am__include = //p' < "$mf"` - test -z "am__include" && continue - am__quote=`sed -n 's/^am__quote = //p' < "$mf"` - # When using ansi2knr, U may be empty or an underscore; expand it - U=`sed -n 's/^U = //p' < "$mf"` - # Find all dependency output files, they are included files with - # $(DEPDIR) in their names. We invoke sed twice because it is the - # simplest approach to changing $(DEPDIR) to its actual value in the - # expansion. - for file in `sed -n " - s/^$am__include $am__quote\(.*(DEPDIR).*\)$am__quote"'$/\1/p' <"$mf" | \ - sed -e 's/\$(DEPDIR)/'"$DEPDIR"'/g' -e 's/\$U/'"$U"'/g'`; do - # Make sure the directory exists. - test -f "$dirpart/$file" && continue - fdir=`$as_dirname -- "$file" || -$as_expr X"$file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$file" : 'X\(//\)[^/]' \| \ - X"$file" : 'X\(//\)$' \| \ - X"$file" : 'X\(/\)' \| . 
2>/dev/null || -echo X"$file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - { as_dir=$dirpart/$fdir - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || { { echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5 -echo "$as_me: error: cannot create directory $as_dir" >&2;} - { (exit 1); exit 1; }; }; } - # echo "creating $dirpart/$file" - echo '# dummy' > "$dirpart/$file" - done -done - ;; - - esac -done # for ac_tag - - -{ (exit 0); exit 0; } -_ACEOF -chmod +x $CONFIG_STATUS -ac_clean_files=$ac_clean_files_save - - -# configure is writing to config.log, and then calls config.status. -# config.status does its own redirection, appending to config.log. -# Unfortunately, on DOS this fails, as config.log is still kept open -# by configure, so config.status won't be able to write to it; its -# output is simply discarded. So we exec the FD to /dev/null, -# effectively closing config.log, so it can be properly (re)opened and -# appended to by config.status. When coming back to configure, we -# need to make the FD available again. -if test "$no_create" != yes; then - ac_cs_success=: - ac_config_status_args= - test "$silent" = yes && - ac_config_status_args="$ac_config_status_args --quiet" - exec 5>/dev/null - $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false - exec 5>>config.log - # Use ||, not &&, to avoid exiting from the if with $? = 1, which - # would make configure fail if this is the last instruction. - $ac_cs_success || { (exit 1); exit 1; } -fi - - -# Bye World! -echo "---------------------------------------------" -echo "Finished Running ThreadPool Configure Script" -echo "---------------------------------------------" diff --git a/kokkos/basic/optional/ThreadPool/configure.ac b/kokkos/basic/optional/ThreadPool/configure.ac deleted file mode 100644 index 12778f4..0000000 --- a/kokkos/basic/optional/ThreadPool/configure.ac +++ /dev/null @@ -1,240 +0,0 @@ -# ------------------------------------------------------------------------ -# Process this file with autoconf to produce a configure script. -# ------------------------------------------------------------------------ - -# @HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. 
-# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -# @HEADER - -# ------------------------------------------------------------------------ -# Initialization -# ------------------------------------------------------------------------ - -# This must be the first line in configure.ac. -# Optional 3rd argument is email address for bugs. - -# #np# - package name, version number, and e-mail address below -AC_INIT(ThreadPool, 1.1d, hcedwar@sandia.gov) - -# Hello World! -echo "----------------------------------------" -echo "Running ThreadPool Configure Script" -echo "----------------------------------------" - -# This is to protect against accidentally specifying the wrong -# directory with --srcdir. Any file in that directory will do, -# preferably one that is unlikely to be removed or renamed. - -AC_CONFIG_SRCDIR([src/TPI.c]) - -# Specify directory for auxiliary build tools (e.g., install-sh, -# config.sub, config.guess) and M4 files. - -AC_CONFIG_AUX_DIR(config) -# #auto np# - Change file names in next line -# Configure should create src/ThreadPool_config.h from src/ThreadPool_config.h.in - -AM_CONFIG_HEADER(src/ThreadPool_config.h:src/ThreadPool_config.h.in) - -# Allow users to specify their own "install" command. If none is specified, -# the default is install-sh found in the config subdirectory. - -AC_ARG_WITH(install, - [AC_HELP_STRING([--with-install=INSTALL_PROGRAM], - [Use the installation program INSTALL_PROGRAM rather than the default that is provided. For example --with-install="/path/install -p"])], - [ - INSTALL=$withval - INSTALL_PROGRAM=$withval - INSTALL_SCRIPT=$withval - INSTALL_DATA="$withval -m 644" - ],) - -# AM_MAINTAINER_MODE turns off maintainer-only makefile targets by -# default, and changes configure to understand a -# --enable-maintainer-mode option. --enable-maintainer-mode turns the -# maintainer-only targets back on. The maintainer-only makefile -# targets permit end users to clean automatically-generated files such -# as configure, which means they have to have autoconf and automake -# installed to repair the damage. AM_MAINTAINER_MODE makes it a bit -# harder for users to shoot themselves in the foot. - -AM_MAINTAINER_MODE - -# Define $build, $host, $target, etc - -AC_CANONICAL_TARGET - -# Use automake - -# - Required version of automake. -AM_INIT_AUTOMAKE(1.10 no-define tar-ustar) - -# Specify required version of autoconf. 
- -AC_PREREQ(2.61) - -# ------------------------------------------------------------------------ -# Check to see if MPI enabled and if any special configuration done -# ------------------------------------------------------------------------ - -TAC_ARG_CONFIG_MPI - -# #np# - can eliminate compiler checks below if your package does not use the -# language corresponding to the check. Please note that if you use -# F77_FUNC to determine Fortran name mangling, you should not remove -# the Fortran compiler check or the check for Fortran flags. Doing -# so will prevent the detection of the proper name mangling in some -# cases. -# ------------------------------------------------------------------------ -# Checks for programs -# ------------------------------------------------------------------------ - -AC_PROG_CC(cc gcc) -AC_PROG_CXX(CC g++ c++ cxx) -#AC_PROG_F77(f77 g77 gfortran f90 xlf90 f95) -AC_PROG_RANLIB - -# Check if --with-flags present, prepend any specs to FLAGS - -TAC_ARG_WITH_FLAGS(ccflags, CCFLAGS) -TAC_ARG_WITH_FLAGS(cxxflags, CXXFLAGS) -TAC_ARG_WITH_FLAGS(cflags, CFLAGS) -#TAC_ARG_WITH_FLAGS(fflags, FFLAGS) -TAC_ARG_WITH_LIBS -TAC_ARG_WITH_FLAGS(ldflags, LDFLAGS) - -# ------------------------------------------------------------------------ -# Alternate archiver -# ------------------------------------------------------------------------ - -TAC_ARG_WITH_AR - -# ------------------------------------------------------------------------ -# MPI link check -# ------------------------------------------------------------------------ -TAC_ARG_CHECK_MPI - -# ------------------------------------------------------------------------ -# Checks for Makefile.export related systems -# ------------------------------------------------------------------------ -TAC_ARG_ENABLE_EXPORT_MAKEFILES(yes) - -# ------------------------------------------------------------------------ -# Checks if tests and examples should be built -# ------------------------------------------------------------------------ - -# #np# - These options can disable the tests and examples of a package. -# #np# - Packages that do not have tests or examples should #-out the -# #np# - option(s) that does (do) not apply. - -TAC_ARG_ENABLE_FEATURE(tests, [Make tests for all Trilinos packages buildable with 'make tests'], TESTS, yes) -TAC_ARG_ENABLE_FEATURE_SUB_CHECK( threadpool, tests, [Make ThreadPool tests buildable with 'make tests'], NEW_PACKAGE_TESTS) -AM_CONDITIONAL(BUILD_TESTS, test "X$ac_cv_use_threadpool_tests" != "Xno") - -#TAC_ARG_ENABLE_FEATURE(examples, [Make examples for all Trilinos packages buildable with 'make examples'], EXAMPLES, yes) -#TAC_ARG_ENABLE_FEATURE_SUB_CHECK( new_package, examples, [Make New_Package examples buildable with 'make examples'], NEW_PACKAGE_EXAMPLES) -#AM_CONDITIONAL(BUILD_EXAMPLES, test "X$ac_cv_use_new_package_examples" != "Xno") - -#We now build tests and examples through separate make targets, rather than -#during "make". We still need to conditionally include the test and example -#in SUBDIRS, even though SUB_TEST and SUB_EXAMPLE will never be -#defined, so that the tests and examples are included in the distribution -#tarball. -AM_CONDITIONAL(SUB_TEST, test "X$ac_cv_use_sub_test" = "Xyes") -#AM_CONDITIONAL(SUB_EXAMPLE, test "X$ac_cv_use_sub_example" = "Xyes") - -TAC_ARG_ENABLE_FEATURE(libcheck, [Check for some third-party libraries. 
(Cannot be disabled unless tests and examples are also disabled.)], LIBCHECK, yes) - -# ------------------------------------------------------------------------ -# Specify other directories -# ------------------------------------------------------------------------ - -# enable use of --with-libdirs="-Llibdir1 -Llibdir2 ..." to prepend to LDFLAGS -TAC_ARG_WITH_LIBDIRS -# enable use of --with-incdirs="-Lincdir1 -Lincdir2 ..." to prepend to CPPFLAGS -TAC_ARG_WITH_INCDIRS - -# #np# - Yet another opportunity to remove code if you aren't -# using Fortran -# Define F77_FUNC that will be used to link with Fortran subroutines. - trash WORKGXX -#AC_F77_WRAPPERS - -# ------------------------------------------------------------------------ -# Checks for libraries -# ------------------------------------------------------------------------ - -# If tests, examples and libcheck are disabled, we don't have to check -# for these libraries. - -# #np# - -# If a package does not have tests or examples, the corresponding check(s) -# should be pulled out of the "if" statement below. -#if test "X$ac_cv_use_new_package_examples" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then -if test "X$ac_cv_use_threadpool_tests" != "Xno" || test "X$ac_cv_use_libcheck" != "Xno"; then - -ACX_PTHREAD -LIBS="$PTHREAD_LIBS $LIBS" -CFLAGS="$CFLAGS $PTHREAD_CFLAGS" -CC="$PTHREAD_CC" - -fi -# end of the list of libraries that don't need to be checked for if -# tests and examples are disabled. - -# ------------------------------------------------------------------------ -# Checks for linker characteristics -# ------------------------------------------------------------------------ - -# Determine libraries needed for linking with Fortran -#AC_F77_LIBRARY_LDFLAGS - - -# ------------------------------------------------------------------------ -# Perform substitutions in output files -# ------------------------------------------------------------------------ - -AC_SUBST(ac_aux_dir) - -# ------------------------------------------------------------------------ -# Output files -# ------------------------------------------------------------------------ -## -# You will need to change AC_CONFIG_FILES below and Makefile.am -# to add a new directory. -AC_CONFIG_FILES([ - Makefile - Makefile.export.threadpool - src/Makefile - test/Makefile - ]) - -AC_OUTPUT() - -# Bye World! 
-echo "---------------------------------------------" -echo "Finished Running ThreadPool Configure Script" -echo "---------------------------------------------" diff --git a/kokkos/basic/optional/ThreadPool/src/CMakeLists.txt b/kokkos/basic/optional/ThreadPool/src/CMakeLists.txt deleted file mode 100644 index 41a1f39..0000000 --- a/kokkos/basic/optional/ThreadPool/src/CMakeLists.txt +++ /dev/null @@ -1,70 +0,0 @@ - -INCLUDE(PackageLibraryMacros) - -# -# A) Package-specific configuration options -# - -PACKAGE_CONFIGURE_FILE(${PACKAGE_NAME}_config.h) - -# -# B) Define the header and source files (and directories) -# - -# -# src -# - -SET(HEADERS "") -SET(SOURCES "") - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) - -SET(HEADERS ${HEADERS} - ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h - ) - -# -# Core files -# - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) - -APPEND_SET(HEADERS - TPI.h - TPI.hpp - ) - -APPEND_SET(SOURCES - TPI.c - ) - -# -# Util files -# -APPEND_SET(SOURCES - TPI_Walltime.c - ) - -###################################### - -APPEND_SET(HEADERS - ) - -APPEND_SET(SOURCES - ) - -###################################### -IF (TPL_ENABLE_MPI) -ENDIF() - -# -# C) Define the targets for package's library(s) -# - -PACKAGE_ADD_LIBRARY( - tpi - HEADERS ${HEADERS} - SOURCES ${SOURCES} - ) diff --git a/kokkos/basic/optional/ThreadPool/src/Makefile.am b/kokkos/basic/optional/ThreadPool/src/Makefile.am deleted file mode 100644 index 44c1621..0000000 --- a/kokkos/basic/optional/ThreadPool/src/Makefile.am +++ /dev/null @@ -1,140 +0,0 @@ -# @HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. -# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -# @HEADER - -# The following line helps the test harness recover from build errors. - -all-local: - @echo "" - @echo "Trilinos package ThreadPool subdirectory src built successfully." - @echo "" - -# ------------------------------------------------------------------------ -# For each category, create two variables - NAME and NAME_H. The -# second is the list of headers to be installed, i.e., any header that -# might someday be needed by some other code outside New_Package. The first is -# the list of all source and any other header files. -# ------------------------------------------------------------------------ - -#np# Make sure to list all source files in one of the following categories. 
- -CORE = $(srcdir)/TPI.c - -CORE_H = \ - $(srcdir)/TPI.h \ - $(srcdir)/TPI.hpp - -UTIL = \ - $(srcdir)/TPI_Walltime.c - - -# ------------------------------------------------------------------------ -# ThreadPool library specifications -# ------------------------------------------------------------------------ -#np# replace new_package with the name of the package being autotool'ed here -THREADPOOL_LIB = libtpi.a - -#np# replace new_package with the name of the package being autotool'ed here -THREADPOOL_H = \ - $(CORE_H) - -#np# replace new_package with the name of the package being autotool'ed here -libtpi_a_SOURCES = \ - $(CORE) \ - $(UTIL) - -#np# replace new_package with the name of the package being autotool'ed here -#EXTRA_libtpi_a_SOURCES = - -include $(top_builddir)/Makefile.export.threadpool - -if USING_GNUMAKE -EXPORT_INCLUDES = $(shell $(PERL_EXE) $(top_srcdir)/config/strip_dup_incl_paths.pl $(THREADPOOL_INCLUDES)) -else -EXPORT_INCLUDES = $(THREADPOOL_INCLUDES) -endif - -AM_CPPFLAGS = $(EXPORT_INCLUDES) - -# ------------------------------------------------------------------------ -# For using a special archiver -# ------------------------------------------------------------------------ - -if USE_ALTERNATE_AR - -libtpi_a_AR = $(ALTERNATE_AR) -else - -libtpi_a_AR = $(AR) cru - -endif - -# ------------------------------------------------------------------------ -# Some C++ compilers create extra .o-files for templates. We need to -# be sure to include these, and this is the hack to do it. -# ------------------------------------------------------------------------ - -libtpi_a_LIBADD = $(XTRALDADD) - -# ------------------------------------------------------------------------ -# List of all libraries to install in $(libexecdir) -# ------------------------------------------------------------------------ - -lib_LIBRARIES = $(THREADPOOL_LIB) - -# ------------------------------------------------------------------------ -# List of all headers to install in $(includedir) -# ------------------------------------------------------------------------ - -#np# replace new_package with the name of the package being autotool'ed here -include_HEADERS = $(THREADPOOL_H) - -# ------------------------------------------------------------------------ -# Special stuff to install in our special $(execincludedir) -# ------------------------------------------------------------------------ - -# SPECIAL NOTE: New_Package_config.h is a machine-dependent file, so we need -# to install it in the machine-dependent directory. However, that is -# not a default installation directory, so we had to create it -# special. - -# All Trilinos headers are now installed in the same directory -execincludedir = $(includedir) -#np# replace new_package with the name of the package being autotool'ed here -nodist_execinclude_HEADERS = ThreadPool_config.h - -# ------------------------------------------------------------------------ -# Files to be deleted by 'make maintainer-clean' -# ------------------------------------------------------------------------ - -MAINTAINERCLEANFILES = Makefile.in - - - - - - diff --git a/kokkos/basic/optional/ThreadPool/src/Makefile.in b/kokkos/basic/optional/ThreadPool/src/Makefile.in deleted file mode 100644 index 4dd7802..0000000 --- a/kokkos/basic/optional/ThreadPool/src/Makefile.in +++ /dev/null @@ -1,680 +0,0 @@ -# Makefile.in generated by automake 1.10 from Makefile.am. 
-# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -# @HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. -# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -# @HEADER - -# The following line helps the test harness recover from build errors. 
- - -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -subdir = src -DIST_COMMON = $(include_HEADERS) $(srcdir)/Makefile.am \ - $(srcdir)/Makefile.in $(srcdir)/ThreadPool_config.h.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \ - $(top_srcdir)/config/tac_arg_check_mpi.m4 \ - $(top_srcdir)/config/tac_arg_config_mpi.m4 \ - $(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \ - $(top_srcdir)/config/tac_arg_with_ar.m4 \ - $(top_srcdir)/config/tac_arg_with_flags.m4 \ - $(top_srcdir)/config/tac_arg_with_incdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libs.m4 \ - $(top_srcdir)/config/tac_arg_with_perl.m4 \ - $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = ThreadPool_config.h -CONFIG_CLEAN_FILES = -am__vpath_adj_setup = srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; -am__vpath_adj = case $$p in \ - $(srcdir)/*) f=`echo "$$p" | sed "s|^$$srcdirstrip/||"`;; \ - *) f=$$p;; \ - esac; -am__strip_dir = `echo $$p | sed -e 's|^.*/||'`; -am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" \ - "$(DESTDIR)$(execincludedir)" -libLIBRARIES_INSTALL = $(INSTALL_DATA) -LIBRARIES = $(lib_LIBRARIES) -AR = ar -ARFLAGS = cru -libtpi_a_DEPENDENCIES = -am__objects_1 = TPI.$(OBJEXT) -am__objects_2 = TPI_Walltime.$(OBJEXT) -am_libtpi_a_OBJECTS = $(am__objects_1) $(am__objects_2) -libtpi_a_OBJECTS = $(am_libtpi_a_OBJECTS) -DEFAULT_INCLUDES = -I.@am__isrc@ -depcomp = $(SHELL) $(top_srcdir)/config/depcomp -am__depfiles_maybe = depfiles -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -SOURCES = $(libtpi_a_SOURCES) -DIST_SOURCES = $(libtpi_a_SOURCES) -includeHEADERS_INSTALL = $(INSTALL_HEADER) -nodist_execincludeHEADERS_INSTALL = $(INSTALL_HEADER) -HEADERS = $(include_HEADERS) $(nodist_execinclude_HEADERS) -ETAGS = etags -CTAGS = ctags -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -ALTERNATE_AR = @ALTERNATE_AR@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPPFLAGS = @CPPFLAGS@ -CXX = @CXX@ -CXXCPP = @CXXCPP@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = @EXEEXT@ -GREP = @GREP@ -HAVE_PERL = @HAVE_PERL@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -LDFLAGS = @LDFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = 
@LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MKDIR_P = @MKDIR_P@ -MPI_CC_EXISTS = @MPI_CC_EXISTS@ -MPI_CXX = @MPI_CXX@ -MPI_CXX_EXISTS = @MPI_CXX_EXISTS@ -MPI_F77_EXISTS = @MPI_F77_EXISTS@ -MPI_TEMP_CXX = @MPI_TEMP_CXX@ -OBJEXT = @OBJEXT@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PERL_EXE = @PERL_EXE@ -PTHREAD_CC = @PTHREAD_CC@ -PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_aux_dir = @ac_aux_dir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ -infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ - -# ------------------------------------------------------------------------ -# For each category, create two variables - NAME and NAME_H. The -# second is the list of headers to be installed, i.e., any header that -# might someday be needed by some other code outside New_Package. The first is -# the list of all source and any other header files. -# ------------------------------------------------------------------------ - -#np# Make sure to list all source files in one of the following categories. 
-CORE = $(srcdir)/TPI.c -CORE_H = \ - $(srcdir)/TPI.h \ - $(srcdir)/TPI.hpp - -UTIL = \ - $(srcdir)/TPI_Walltime.c - - -# ------------------------------------------------------------------------ -# ThreadPool library specifications -# ------------------------------------------------------------------------ -#np# replace new_package with the name of the package being autotool'ed here -THREADPOOL_LIB = libtpi.a - -#np# replace new_package with the name of the package being autotool'ed here -THREADPOOL_H = \ - $(CORE_H) - - -#np# replace new_package with the name of the package being autotool'ed here -libtpi_a_SOURCES = \ - $(CORE) \ - $(UTIL) - -@USING_GNUMAKE_FALSE@EXPORT_INCLUDES = $(THREADPOOL_INCLUDES) -@USING_GNUMAKE_TRUE@EXPORT_INCLUDES = $(shell $(PERL_EXE) $(top_srcdir)/config/strip_dup_incl_paths.pl $(THREADPOOL_INCLUDES)) -AM_CPPFLAGS = $(EXPORT_INCLUDES) -@USE_ALTERNATE_AR_FALSE@libtpi_a_AR = $(AR) cru - -# ------------------------------------------------------------------------ -# For using a special archiver -# ------------------------------------------------------------------------ -@USE_ALTERNATE_AR_TRUE@libtpi_a_AR = $(ALTERNATE_AR) - -# ------------------------------------------------------------------------ -# Some C++ compilers create extra .o-files for templates. We need to -# be sure to include these, and this is the hack to do it. -# ------------------------------------------------------------------------ -libtpi_a_LIBADD = $(XTRALDADD) - -# ------------------------------------------------------------------------ -# List of all libraries to install in $(libexecdir) -# ------------------------------------------------------------------------ -lib_LIBRARIES = $(THREADPOOL_LIB) - -# ------------------------------------------------------------------------ -# List of all headers to install in $(includedir) -# ------------------------------------------------------------------------ - -#np# replace new_package with the name of the package being autotool'ed here -include_HEADERS = $(THREADPOOL_H) - -# ------------------------------------------------------------------------ -# Special stuff to install in our special $(execincludedir) -# ------------------------------------------------------------------------ - -# SPECIAL NOTE: New_Package_config.h is a machine-dependent file, so we need -# to install it in the machine-dependent directory. However, that is -# not a default installation directory, so we had to create it -# special. - -# All Trilinos headers are now installed in the same directory -execincludedir = $(includedir) -#np# replace new_package with the name of the package being autotool'ed here -nodist_execinclude_HEADERS = ThreadPool_config.h - -# ------------------------------------------------------------------------ -# Files to be deleted by 'make maintainer-clean' -# ------------------------------------------------------------------------ -MAINTAINERCLEANFILES = Makefile.in -all: ThreadPool_config.h - $(MAKE) $(AM_MAKEFLAGS) all-am - -.SUFFIXES: -.SUFFIXES: .c .o .obj -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign src/Makefile'; \ - cd $(top_srcdir) && \ - $(AUTOMAKE) --foreign src/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' 
in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -ThreadPool_config.h: stamp-h1 - @if test ! -f $@; then \ - rm -f stamp-h1; \ - $(MAKE) $(AM_MAKEFLAGS) stamp-h1; \ - else :; fi - -stamp-h1: $(srcdir)/ThreadPool_config.h.in $(top_builddir)/config.status - @rm -f stamp-h1 - cd $(top_builddir) && $(SHELL) ./config.status src/ThreadPool_config.h -$(srcdir)/ThreadPool_config.h.in: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_srcdir) && $(AUTOHEADER) - rm -f stamp-h1 - touch $@ - -distclean-hdr: - -rm -f ThreadPool_config.h stamp-h1 -install-libLIBRARIES: $(lib_LIBRARIES) - @$(NORMAL_INSTALL) - test -z "$(libdir)" || $(MKDIR_P) "$(DESTDIR)$(libdir)" - @list='$(lib_LIBRARIES)'; for p in $$list; do \ - if test -f $$p; then \ - f=$(am__strip_dir) \ - echo " $(libLIBRARIES_INSTALL) '$$p' '$(DESTDIR)$(libdir)/$$f'"; \ - $(libLIBRARIES_INSTALL) "$$p" "$(DESTDIR)$(libdir)/$$f"; \ - else :; fi; \ - done - @$(POST_INSTALL) - @list='$(lib_LIBRARIES)'; for p in $$list; do \ - if test -f $$p; then \ - p=$(am__strip_dir) \ - echo " $(RANLIB) '$(DESTDIR)$(libdir)/$$p'"; \ - $(RANLIB) "$(DESTDIR)$(libdir)/$$p"; \ - else :; fi; \ - done - -uninstall-libLIBRARIES: - @$(NORMAL_UNINSTALL) - @list='$(lib_LIBRARIES)'; for p in $$list; do \ - p=$(am__strip_dir) \ - echo " rm -f '$(DESTDIR)$(libdir)/$$p'"; \ - rm -f "$(DESTDIR)$(libdir)/$$p"; \ - done - -clean-libLIBRARIES: - -test -z "$(lib_LIBRARIES)" || rm -f $(lib_LIBRARIES) -libtpi.a: $(libtpi_a_OBJECTS) $(libtpi_a_DEPENDENCIES) - -rm -f libtpi.a - $(libtpi_a_AR) libtpi.a $(libtpi_a_OBJECTS) $(libtpi_a_LIBADD) - $(RANLIB) libtpi.a - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TPI.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/TPI_Walltime.Po@am__quote@ - -.c.o: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c $< - -.c.obj: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` - -TPI.o: $(srcdir)/TPI.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI.o -MD -MP -MF $(DEPDIR)/TPI.Tpo -c -o TPI.o `test -f '$(srcdir)/TPI.c' || echo '$(srcdir)/'`$(srcdir)/TPI.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/TPI.Tpo 
$(DEPDIR)/TPI.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/TPI.c' object='TPI.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI.o `test -f '$(srcdir)/TPI.c' || echo '$(srcdir)/'`$(srcdir)/TPI.c - -TPI.obj: $(srcdir)/TPI.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI.obj -MD -MP -MF $(DEPDIR)/TPI.Tpo -c -o TPI.obj `if test -f '$(srcdir)/TPI.c'; then $(CYGPATH_W) '$(srcdir)/TPI.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/TPI.Tpo $(DEPDIR)/TPI.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/TPI.c' object='TPI.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI.obj `if test -f '$(srcdir)/TPI.c'; then $(CYGPATH_W) '$(srcdir)/TPI.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI.c'; fi` - -TPI_Walltime.o: $(srcdir)/TPI_Walltime.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI_Walltime.o -MD -MP -MF $(DEPDIR)/TPI_Walltime.Tpo -c -o TPI_Walltime.o `test -f '$(srcdir)/TPI_Walltime.c' || echo '$(srcdir)/'`$(srcdir)/TPI_Walltime.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/TPI_Walltime.Tpo $(DEPDIR)/TPI_Walltime.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/TPI_Walltime.c' object='TPI_Walltime.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI_Walltime.o `test -f '$(srcdir)/TPI_Walltime.c' || echo '$(srcdir)/'`$(srcdir)/TPI_Walltime.c - -TPI_Walltime.obj: $(srcdir)/TPI_Walltime.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT TPI_Walltime.obj -MD -MP -MF $(DEPDIR)/TPI_Walltime.Tpo -c -o TPI_Walltime.obj `if test -f '$(srcdir)/TPI_Walltime.c'; then $(CYGPATH_W) '$(srcdir)/TPI_Walltime.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI_Walltime.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/TPI_Walltime.Tpo $(DEPDIR)/TPI_Walltime.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$(srcdir)/TPI_Walltime.c' object='TPI_Walltime.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o TPI_Walltime.obj `if test -f '$(srcdir)/TPI_Walltime.c'; then $(CYGPATH_W) '$(srcdir)/TPI_Walltime.c'; else $(CYGPATH_W) '$(srcdir)/$(srcdir)/TPI_Walltime.c'; fi` -install-includeHEADERS: $(include_HEADERS) - @$(NORMAL_INSTALL) - test -z "$(includedir)" || $(MKDIR_P) "$(DESTDIR)$(includedir)" - @list='$(include_HEADERS)'; for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - f=$(am__strip_dir) \ - echo " $(includeHEADERS_INSTALL) '$$d$$p' '$(DESTDIR)$(includedir)/$$f'"; \ - $(includeHEADERS_INSTALL) "$$d$$p" "$(DESTDIR)$(includedir)/$$f"; \ - done - -uninstall-includeHEADERS: - @$(NORMAL_UNINSTALL) - 
@list='$(include_HEADERS)'; for p in $$list; do \ - f=$(am__strip_dir) \ - echo " rm -f '$(DESTDIR)$(includedir)/$$f'"; \ - rm -f "$(DESTDIR)$(includedir)/$$f"; \ - done -install-nodist_execincludeHEADERS: $(nodist_execinclude_HEADERS) - @$(NORMAL_INSTALL) - test -z "$(execincludedir)" || $(MKDIR_P) "$(DESTDIR)$(execincludedir)" - @list='$(nodist_execinclude_HEADERS)'; for p in $$list; do \ - if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \ - f=$(am__strip_dir) \ - echo " $(nodist_execincludeHEADERS_INSTALL) '$$d$$p' '$(DESTDIR)$(execincludedir)/$$f'"; \ - $(nodist_execincludeHEADERS_INSTALL) "$$d$$p" "$(DESTDIR)$(execincludedir)/$$f"; \ - done - -uninstall-nodist_execincludeHEADERS: - @$(NORMAL_UNINSTALL) - @list='$(nodist_execinclude_HEADERS)'; for p in $$list; do \ - f=$(am__strip_dir) \ - echo " rm -f '$(DESTDIR)$(execincludedir)/$$f'"; \ - rm -f "$(DESTDIR)$(execincludedir)/$$f"; \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: $(HEADERS) $(SOURCES) ThreadPool_config.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) ThreadPool_config.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$tags $$unique; \ - fi -ctags: CTAGS -CTAGS: $(HEADERS) $(SOURCES) ThreadPool_config.h.in $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) ThreadPool_config.h.in $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - test -z "$(CTAGS_ARGS)$$tags$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$tags $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && cd $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) $$here - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ - fi; \ - cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ - else \ - test -f $(distdir)/$$file \ - || cp -p $$d/$$file $(distdir)/$$file \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-am -all-am: Makefile $(LIBRARIES) $(HEADERS) 
ThreadPool_config.h all-local -installdirs: - for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(includedir)" "$(DESTDIR)$(execincludedir)"; do \ - test -z "$$dir" || $(MKDIR_P) "$$dir"; \ - done -install: install-am -install-exec: install-exec-am -install-data: install-data-am -uninstall: uninstall-am - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-am -install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to rebuild." - -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES) -clean: clean-am - -clean-am: clean-generic clean-libLIBRARIES mostlyclean-am - -distclean: distclean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-hdr distclean-tags - -dvi: dvi-am - -dvi-am: - -html: html-am - -info: info-am - -info-am: - -install-data-am: install-includeHEADERS - -install-dvi: install-dvi-am - -install-exec-am: install-libLIBRARIES \ - install-nodist_execincludeHEADERS - -install-html: install-html-am - -install-info: install-info-am - -install-man: - -install-pdf: install-pdf-am - -install-ps: install-ps-am - -installcheck-am: - -maintainer-clean: maintainer-clean-am - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-am - -mostlyclean-am: mostlyclean-compile mostlyclean-generic - -pdf: pdf-am - -pdf-am: - -ps: ps-am - -ps-am: - -uninstall-am: uninstall-includeHEADERS uninstall-libLIBRARIES \ - uninstall-nodist_execincludeHEADERS - -.MAKE: install-am install-strip - -.PHONY: CTAGS GTAGS all all-am all-local check check-am clean \ - clean-generic clean-libLIBRARIES ctags distclean \ - distclean-compile distclean-generic distclean-hdr \ - distclean-tags distdir dvi dvi-am html html-am info info-am \ - install install-am install-data install-data-am install-dvi \ - install-dvi-am install-exec install-exec-am install-html \ - install-html-am install-includeHEADERS install-info \ - install-info-am install-libLIBRARIES install-man \ - install-nodist_execincludeHEADERS install-pdf install-pdf-am \ - install-ps install-ps-am install-strip installcheck \ - installcheck-am installdirs maintainer-clean \ - maintainer-clean-generic mostlyclean mostlyclean-compile \ - mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \ - uninstall-am uninstall-includeHEADERS uninstall-libLIBRARIES \ - uninstall-nodist_execincludeHEADERS - - -all-local: - @echo "" - @echo "Trilinos package ThreadPool subdirectory src built successfully." - @echo "" - -#np# replace new_package with the name of the package being autotool'ed here -#EXTRA_libtpi_a_SOURCES = - -include $(top_builddir)/Makefile.export.threadpool -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. 
-.NOEXPORT: diff --git a/kokkos/basic/optional/ThreadPool/src/TPI.c b/kokkos/basic/optional/ThreadPool/src/TPI.c deleted file mode 100644 index f2b1566..0000000 --- a/kokkos/basic/optional/ThreadPool/src/TPI.c +++ /dev/null @@ -1,1016 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -/*--------------------------------------------------------------------*/ - -#include -#include -#include -#include - -/*--------------------------------------------------------------------*/ -/*----------- PTHREAD CONFIGURATION (BEGIN) --------------------------*/ -/*--------------------------------------------------------------------*/ - -#if defined( HAVE_PTHREAD ) - -#include -#include -#include - -/*--------------------------------------------------------------------*/ -/*---------------- COMPILER SPECIFICS (BEGIN) ------------------------*/ -/*--------------------------------------------------------------------*/ - -/* Performance is heavily impacted by an - * atomic decrement of the work counter. - * Optimize this if at all possible. - */ - -#if defined( __INTEL_COMPILER ) - -#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD" - -#elif defined( __linux__ ) && \ - defined( __GNUC__ ) && ( 4 <= __GNUC__ ) - -#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD ATOMIC_SYNC" - -#define atomic_fetch_and_decrement( VALUE_PTR ) \ - __sync_fetch_and_sub( VALUE_PTR , 1 ) - -#else - -#define THREADPOOL_CONFIG "PTHREAD SCHED_YIELD" - -#endif - -#if ! defined( atomic_fetch_and_decrement ) - -static int atomic_fetch_and_decrement( volatile int * value ) -{ - static pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER ; - int result ; - while ( EBUSY == pthread_mutex_trylock( & atomic_lock ) ); - result = ( *value )-- ; - pthread_mutex_unlock( & atomic_lock ); - return result ; -} - -#endif - -/*--------------------------------------------------------------------*/ -/*---------------- COMPILER SPECIFICS (END) --------------------------*/ -/*--------------------------------------------------------------------*/ - -typedef pthread_mutex_t local_lock_type ; - -#else /* ! 
defined( HAVE_PTHREAD ) */ - -#define THREADPOOL_CONFIG "NO THREADING" - -typedef int local_lock_type ; - -#endif - -/*--------------------------------------------------------------------*/ -/*----------- PTHREAD CONFIGURATION (END) ----------------------------*/ -/*--------------------------------------------------------------------*/ - -const char * TPI_Version() -{ - static const char version_string[] = - "TPI Version 1.1 , November 2009 , Configuration = " THREADPOOL_CONFIG ; - - return version_string ; -} - -/*--------------------------------------------------------------------*/ - -enum { THREAD_COUNT_MAX = 256 }; -enum { LOCK_COUNT_MAX = 32 }; - -struct ThreadPool_Data ; - -typedef struct Thread_Data { - struct Thread_Data * m_thread_fan ; /* Fan-in / fan-out begin */ - void * m_reduce ; /* Reduction memory */ - long m_rank ; - long m_barrier_wait_max ; - long m_barrier_wait_total ; - long m_barrier_wait_count ; - volatile long m_control ; -} Thread ; - -typedef struct ThreadPool_Data { - TPI_work_subprogram m_work_routine ; - const void * m_work_info ; - TPI_reduce_join m_reduce_join ; - TPI_reduce_init m_reduce_init ; - unsigned char * m_reduce_alloc ; - int m_reduce_alloc_size ; - int m_thread_count ; - int m_lock_init ; - int m_lock_count ; - int m_work_thread_count ; - int m_work_count ; - int m_work_count_claim ; - - Thread m_thread[ THREAD_COUNT_MAX ]; - local_lock_type m_lock[ LOCK_COUNT_MAX ]; -} ThreadPool ; - - -static ThreadPool thread_pool = -{ - /* m_work_routine */ NULL , - /* m_work_info */ NULL , - /* m_reduce_join */ NULL , - /* m_reduce_init */ NULL , - /* m_reduce_alloc */ NULL , - /* m_reduce_alloc_size */ 0 , - /* m_thread_count */ 0 , - /* m_lock_init */ 0 , - /* m_lock_count */ 0 , - /* m_work_thread_count */ 0 , - /* m_work_count */ 0 , - /* m_work_count_claim */ 0 -}; - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -#if defined( HAVE_PTHREAD ) - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -int TPI_Lock( int i ) -{ - int result = i < 0 || thread_pool.m_lock_count <= i ? TPI_ERROR_SIZE : 0 ; - - if ( ! result ) { - pthread_mutex_t * const lock = thread_pool.m_lock + i ; - - while ( EBUSY == ( result = pthread_mutex_trylock( lock ) ) ); - - if ( result ) { result = TPI_ERROR_LOCK ; } - } - return result ; -} - -int TPI_Unlock( int i ) -{ - int result = i < 0 || thread_pool.m_lock_count <= i ? TPI_ERROR_SIZE : 0 ; - - if ( ! result && pthread_mutex_unlock( thread_pool.m_lock + i ) ) { - result = TPI_ERROR_LOCK ; - } - - return result ; -} - -static int local_set_lock_count( const int lock_count ) -{ - int result = lock_count < 0 || LOCK_COUNT_MAX < lock_count - ? TPI_ERROR_SIZE : 0 ; - - while ( ! 
result && thread_pool.m_lock_init < lock_count ) { - - pthread_mutex_t * const lock = thread_pool.m_lock + - thread_pool.m_lock_init ; - - if ( pthread_mutex_init( lock , NULL ) ) { - result = TPI_ERROR_INTERNAL ; - } - else { - ++( thread_pool.m_lock_init ); - } - } - - return result ; -} - -static void local_destroy_locks() -{ - while ( thread_pool.m_lock_init ) { - --( thread_pool.m_lock_init ); - pthread_mutex_destroy( thread_pool.m_lock + thread_pool.m_lock_init ); - } -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ -/* Run work if any, then wait for child threads to block. */ - -static void local_run( Thread * const this_thread , void * reduce ) -{ - struct TPI_Work_Struct work ; - - work.info = thread_pool.m_work_info ; - work.reduce = reduce ; - work.count = thread_pool.m_work_count ; - work.lock_count = thread_pool.m_lock_count ; - - if ( work.count <= thread_pool.m_work_thread_count ) { - - work.rank = ( thread_pool.m_thread_count - 1 ) - this_thread->m_rank ; - - if ( work.rank < work.count ) { - (*thread_pool.m_work_routine)( & work ); - } - } - else { - - int * const claim = & thread_pool.m_work_count_claim ; - - while ( 0 < ( work.rank = atomic_fetch_and_decrement( claim ))) { - - work.rank = work.count - work.rank ; - - (*thread_pool.m_work_routine)( & work ); - } - } -} - -static int wait_thread( volatile long * const control , const int val ) -{ - int count = 0 ; - while ( val == *control ) { - sched_yield(); - ++count ; - } - return count ; -} - -static void local_barrier_wait( Thread * const this_thread , - Thread * const thread ) -{ - const long count = wait_thread( & thread->m_control , 1 ); - - ++( this_thread->m_barrier_wait_count ); - - this_thread->m_barrier_wait_total += count ; - - if ( this_thread->m_barrier_wait_max < count ) { - this_thread->m_barrier_wait_max = count ; - } -} - -static void local_barrier( Thread * const this_thread ) -{ - Thread * const thread_beg = this_thread[0].m_thread_fan ; - Thread * thread = this_thread[1].m_thread_fan ; - - if ( ! thread_pool.m_work_routine ) { - while ( thread_beg < thread ) { - --thread ; local_barrier_wait( this_thread , thread ); - } - } - else if ( ! thread_pool.m_reduce_join ) { - - local_run( this_thread , NULL ); - - while ( thread_beg < thread ) { - --thread ; local_barrier_wait( this_thread , thread ); - } - } - else { - - /* Work data for the reduction initialization and join */ - - struct TPI_Work_Struct work ; - - work.info = thread_pool.m_work_info ; - work.reduce = this_thread->m_reduce ; - work.count = -1 ; - work.rank = -1 ; - work.lock_count = -1 ; - - /* Initialize reduction value for non-root thread */ - - if ( this_thread->m_rank ) { (*thread_pool.m_reduce_init)( & work ); } - - /* Run the work routine with barrier blocking */ - - local_run( this_thread , work.reduce ); - - /* Reduction of thread's contributions */ - - while ( thread_beg < thread ) { - --thread ; local_barrier_wait( this_thread , thread ); - (*thread_pool.m_reduce_join)( & work , thread->m_reduce ); - } - } -} - -/*--------------------------------------------------------------------*/ -/* The driver given to 'pthread_create'. - * Run work until told to terminate. - */ -static void * local_driver( void * arg ) -{ - Thread * const this_thread = (Thread *) arg ; - - do { - /* Wait for my subtree of threads to complete */ - local_barrier( this_thread ); - - this_thread->m_control = 0 ; - - /* Spin until I am activated. 
*/ - wait_thread( & this_thread->m_control , 0 ); - - } while ( thread_pool.m_work_routine ); - - local_barrier( this_thread ); /* Termination barrier */ - - this_thread->m_control = 0 ; - - return NULL ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -static void alloc_reduce( int reduce_size ) -{ - const int alloc_count = thread_pool.m_thread_count - 1 ; - - if ( thread_pool.m_reduce_alloc_size < alloc_count * reduce_size ) { - - const int grain_shift = 8 ; /* grain_size = 0x80 */ - const int grain_size = 1 << grain_shift ; /* Byte grain size */ - const int grain_count = ( reduce_size + grain_size - 1 ) >> grain_shift ; - const int reduce_grain = grain_size * grain_count ; - const int alloc_size = alloc_count * reduce_grain ; - - int i ; - - if ( thread_pool.m_reduce_alloc ) { - thread_pool.m_reduce_alloc = - (unsigned char *) realloc( thread_pool.m_reduce_alloc , alloc_size ); - } - else { - thread_pool.m_reduce_alloc = (unsigned char *) malloc( alloc_size ); - } - - thread_pool.m_reduce_alloc_size = alloc_size ; - - for ( i = 0 ; i < alloc_count ; ++i ) { - thread_pool.m_thread[i+1].m_reduce = - thread_pool.m_reduce_alloc + reduce_grain * i ; - } - } -} - -static int local_start( - int work_thread_count , - TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int result = lock_count ? local_set_lock_count( lock_count ) : 0 ; - - if ( ! result ) { - - thread_pool.m_work_routine = work_subprogram ; - thread_pool.m_work_info = work_info ; - thread_pool.m_work_count = work_count ; - thread_pool.m_lock_count = lock_count ; - thread_pool.m_thread->m_reduce = reduce_data ; - - if ( 1 < thread_pool.m_thread_count ) { - - if ( reduce_size ) { alloc_reduce( reduce_size ); } - - thread_pool.m_reduce_join = reduce_join ; - thread_pool.m_reduce_init = reduce_init ; - thread_pool.m_work_thread_count = work_thread_count ; - thread_pool.m_work_count_claim = work_count ; - - /* Activate the spinning worker threads */ - { - Thread * const thread_beg = thread_pool.m_thread + 1 ; - Thread * thread = thread_pool.m_thread + - thread_pool.m_thread_count ; - - while ( thread_beg < thread ) { (--thread)->m_control = 1 ; } - } - } - } - - return result ; -} - -static void local_wait() -{ - if ( 1 < thread_pool.m_thread_count ) { - - local_barrier( thread_pool.m_thread ); - - thread_pool.m_reduce_join = NULL ; - thread_pool.m_reduce_init = NULL ; - thread_pool.m_work_thread_count = 0 ; - thread_pool.m_work_count_claim = 0 ; - } - else { - struct TPI_Work_Struct w = { NULL , NULL , 0 , 0 , 0 }; - - w.info = thread_pool.m_work_info ; - w.count = thread_pool.m_work_count ; - w.lock_count = thread_pool.m_lock_count ; - w.reduce = thread_pool.m_thread->m_reduce ; - - for ( w.rank = 0 ; w.rank < w.count ; ++( w.rank ) ) { - (* thread_pool.m_work_routine )( & w ); - } - } - - thread_pool.m_work_routine = NULL ; - thread_pool.m_work_info = NULL ; - thread_pool.m_work_count = 0 ; - thread_pool.m_lock_count = 0 ; - thread_pool.m_thread->m_reduce = NULL ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -int TPI_Init( int n ) -{ - int result = thread_pool.m_thread_count ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! 
result && ( n < 1 || THREAD_COUNT_MAX + 1 <= n ) ) { - result = TPI_ERROR_SIZE ; - } - - if ( ! result ) { - pthread_attr_t attr ; - - if ( pthread_attr_init( & attr ) - || pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) - || pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) { - result = TPI_ERROR_INTERNAL ; - } - - if ( ! result ) { - int thread_rank = 0 ; - int count = 1 ; - - /* Initialize one lock for blocking and unblocking */ - - local_set_lock_count( 1 ); - - /* Initialize threads with fan-in / fan-out span of threads */ - - for ( thread_rank = 0 ; thread_rank <= n ; ++thread_rank ) { - Thread * const thread = thread_pool.m_thread + thread_rank ; - - thread->m_thread_fan = thread_pool.m_thread + count ; - thread->m_reduce = NULL ; - thread->m_rank = thread_rank ; - thread->m_barrier_wait_max = 0 ; - thread->m_barrier_wait_total = 0 ; - thread->m_barrier_wait_count = 0 ; - thread->m_control = 1 ; - - { - int up = 1 ; - while ( up <= thread_rank ) { up <<= 1 ; } - while ( thread_rank + up < n ) { up <<= 1 ; ++count ; } - } - } - - thread_pool.m_thread_count = n ; - - /* Create threads last-to-first for start up fan-in barrier */ - - for ( thread_rank = n ; ! result && 1 < thread_rank ; ) { - Thread * const thread = thread_pool.m_thread + --thread_rank ; - - pthread_t pt ; - - if ( pthread_create( & pt, & attr, & local_driver, thread ) ) { - thread->m_control = 0 ; - result = TPI_ERROR_INTERNAL ; - } - } - - /* If a thread-spawn failed, terminate the created threads */ - - if ( result ) { - while ( thread_rank < --( thread_pool.m_thread_count ) ) { - Thread * thread = thread_pool.m_thread + thread_pool.m_thread_count ; - wait_thread( & thread->m_control , 1 ); /* Wait for blocking */ - thread->m_control = 1 ; /* Reactivate thread */ - wait_thread( & thread->m_control , 1 ); /* Wait for termination */ - } - thread_pool.m_thread_count = 0 ; - } - - pthread_attr_destroy( & attr ); - } - } - - if ( ! result ) { - local_barrier( thread_pool.m_thread ); - result = n ; - } - - return result ; -} - -/*--------------------------------------------------------------------*/ - -int TPI_Finalize() -{ - static int print_statistics = 0 ; - - int result ; - - result = NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! result ) { - - /* Wake up threads then wait for them to terminate */ - local_start( 0 , NULL , NULL , 0 , - 0 , NULL , NULL , 0 , NULL ); - - local_wait(); - - if ( print_statistics ) { - int i = 0 ; - for ( ; i < thread_pool.m_thread_count ; ++i ) { - if ( thread_pool.m_thread[i].m_barrier_wait_count ) { - long mean = ( thread_pool.m_thread[i].m_barrier_wait_total + 0.5 ) / - thread_pool.m_thread[i].m_barrier_wait_count ; - fprintf(stdout,"Thread[%d] barrier_wait( max %ld , mean %ld )\n", i , - thread_pool.m_thread[i].m_barrier_wait_max , mean ); - } - } - } - - thread_pool.m_thread_count = 0 ; - - local_destroy_locks(); - - if ( thread_pool.m_reduce_alloc ) { - free( thread_pool.m_reduce_alloc ); - thread_pool.m_reduce_alloc = NULL ; - thread_pool.m_reduce_alloc_size = 0 ; - } - } - - return result ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -static void local_block( TPI_Work * work ) -{ - if ( work->rank ) { - pthread_mutex_lock( thread_pool.m_lock ); - pthread_mutex_unlock( thread_pool.m_lock ); - } -} - -int TPI_Block() -{ - const int result = - NULL != thread_pool.m_work_routine ? 
TPI_ERROR_ACTIVE : ( - pthread_mutex_lock( thread_pool.m_lock ) ? TPI_ERROR_INTERNAL : - - local_start( thread_pool.m_thread_count , - local_block , NULL , - thread_pool.m_thread_count , - 0 /* lock_count */ , - NULL , NULL , 0 , NULL ) ); - - return result ; -} - -int TPI_Unblock() -{ - const int result = - local_block != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - pthread_mutex_unlock( thread_pool.m_lock ) ? TPI_ERROR_INTERNAL : 0 ); - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Isblocked() -{ - return local_block == thread_pool.m_work_routine ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -#else /* ! defined( HAVE_PTHREAD ) */ - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -int TPI_Lock( int i ) -{ - int result = i < 0 || thread_pool.m_lock_count <= i ? TPI_ERROR_SIZE : 0 ; - - if ( ! result ) { - if ( 0 != thread_pool.m_lock[i] ) { - result = TPI_ERROR_LOCK ; - } - else { - thread_pool.m_lock[i] = 1 ; - } - } - return result ; -} - -int TPI_Unlock( int i ) -{ - int result = i < 0 || thread_pool.m_lock_count <= i ? TPI_ERROR_SIZE : 0 ; - - if ( ! result ) { - if ( 0 == thread_pool.m_lock[i] ) { - result = TPI_ERROR_LOCK ; - } - else { - thread_pool.m_lock[i] = 0 ; - } - } - return result ; -} - -static int local_set_lock_count( const int lock_count ) -{ - int result = lock_count < 0 || LOCK_COUNT_MAX < lock_count - ? TPI_ERROR_SIZE : 0 ; - - while ( thread_pool.m_lock_init < lock_count ) { - - thread_pool.m_lock[ thread_pool.m_lock_init ] = 0 ; - - ++( thread_pool.m_lock_init ); - } - - return result ; -} - -/*--------------------------------------------------------------------*/ - -static int local_start( - int work_thread_count , - TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int result = lock_count ? local_set_lock_count( lock_count ) : 0 ; - - if ( ! result ) { - thread_pool.m_work_routine = work_subprogram ; - thread_pool.m_work_info = work_info ; - thread_pool.m_work_count = work_count ; - thread_pool.m_lock_count = lock_count ; - thread_pool.m_thread->m_reduce = reduce_data ; - } - - return result ; -} - -static void local_wait() -{ - struct TPI_Work_Struct w = { NULL , NULL , 0 , 0 , 0 }; - - w.info = thread_pool.m_work_info ; - w.count = thread_pool.m_work_count ; - w.lock_count = thread_pool.m_lock_count ; - w.reduce = thread_pool.m_thread->m_reduce ; - - for ( w.rank = 0 ; w.rank < w.count ; ++( w.rank ) ) { - (* thread_pool.m_work_routine )( & w ); - } - - thread_pool.m_work_routine = NULL ; - thread_pool.m_work_info = NULL ; - thread_pool.m_work_count = 0 ; - thread_pool.m_lock_count = 0 ; - thread_pool.m_thread->m_reduce = NULL ; -} - -/*--------------------------------------------------------------------*/ - -static void local_block( TPI_Work * work ) {} - -int TPI_Block() -{ - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : - - local_start( thread_pool.m_thread_count , - local_block , NULL , - thread_pool.m_thread_count , - 0 /* lock_count */ , - NULL , NULL , 0 , NULL ) ; - - return result ; -} - -int TPI_Unblock() -{ - const int result = - local_block != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! 
result ) { local_wait(); } - - return result ; -} - -int TPI_Isblocked() -{ - return local_block == thread_pool.m_work_routine ; -} - -/*--------------------------------------------------------------------*/ - -int TPI_Init( int n ) -{ - int result = thread_pool.m_thread_count ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! result && ( n < 1 || THREAD_COUNT_MAX + 1 <= n ) ) { - result = TPI_ERROR_SIZE ; - } - else { - Thread * const thread = thread_pool.m_thread ; - - thread->m_thread_fan = NULL ; - thread->m_reduce = NULL ; - thread->m_rank = 0 ; - thread->m_barrier_wait_max = 0 ; - thread->m_barrier_wait_total = 0 ; - thread->m_barrier_wait_count = 0 ; - thread->m_control = 1 ; - - thread_pool.m_thread_count = result = n ; - - /* Initialize one lock for blocking and unblocking */ - - local_set_lock_count( 1 ); - } - - return result ; -} - -/*--------------------------------------------------------------------*/ - -int TPI_Finalize() -{ - int result = NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! result ) { - thread_pool.m_thread_count = 0 ; - thread_pool.m_lock_init = 0 ; - } - - return result ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -#endif - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -int TPI_Wait() -{ - const int result = - ( NULL == thread_pool.m_work_routine || - local_block == thread_pool.m_work_routine ) ? TPI_ERROR_ACTIVE : 0 ; - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Start( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count ) -{ - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - work_count < 0 ? TPI_ERROR_SIZE : - local_start( thread_pool.m_thread_count - 1 , - work_subprogram , work_info , work_count , lock_count , - NULL , NULL , 0 , NULL ) ) ); - - return result ; -} - -int TPI_Run( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count ) -{ - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - work_count < 0 ? TPI_ERROR_SIZE : - local_start( thread_pool.m_thread_count , - work_subprogram , work_info , work_count , lock_count , - NULL , NULL , 0 , NULL ) ) ); - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Run_threads( TPI_work_subprogram work_subprogram , - const void * work_info , - int lock_count ) -{ - const int work_count = 0 < thread_pool.m_thread_count ? - thread_pool.m_thread_count : 1 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - local_start( thread_pool.m_thread_count , - work_subprogram , work_info , work_count , lock_count , - NULL , NULL , 0 , NULL ) ) ); - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Start_threads( TPI_work_subprogram work_subprogram , - const void * work_info , - int lock_count ) -{ - const int work_count = 1 < thread_pool.m_thread_count ? - thread_pool.m_thread_count - 1 : 1 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? 
TPI_ERROR_NULL : ( - local_start( thread_pool.m_thread_count - 1 , - work_subprogram , work_info , work_count , lock_count , - NULL , NULL , 0 , NULL ) ) ); - - if ( ! result ) { local_wait(); } - - return result ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -int TPI_Run_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int lock_count = 0 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - NULL == reduce_join ? TPI_ERROR_NULL : ( - NULL == reduce_init ? TPI_ERROR_NULL : ( - NULL == reduce_data ? TPI_ERROR_NULL : ( - work_count <= 0 ? TPI_ERROR_SIZE : ( - reduce_size <= 0 ? TPI_ERROR_SIZE : - - local_start( thread_pool.m_thread_count , - work_subprogram, work_info, work_count, lock_count, - reduce_join, reduce_init, reduce_size, reduce_data ))))))); - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Run_threads_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int lock_count = 0 ; - const int work_count = 0 < thread_pool.m_thread_count ? - thread_pool.m_thread_count : 1 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - NULL == reduce_join ? TPI_ERROR_NULL : ( - NULL == reduce_init ? TPI_ERROR_NULL : ( - NULL == reduce_data ? TPI_ERROR_NULL : ( - reduce_size <= 0 ? TPI_ERROR_SIZE : - - local_start( thread_pool.m_thread_count , - work_subprogram , work_info , work_count , lock_count , - reduce_join, reduce_init, reduce_size, reduce_data )))))); - - if ( ! result ) { local_wait(); } - - return result ; -} - -int TPI_Start_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int lock_count = 0 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - NULL == reduce_join ? TPI_ERROR_NULL : ( - NULL == reduce_init ? TPI_ERROR_NULL : ( - NULL == reduce_data ? TPI_ERROR_NULL : ( - work_count <= 0 ? TPI_ERROR_SIZE : ( - reduce_size <= 0 ? TPI_ERROR_SIZE : - - local_start( thread_pool.m_thread_count - 1 , - work_subprogram , work_info , work_count , lock_count , - reduce_join, reduce_init, reduce_size, reduce_data ))))))); - - return result ; -} - -int TPI_Start_threads_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ) -{ - const int lock_count = 0 ; - const int work_count = 1 < thread_pool.m_thread_count ? - thread_pool.m_thread_count - 1 : 1 ; - - const int result = - NULL != thread_pool.m_work_routine ? TPI_ERROR_ACTIVE : ( - NULL == work_subprogram ? TPI_ERROR_NULL : ( - NULL == reduce_join ? TPI_ERROR_NULL : ( - NULL == reduce_init ? TPI_ERROR_NULL : ( - NULL == reduce_data ? TPI_ERROR_NULL : ( - reduce_size <= 0 ? 
TPI_ERROR_SIZE : - - local_start( thread_pool.m_thread_count - 1 , - work_subprogram , work_info , work_count , lock_count , - reduce_join, reduce_init, reduce_size, reduce_data )))))); - - return result ; -} - - diff --git a/kokkos/basic/optional/ThreadPool/src/TPI.h b/kokkos/basic/optional/ThreadPool/src/TPI.h deleted file mode 100644 index 939d3be..0000000 --- a/kokkos/basic/optional/ThreadPool/src/TPI.h +++ /dev/null @@ -1,253 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - * - * Thread Pool Interface (TPI). - * - * A simple and miminalistic interface for executing subprograms - * in a thread parallel, shared memory mode. - * - * States: the underlying thread pool has four states. - * 1) Uninitialized: no extra threads exist, this is the initial state. - * 2) Ready: extra threads exist and are ready to run a subprogram. - * 3) Active: extra threads are calling the subprogram. - * 4) Blocked: extra threads blocked. - * - * Threads are created on initialization and placed in the 'Ready' state. - * While in the 'Ready' state the threads are spin-waiting to minimize - * the cost of activating blocked threads. - * Threads can be blocked so that they do not compete for computatational - * resources with other threads created external to the TPI interface. - * For example, threads created by OpenMP or TBB. - */ - -#ifndef ThreadPoolInterface_h -#define ThreadPoolInterface_h - -#if defined( __cplusplus ) -extern "C" { -#endif - -/*--------------------------------------------------------------------*/ -/** \brief Version string. */ -const char * TPI_Version(); - -/** Start up the requested number of threads, less the calling thread. - * Return the actual number of threads, including the calling thread, - * otherwise return an error. - */ -int TPI_Init( int thread_count ); - -/** Shut down all started threads. */ -int TPI_Finalize(); - -/*--------------------------------------------------------------------*/ -/** \brief A utility to measure wall-clock time, which is frequently - * needed when performance testing HPC algorithms. - */ -double TPI_Walltime(); - -/*--------------------------------------------------------------------*/ -/* All functions return zero for success. 
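The four pool states described above can be exercised with a handful of calls. The following is a minimal sketch, assuming TPI.h is on the include path and ignoring error returns for brevity; my_work is an illustrative subprogram, not part of the library.

#include <stdio.h>
#include <TPI.h>

/* Illustrative work subprogram: one call per piece of work. */
static void my_work( TPI_Work * work )
{
  printf( "work %d of %d\n", work->rank , work->count );
}

int main( void )
{
  const double t0 = TPI_Walltime();

  TPI_Init( 4 );                        /* Uninitialized -> Ready    */
  TPI_Run( my_work , NULL , 16 , 0 );   /* Ready -> Active -> Ready  */
  TPI_Block();                          /* Ready -> Blocked          */
  TPI_Unblock();                        /* Blocked -> Ready          */
  TPI_Finalize();                       /* Ready -> Uninitialized    */

  printf( "elapsed %g seconds\n", TPI_Walltime() - t0 );
  return 0 ;
}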
*/ - -#define TPI_ERROR_NULL ((int) -1) /**< NULL input */ -#define TPI_ERROR_SIZE ((int) -2) /**< BAD input: size or index */ -#define TPI_ERROR_LOCK ((int) -3) /**< BAD lock or unlock */ -#define TPI_ERROR_ACTIVE ((int) -4) /**< BAD input: the pool is active */ -#define TPI_ERROR_INTERNAL ((int) -5) /**< internal resource error */ - -/*--------------------------------------------------------------------*/ -/** \brief Work information passed to a work subprogram. */ -struct TPI_Work_Struct { - const void * info ; /**< Shared info input to TPI_Run */ - void * reduce ; /**< Data for reduce operation, if any */ - int count ; /**< Count of work requested via TPI_Run */ - int rank ; /**< Rank of work for the current call */ - int lock_count ; /**< Count of locks requested via TPI_Run */ -}; - -/** \brief Typedef for work subprogram argument */ -typedef const struct TPI_Work_Struct TPI_Work ; - -/** The interface for a parallel task */ -typedef void (*TPI_work_subprogram)( TPI_Work * ); - -/** The interface for a parallel reduction operation. - * Initialize work->reduce value. - */ -typedef -void (*TPI_reduce_init)( TPI_Work * work ); - -/** The interface for a parallel reduction operation. - * Perform reduction operation work->reduce OP= reduce. - * Every initialized reduce value will appear exactly - * once as the 'reduce' argument of a call to the join function. - */ -typedef -void (*TPI_reduce_join)( TPI_Work * work , const void * reduce ); - -/*--------------------------------------------------------------------*/ -/** \brief Run a work subprogram in thread parallel. - * - * The thread pool must be in the 'paused' state when this - * function is called. Thus a recursive call to TPI_Run is illegal. - */ -int TPI_Run( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count ); - -/** \brief Run a work and reduction subprograms in thread parallel. - * - * Each call to the work_subprogram has exclusive (thread safe) - * access to its work->reduce data. - * The reduce_init and reduce_join subprograms have - * exclusive access to their arguments. - */ -int TPI_Run_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ); - -/** \brief Run a work subprogram exactly once on each thread. - * - * The thread pool must be in the 'paused' state when this - * function is called. Thus a recursive call to TPI_Run is illegal. - */ -int TPI_Run_threads( TPI_work_subprogram work_subprogram , - const void * work_info , - int lock_count ); - -/** \brief Run a work and reduction subprograms in thread parallel. - * - * Each call to the work_subprogram has exclusive (thread safe) - * access to its work->reduce data. - * The reduce_init and reduce_join subprograms have - * exclusive access to their arguments. - */ -int TPI_Run_threads_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ); - -/*--------------------------------------------------------------------*/ -/** \brief Start a work subprogram in thread parallel - * running on all but the 'main' calling thread; - * the 'main' calling thread returns immediately. - * - * The thread pool must be in the 'paused' state when this - * function is called. Thus a recursive call to TPI_Start is illegal. 
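A minimal sketch of the reduction interface documented above: summing the work ranks with TPI_Run_reduce. The subprogram and init/join functions (sum_work, sum_init, sum_join) are illustrative names; note that the caller's reduce_data should already hold the identity value, since reduce_init is only applied to the per-thread copies handed to the extra threads.

#include <stdio.h>
#include <TPI.h>

/* Each work item adds its rank into the thread-private reduce value. */
static void sum_work( TPI_Work * work )
{
  *((double *) work->reduce) += (double) work->rank ;
}

/* TPI_reduce_init: set a thread-private reduce value to the identity. */
static void sum_init( TPI_Work * work )
{
  *((double *) work->reduce) = 0.0 ;
}

/* TPI_reduce_join: fold another thread's contribution into work->reduce. */
static void sum_join( TPI_Work * work , const void * reduce )
{
  *((double *) work->reduce) += *((const double *) reduce) ;
}

int main( void )
{
  double total = 0.0 ;   /* identity value used directly by the root thread */

  TPI_Init( 4 );
  TPI_Run_reduce( sum_work , NULL , 1000 ,
                  sum_join , sum_init , sizeof(total) , & total );
  TPI_Finalize();

  printf( "sum = %g (expected %g)\n", total , 1000.0 * 999.0 / 2.0 );
  return 0 ;
}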
- */ -int TPI_Start( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - int lock_count ); - -/** \brief Start a work and reduction subprograms in thread parallel - * running on all but the 'main' calling thread; - * the 'main' calling thread returns immediately. - * - * Each call to the work_subprogram has exclusive (thread safe) - * access to its work->reduce data. - * The reduce_init and reduce_join subprograms have - * exclusive access to their arguments. - */ -int TPI_Start_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - int work_count , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ); - -/** \brief Run a work subprogram on each thread - * that is not the 'main' calling thread. - * The 'main' calling thread returns immediately. - * - * The thread pool must be in the 'paused' state when this - * function is called. Thus a recursive call to TPI_Start_threads is illegal. - */ -int TPI_Start_threads( TPI_work_subprogram work_subprogram , - const void * work_info , - int lock_count ); - -/** \brief Start a work / reduction subprogram - * on each thread that is not the 'main' calling thread. - * The 'main' calling thread returns immediately. - * - * Each call to the work_subprogram has exclusive (thread safe) - * access to its work->reduce data. - * The reduce_init and reduce_join subprograms have - * exclusive access to their arguments. - */ -int TPI_Start_threads_reduce( TPI_work_subprogram work_subprogram , - const void * work_info , - TPI_reduce_join reduce_join , - TPI_reduce_init reduce_init , - int reduce_size , - void * reduce_data ); - -/** \brief Wait for a started work subprogram to complete. */ -int TPI_Wait(); - -/*--------------------------------------------------------------------*/ -/** \brief Block threads within the operating system. - * - * Normally the worker threads are unblocked and spinning for - * minimal start up overhead when running work subprograms. - * If no TPI work is to be performed for a long period of time - * then an application can block the worker threads. - */ -int TPI_Block(); - -/** \brief Unblock blocked threads within the operating system */ -int TPI_Unblock(); - -/** \brief Query if threads are blocked */ -int TPI_Isblocked(); - -/*--------------------------------------------------------------------*/ -/** \brief Blocks until lock lock_rank is obtained. - * The thread pool must be in the 'active' state. - */ -int TPI_Lock( int lock_rank ); - -/** \brief Unlocks lock lock_rank. - * The thread pool must be in the 'active' state. - */ -int TPI_Unlock( int lock_rank ); - -/*--------------------------------------------------------------------*/ - -#if defined( __cplusplus ) -} -#endif - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/src/TPI.hpp b/kokkos/basic/optional/ThreadPool/src/TPI.hpp deleted file mode 100644 index fc1894e..0000000 --- a/kokkos/basic/optional/ThreadPool/src/TPI.hpp +++ /dev/null @@ -1,135 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. 
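A minimal sketch of the start/wait and locking calls documented above, again assuming TPI.h is on the include path; count_work and shared_counter are illustrative. In the threaded build the subprogram runs once on each thread other than the caller, with each increment guarded by lock 0.

#include <stdio.h>
#include <TPI.h>

static int shared_counter = 0 ;   /* illustrative shared state */

static void count_work( TPI_Work * work )
{
  (void) work ;           /* rank/count not needed for this example */
  TPI_Lock( 0 );          /* valid because lock_count = 1 was requested */
  ++shared_counter ;
  TPI_Unlock( 0 );
}

int main( void )
{
  TPI_Init( 4 );

  /* Start work on all but the calling thread; request one lock. */
  TPI_Start_threads( count_work , NULL , 1 );

  /* ... the calling thread may do unrelated work here ... */

  TPI_Wait();             /* block until the started work completes */

  printf( "counter = %d\n", shared_counter );

  TPI_Finalize();
  return 0 ;
}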
*/ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -#ifndef util_ThreadPool_hpp -#define util_ThreadPool_hpp - -#include - -namespace TPI { - -typedef TPI_Work Work ; - -//---------------------------------------------------------------------- -/** Run worker.*method(work) on all threads. - */ -template -int Run( Worker & worker , void (Worker::*method)(Work &) , - int work_count , int lock_count = 0 ); - -//---------------------------------------------------------------------- - -inline int Lock( int n ) { return TPI_Lock( n ); } -inline int Unlock( int n ) { return TPI_Unlock( n ); } - -/** Lock guard to insure that a lock is released - * when control exists a block. - * { - * TPI::LockGuard local_lock( i ); - * } - */ -class LockGuard { -private: - LockGuard(); - LockGuard( const LockGuard & ); - LockGuard & operator = ( const LockGuard & ); - const int m_value ; - const int m_result ; -public: - operator int() const { return m_result ; } - - explicit LockGuard( unsigned i_lock ) - : m_value( i_lock ), m_result( TPI_Lock(i_lock) ) {} - - ~LockGuard() { TPI_Unlock( m_value ); } -}; - -//---------------------------------------------------------------------- - -inline -int Init( int n ) { return TPI_Init( n ); } - -inline -int Finalize() { return TPI_Finalize(); } - -inline -double Walltime() { return TPI_Walltime(); } - -//---------------------------------------------------------------------- -//---------------------------------------------------------------------- - -namespace { - -template -class WorkerMethodHelper { -private: - WorkerMethodHelper(); - WorkerMethodHelper( const WorkerMethodHelper & ); - WorkerMethodHelper & operator = ( const WorkerMethodHelper & ); - -public: - - typedef void (Worker::*Method)( Work & ); - - Worker & worker ; - Method method ; - - WorkerMethodHelper( Worker & w , Method m ) : worker(w), method(m) {} - - static void run( TPI_Work * work ) - { - try { - const WorkerMethodHelper & wm = - * reinterpret_cast(work->info); - (wm.worker.*wm.method)(*work); - } catch(...){} - } -}; - -} - -//---------------------------------------------------------------------- -//---------------------------------------------------------------------- - -template -inline -int Run( Worker & worker, void (Worker::*method)(Work &) , - int work_count , int lock_count ) -{ - typedef WorkerMethodHelper WM ; - - WM tmp( worker , method ); - - return TPI_Run( reinterpret_cast(& WM::run),&tmp,work_count,lock_count); -} - -//---------------------------------------------------------------------- -//---------------------------------------------------------------------- - -} - -#endif - diff --git 
a/kokkos/basic/optional/ThreadPool/src/TPI_Walltime.c b/kokkos/basic/optional/ThreadPool/src/TPI_Walltime.c deleted file mode 100644 index d2c1fe4..0000000 --- a/kokkos/basic/optional/ThreadPool/src/TPI_Walltime.c +++ /dev/null @@ -1,44 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -#include - -#include -#ifdef _MSC_VER -#include -#else -#include -#endif - -double TPI_Walltime() -{ - struct timeval tp ; - - gettimeofday( &tp , ((struct timezone *) NULL ) ); - - return ( (double) tp.tv_sec ) + ( (double) tp.tv_usec ) / 1.0e6 ; -} - diff --git a/kokkos/basic/optional/ThreadPool/src/ThreadPool_config.h.in b/kokkos/basic/optional/ThreadPool/src/ThreadPool_config.h.in deleted file mode 100644 index 752f5c5..0000000 --- a/kokkos/basic/optional/ThreadPool/src/ThreadPool_config.h.in +++ /dev/null @@ -1,71 +0,0 @@ -/* src/ThreadPool_config.h.in. Generated from configure.ac by autoheader. */ - -/* Define if you want to build export makefiles. */ -#undef HAVE_EXPORT_MAKEFILES - -/* Define if you are using gnumake - this will shorten your link lines. */ -#undef HAVE_GNUMAKE - -/* Define to 1 if you have the header file. */ -#undef HAVE_INTTYPES_H - -/* Define if want to build libcheck */ -#undef HAVE_LIBCHECK - -/* Define to 1 if you have the header file. */ -#undef HAVE_MEMORY_H - -/* define if we want to use MPI */ -#undef HAVE_MPI - -/* Define if want to build threadpool-tests */ -#undef HAVE_NEW_PACKAGE_TESTS - -/* Define if you have POSIX threads libraries and header files. */ -#undef HAVE_PTHREAD - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDINT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STDLIB_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRINGS_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_STRING_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_STAT_H - -/* Define to 1 if you have the header file. */ -#undef HAVE_SYS_TYPES_H - -/* Define if want to build tests */ -#undef HAVE_TESTS - -/* Define to 1 if you have the header file. */ -#undef HAVE_UNISTD_H - -/* Define to the address where bug reports for this package should be sent. */ -#undef PACKAGE_BUGREPORT - -/* Define to the full name of this package. */ -#undef PACKAGE_NAME - -/* Define to the full name and version of this package. 
*/ -#undef PACKAGE_STRING - -/* Define to the one symbol short name of this package. */ -#undef PACKAGE_TARNAME - -/* Define to the version of this package. */ -#undef PACKAGE_VERSION - -/* Define to the necessary symbol if this constant uses a non-standard name on - your system. */ -#undef PTHREAD_CREATE_JOINABLE - -/* Define to 1 if you have the ANSI C header files. */ -#undef STDC_HEADERS diff --git a/kokkos/basic/optional/ThreadPool/test/CMakeLists.txt b/kokkos/basic/optional/ThreadPool/test/CMakeLists.txt deleted file mode 100644 index ff878e7..0000000 --- a/kokkos/basic/optional/ThreadPool/test/CMakeLists.txt +++ /dev/null @@ -1,86 +0,0 @@ - -INCLUDE(PackageAddExecutableAndTest) - -PACKAGE_ADD_EXECUTABLE( - test_tpi_unit - COMM serial mpi - SOURCES test_tpi_unit.c - DIRECTORY . - ) - -PACKAGE_ADD_EXECUTABLE( - test_c_dnax - COMM serial - SOURCES test_c_dnax.c - DIRECTORY . - ) - -PACKAGE_ADD_EXECUTABLE( - test_tpi_cpp - COMM serial - SOURCES test_tpi.cpp - DIRECTORY . - ) - -PACKAGE_ADD_EXECUTABLE( - test_tpi_sum - COMM serial mpi - SOURCES test_mpi_sum.c - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_unit - NAME test_tpi_unit_serial - COMM serial - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_unit - NAME test_tpi_unit_mpi - COMM mpi - NUM_MPI_PROCS 1 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_cpp - NAME test_tpi_cpp - COMM serial - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_sum - NAME test_tpi_sum_serial - COMM serial - DIRECTORY . - XHOSTTYPE AIX - ) - -PACKAGE_ADD_TEST( - test_tpi_sum - NAME test_tpi_sum_np1 - COMM mpi - NUM_MPI_PROCS 1 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_sum - NAME test_tpi_sum_np2 - COMM mpi - NUM_MPI_PROCS 2 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_sum - NAME test_tpi_sum_np4 - COMM mpi - NUM_MPI_PROCS 4 - DIRECTORY . - ) - - diff --git a/kokkos/basic/optional/ThreadPool/test/Makefile.am b/kokkos/basic/optional/ThreadPool/test/Makefile.am deleted file mode 100644 index 8e78cbf..0000000 --- a/kokkos/basic/optional/ThreadPool/test/Makefile.am +++ /dev/null @@ -1,55 +0,0 @@ -#@HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. -# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -#@HEADER - -SUBDIRS = - -# The following line helps the test harness recover from build errors. 
- -all-local: - -include $(top_builddir)/Makefile.export.threadpool - -EXEEXT = .exe - -noinst_PROGRAMS = test_tpi test_tpi_cpp test_sum - -test_tpi_SOURCES = test_main.c test_tpi_unit.c test_c_dnax.c test_c_tpi.c test_pthreads.c -test_tpi_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_tpi_CFLAGS = $(THREADPOOL_INCLUDES) -test_tpi_LDADD = $(THREADPOOL_LIBS) - -test_tpi_cpp_SOURCES = test_tpi.cpp -test_tpi_cpp_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_tpi_cpp_CXXFLAGS = $(THREADPOOL_INCLUDES) -test_tpi_cpp_LDADD = $(THREADPOOL_LIBS) - -test_sum_SOURCES = test_mpi_sum.c -test_sum_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_sum_CFLAGS = $(THREADPOOL_INCLUDES) -test_sum_LDADD = $(THREADPOOL_LIBS) - diff --git a/kokkos/basic/optional/ThreadPool/test/Makefile.in b/kokkos/basic/optional/ThreadPool/test/Makefile.in deleted file mode 100644 index ffc5220..0000000 --- a/kokkos/basic/optional/ThreadPool/test/Makefile.in +++ /dev/null @@ -1,730 +0,0 @@ -# Makefile.in generated by automake 1.10 from Makefile.am. -# @configure_input@ - -# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, -# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. -# This Makefile.in is free software; the Free Software Foundation -# gives unlimited permission to copy and/or distribute it, -# with or without modifications, as long as this notice is preserved. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY, to the extent permitted by law; without -# even the implied warranty of MERCHANTABILITY or FITNESS FOR A -# PARTICULAR PURPOSE. - -@SET_MAKE@ - -#@HEADER -# ************************************************************************ -# -# ThreadPool Package -# Copyright (2008) Sandia Corporation -# -# Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -# license for use of this work by or on behalf of the U.S. Government. -# -# This library is free software; you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as -# published by the Free Software Foundation; either version 2.1 of the -# License, or (at your option) any later version. -# -# This library is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -# USA -# Questions? 
Contact Carter Edwards (hcedwar@sandia.gov) -# -# ************************************************************************ -#@HEADER - -VPATH = @srcdir@ -pkgdatadir = $(datadir)/@PACKAGE@ -pkglibdir = $(libdir)/@PACKAGE@ -pkgincludedir = $(includedir)/@PACKAGE@ -am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd -install_sh_DATA = $(install_sh) -c -m 644 -install_sh_PROGRAM = $(install_sh) -c -install_sh_SCRIPT = $(install_sh) -c -INSTALL_HEADER = $(INSTALL_DATA) -transform = $(program_transform_name) -NORMAL_INSTALL = : -PRE_INSTALL = : -POST_INSTALL = : -NORMAL_UNINSTALL = : -PRE_UNINSTALL = : -POST_UNINSTALL = : -build_triplet = @build@ -host_triplet = @host@ -target_triplet = @target@ -noinst_PROGRAMS = test_tpi$(EXEEXT) test_tpi_cpp$(EXEEXT) \ - test_sum$(EXEEXT) -subdir = test -DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in -ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/config/acx_pthread.m4 \ - $(top_srcdir)/config/tac_arg_check_mpi.m4 \ - $(top_srcdir)/config/tac_arg_config_mpi.m4 \ - $(top_srcdir)/config/tac_arg_enable_export-makefiles.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature.m4 \ - $(top_srcdir)/config/tac_arg_enable_feature_sub_check.m4 \ - $(top_srcdir)/config/tac_arg_with_ar.m4 \ - $(top_srcdir)/config/tac_arg_with_flags.m4 \ - $(top_srcdir)/config/tac_arg_with_incdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libdirs.m4 \ - $(top_srcdir)/config/tac_arg_with_libs.m4 \ - $(top_srcdir)/config/tac_arg_with_perl.m4 \ - $(top_srcdir)/configure.ac -am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ - $(ACLOCAL_M4) -mkinstalldirs = $(install_sh) -d -CONFIG_HEADER = $(top_builddir)/src/ThreadPool_config.h -CONFIG_CLEAN_FILES = -PROGRAMS = $(noinst_PROGRAMS) -am_test_sum_OBJECTS = test_sum-test_mpi_sum.$(OBJEXT) -test_sum_OBJECTS = $(am_test_sum_OBJECTS) -test_sum_LINK = $(CCLD) $(test_sum_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ - $(LDFLAGS) -o $@ -am_test_tpi_OBJECTS = test_tpi-test_main.$(OBJEXT) \ - test_tpi-test_tpi_unit.$(OBJEXT) \ - test_tpi-test_c_dnax.$(OBJEXT) test_tpi-test_c_tpi.$(OBJEXT) \ - test_tpi-test_pthreads.$(OBJEXT) -test_tpi_OBJECTS = $(am_test_tpi_OBJECTS) -test_tpi_LINK = $(CCLD) $(test_tpi_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \ - $(LDFLAGS) -o $@ -am_test_tpi_cpp_OBJECTS = test_tpi_cpp-test_tpi.$(OBJEXT) -test_tpi_cpp_OBJECTS = $(am_test_tpi_cpp_OBJECTS) -test_tpi_cpp_LINK = $(CXXLD) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) \ - $(AM_LDFLAGS) $(LDFLAGS) -o $@ -DEFAULT_INCLUDES = -I. 
-I$(top_builddir)/src@am__isrc@ -depcomp = $(SHELL) $(top_srcdir)/config/depcomp -am__depfiles_maybe = depfiles -COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ - $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -CCLD = $(CC) -LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \ - $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -CXXLD = $(CXX) -CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \ - -o $@ -SOURCES = $(test_sum_SOURCES) $(test_tpi_SOURCES) \ - $(test_tpi_cpp_SOURCES) -DIST_SOURCES = $(test_sum_SOURCES) $(test_tpi_SOURCES) \ - $(test_tpi_cpp_SOURCES) -RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \ - html-recursive info-recursive install-data-recursive \ - install-dvi-recursive install-exec-recursive \ - install-html-recursive install-info-recursive \ - install-pdf-recursive install-ps-recursive install-recursive \ - installcheck-recursive installdirs-recursive pdf-recursive \ - ps-recursive uninstall-recursive -RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive \ - distclean-recursive maintainer-clean-recursive -ETAGS = etags -CTAGS = ctags -DIST_SUBDIRS = $(SUBDIRS) -DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) -ACLOCAL = @ACLOCAL@ -ALTERNATE_AR = @ALTERNATE_AR@ -AMTAR = @AMTAR@ -AUTOCONF = @AUTOCONF@ -AUTOHEADER = @AUTOHEADER@ -AUTOMAKE = @AUTOMAKE@ -AWK = @AWK@ -CC = @CC@ -CCDEPMODE = @CCDEPMODE@ -CFLAGS = @CFLAGS@ -CPPFLAGS = @CPPFLAGS@ -CXX = @CXX@ -CXXCPP = @CXXCPP@ -CXXDEPMODE = @CXXDEPMODE@ -CXXFLAGS = @CXXFLAGS@ -CYGPATH_W = @CYGPATH_W@ -DEFS = @DEFS@ -DEPDIR = @DEPDIR@ -ECHO_C = @ECHO_C@ -ECHO_N = @ECHO_N@ -ECHO_T = @ECHO_T@ -EGREP = @EGREP@ -EXEEXT = .exe -GREP = @GREP@ -HAVE_PERL = @HAVE_PERL@ -INSTALL = @INSTALL@ -INSTALL_DATA = @INSTALL_DATA@ -INSTALL_PROGRAM = @INSTALL_PROGRAM@ -INSTALL_SCRIPT = @INSTALL_SCRIPT@ -INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ -LDFLAGS = @LDFLAGS@ -LIBOBJS = @LIBOBJS@ -LIBS = @LIBS@ -LTLIBOBJS = @LTLIBOBJS@ -MAINT = @MAINT@ -MAKEINFO = @MAKEINFO@ -MKDIR_P = @MKDIR_P@ -MPI_CC_EXISTS = @MPI_CC_EXISTS@ -MPI_CXX = @MPI_CXX@ -MPI_CXX_EXISTS = @MPI_CXX_EXISTS@ -MPI_F77_EXISTS = @MPI_F77_EXISTS@ -MPI_TEMP_CXX = @MPI_TEMP_CXX@ -OBJEXT = @OBJEXT@ -PACKAGE = @PACKAGE@ -PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ -PACKAGE_NAME = @PACKAGE_NAME@ -PACKAGE_STRING = @PACKAGE_STRING@ -PACKAGE_TARNAME = @PACKAGE_TARNAME@ -PACKAGE_VERSION = @PACKAGE_VERSION@ -PATH_SEPARATOR = @PATH_SEPARATOR@ -PERL_EXE = @PERL_EXE@ -PTHREAD_CC = @PTHREAD_CC@ -PTHREAD_CFLAGS = @PTHREAD_CFLAGS@ -PTHREAD_LIBS = @PTHREAD_LIBS@ -RANLIB = @RANLIB@ -SET_MAKE = @SET_MAKE@ -SHELL = @SHELL@ -STRIP = @STRIP@ -VERSION = @VERSION@ -abs_builddir = @abs_builddir@ -abs_srcdir = @abs_srcdir@ -abs_top_builddir = @abs_top_builddir@ -abs_top_srcdir = @abs_top_srcdir@ -ac_aux_dir = @ac_aux_dir@ -ac_ct_CC = @ac_ct_CC@ -ac_ct_CXX = @ac_ct_CXX@ -am__include = @am__include@ -am__leading_dot = @am__leading_dot@ -am__quote = @am__quote@ -am__tar = @am__tar@ -am__untar = @am__untar@ -bindir = @bindir@ -build = @build@ -build_alias = @build_alias@ -build_cpu = @build_cpu@ -build_os = @build_os@ -build_vendor = @build_vendor@ -builddir = @builddir@ -datadir = @datadir@ -datarootdir = @datarootdir@ -docdir = @docdir@ -dvidir = @dvidir@ -exec_prefix = @exec_prefix@ -host = @host@ -host_alias = @host_alias@ -host_cpu = @host_cpu@ -host_os = @host_os@ -host_vendor = @host_vendor@ -htmldir = @htmldir@ -includedir = @includedir@ 
-infodir = @infodir@ -install_sh = @install_sh@ -libdir = @libdir@ -libexecdir = @libexecdir@ -localedir = @localedir@ -localstatedir = @localstatedir@ -mandir = @mandir@ -mkdir_p = @mkdir_p@ -oldincludedir = @oldincludedir@ -pdfdir = @pdfdir@ -prefix = @prefix@ -program_transform_name = @program_transform_name@ -psdir = @psdir@ -sbindir = @sbindir@ -sharedstatedir = @sharedstatedir@ -srcdir = @srcdir@ -sysconfdir = @sysconfdir@ -target = @target@ -target_alias = @target_alias@ -target_cpu = @target_cpu@ -target_os = @target_os@ -target_vendor = @target_vendor@ -top_builddir = @top_builddir@ -top_srcdir = @top_srcdir@ -SUBDIRS = -test_tpi_SOURCES = test_main.c test_tpi_unit.c test_c_dnax.c test_c_tpi.c test_pthreads.c -test_tpi_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_tpi_CFLAGS = $(THREADPOOL_INCLUDES) -test_tpi_LDADD = $(THREADPOOL_LIBS) -test_tpi_cpp_SOURCES = test_tpi.cpp -test_tpi_cpp_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_tpi_cpp_CXXFLAGS = $(THREADPOOL_INCLUDES) -test_tpi_cpp_LDADD = $(THREADPOOL_LIBS) -test_sum_SOURCES = test_mpi_sum.c -test_sum_DEPENDENCIES = $(top_builddir)/src/libtpi.a -test_sum_CFLAGS = $(THREADPOOL_INCLUDES) -test_sum_LDADD = $(THREADPOOL_LIBS) -all: all-recursive - -.SUFFIXES: -.SUFFIXES: .c .cpp .o .obj -$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) - @for dep in $?; do \ - case '$(am__configure_deps)' in \ - *$$dep*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ - && exit 0; \ - exit 1;; \ - esac; \ - done; \ - echo ' cd $(top_srcdir) && $(AUTOMAKE) --foreign test/Makefile'; \ - cd $(top_srcdir) && \ - $(AUTOMAKE) --foreign test/Makefile -.PRECIOUS: Makefile -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - @case '$?' in \ - *config.status*) \ - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ - *) \ - echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ - cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ - esac; - -$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh -$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) - cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh - -clean-noinstPROGRAMS: - -test -z "$(noinst_PROGRAMS)" || rm -f $(noinst_PROGRAMS) -test_sum$(EXEEXT): $(test_sum_OBJECTS) $(test_sum_DEPENDENCIES) - @rm -f test_sum$(EXEEXT) - $(test_sum_LINK) $(test_sum_OBJECTS) $(test_sum_LDADD) $(LIBS) -test_tpi$(EXEEXT): $(test_tpi_OBJECTS) $(test_tpi_DEPENDENCIES) - @rm -f test_tpi$(EXEEXT) - $(test_tpi_LINK) $(test_tpi_OBJECTS) $(test_tpi_LDADD) $(LIBS) -test_tpi_cpp$(EXEEXT): $(test_tpi_cpp_OBJECTS) $(test_tpi_cpp_DEPENDENCIES) - @rm -f test_tpi_cpp$(EXEEXT) - $(test_tpi_cpp_LINK) $(test_tpi_cpp_OBJECTS) $(test_tpi_cpp_LDADD) $(LIBS) - -mostlyclean-compile: - -rm -f *.$(OBJEXT) - -distclean-compile: - -rm -f *.tab.c - -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_sum-test_mpi_sum.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_c_dnax.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_c_tpi.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_main.Po@am__quote@ -@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/test_tpi-test_pthreads.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi-test_tpi_unit.Po@am__quote@ -@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test_tpi_cpp-test_tpi.Po@am__quote@ - -.c.o: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c $< - -.c.obj: -@am__fastdepCC_TRUE@ $(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(COMPILE) -c `$(CYGPATH_W) '$<'` - -test_sum-test_mpi_sum.o: test_mpi_sum.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -MT test_sum-test_mpi_sum.o -MD -MP -MF $(DEPDIR)/test_sum-test_mpi_sum.Tpo -c -o test_sum-test_mpi_sum.o `test -f 'test_mpi_sum.c' || echo '$(srcdir)/'`test_mpi_sum.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_sum-test_mpi_sum.Tpo $(DEPDIR)/test_sum-test_mpi_sum.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_mpi_sum.c' object='test_sum-test_mpi_sum.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -c -o test_sum-test_mpi_sum.o `test -f 'test_mpi_sum.c' || echo '$(srcdir)/'`test_mpi_sum.c - -test_sum-test_mpi_sum.obj: test_mpi_sum.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -MT test_sum-test_mpi_sum.obj -MD -MP -MF $(DEPDIR)/test_sum-test_mpi_sum.Tpo -c -o test_sum-test_mpi_sum.obj `if test -f 'test_mpi_sum.c'; then $(CYGPATH_W) 'test_mpi_sum.c'; else $(CYGPATH_W) '$(srcdir)/test_mpi_sum.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_sum-test_mpi_sum.Tpo $(DEPDIR)/test_sum-test_mpi_sum.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_mpi_sum.c' object='test_sum-test_mpi_sum.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_sum_CFLAGS) $(CFLAGS) -c -o test_sum-test_mpi_sum.obj `if test -f 'test_mpi_sum.c'; then $(CYGPATH_W) 'test_mpi_sum.c'; else $(CYGPATH_W) '$(srcdir)/test_mpi_sum.c'; fi` - -test_tpi-test_main.o: test_main.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_main.o -MD -MP -MF $(DEPDIR)/test_tpi-test_main.Tpo -c -o test_tpi-test_main.o `test -f 'test_main.c' || echo '$(srcdir)/'`test_main.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_main.Tpo $(DEPDIR)/test_tpi-test_main.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_main.c' object='test_tpi-test_main.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) 
$(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_main.o `test -f 'test_main.c' || echo '$(srcdir)/'`test_main.c - -test_tpi-test_main.obj: test_main.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_main.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_main.Tpo -c -o test_tpi-test_main.obj `if test -f 'test_main.c'; then $(CYGPATH_W) 'test_main.c'; else $(CYGPATH_W) '$(srcdir)/test_main.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_main.Tpo $(DEPDIR)/test_tpi-test_main.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_main.c' object='test_tpi-test_main.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_main.obj `if test -f 'test_main.c'; then $(CYGPATH_W) 'test_main.c'; else $(CYGPATH_W) '$(srcdir)/test_main.c'; fi` - -test_tpi-test_tpi_unit.o: test_tpi_unit.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_tpi_unit.o -MD -MP -MF $(DEPDIR)/test_tpi-test_tpi_unit.Tpo -c -o test_tpi-test_tpi_unit.o `test -f 'test_tpi_unit.c' || echo '$(srcdir)/'`test_tpi_unit.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_tpi_unit.Tpo $(DEPDIR)/test_tpi-test_tpi_unit.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_tpi_unit.c' object='test_tpi-test_tpi_unit.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_tpi_unit.o `test -f 'test_tpi_unit.c' || echo '$(srcdir)/'`test_tpi_unit.c - -test_tpi-test_tpi_unit.obj: test_tpi_unit.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_tpi_unit.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_tpi_unit.Tpo -c -o test_tpi-test_tpi_unit.obj `if test -f 'test_tpi_unit.c'; then $(CYGPATH_W) 'test_tpi_unit.c'; else $(CYGPATH_W) '$(srcdir)/test_tpi_unit.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_tpi_unit.Tpo $(DEPDIR)/test_tpi-test_tpi_unit.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_tpi_unit.c' object='test_tpi-test_tpi_unit.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_tpi_unit.obj `if test -f 'test_tpi_unit.c'; then $(CYGPATH_W) 'test_tpi_unit.c'; else $(CYGPATH_W) '$(srcdir)/test_tpi_unit.c'; fi` - -test_tpi-test_c_dnax.o: test_c_dnax.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_dnax.o -MD -MP -MF $(DEPDIR)/test_tpi-test_c_dnax.Tpo -c -o test_tpi-test_c_dnax.o `test -f 'test_c_dnax.c' || echo '$(srcdir)/'`test_c_dnax.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_c_dnax.Tpo $(DEPDIR)/test_tpi-test_c_dnax.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_c_dnax.c' object='test_tpi-test_c_dnax.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) 
@AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_dnax.o `test -f 'test_c_dnax.c' || echo '$(srcdir)/'`test_c_dnax.c - -test_tpi-test_c_dnax.obj: test_c_dnax.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_dnax.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_c_dnax.Tpo -c -o test_tpi-test_c_dnax.obj `if test -f 'test_c_dnax.c'; then $(CYGPATH_W) 'test_c_dnax.c'; else $(CYGPATH_W) '$(srcdir)/test_c_dnax.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_c_dnax.Tpo $(DEPDIR)/test_tpi-test_c_dnax.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_c_dnax.c' object='test_tpi-test_c_dnax.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_dnax.obj `if test -f 'test_c_dnax.c'; then $(CYGPATH_W) 'test_c_dnax.c'; else $(CYGPATH_W) '$(srcdir)/test_c_dnax.c'; fi` - -test_tpi-test_c_tpi.o: test_c_tpi.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_tpi.o -MD -MP -MF $(DEPDIR)/test_tpi-test_c_tpi.Tpo -c -o test_tpi-test_c_tpi.o `test -f 'test_c_tpi.c' || echo '$(srcdir)/'`test_c_tpi.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_c_tpi.Tpo $(DEPDIR)/test_tpi-test_c_tpi.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_c_tpi.c' object='test_tpi-test_c_tpi.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_tpi.o `test -f 'test_c_tpi.c' || echo '$(srcdir)/'`test_c_tpi.c - -test_tpi-test_c_tpi.obj: test_c_tpi.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_c_tpi.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_c_tpi.Tpo -c -o test_tpi-test_c_tpi.obj `if test -f 'test_c_tpi.c'; then $(CYGPATH_W) 'test_c_tpi.c'; else $(CYGPATH_W) '$(srcdir)/test_c_tpi.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_c_tpi.Tpo $(DEPDIR)/test_tpi-test_c_tpi.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_c_tpi.c' object='test_tpi-test_c_tpi.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_c_tpi.obj `if test -f 'test_c_tpi.c'; then $(CYGPATH_W) 'test_c_tpi.c'; else $(CYGPATH_W) '$(srcdir)/test_c_tpi.c'; fi` - -test_tpi-test_pthreads.o: test_pthreads.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_pthreads.o -MD -MP -MF $(DEPDIR)/test_tpi-test_pthreads.Tpo -c -o test_tpi-test_pthreads.o `test -f 'test_pthreads.c' || echo '$(srcdir)/'`test_pthreads.c -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_pthreads.Tpo $(DEPDIR)/test_tpi-test_pthreads.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_pthreads.c' object='test_tpi-test_pthreads.o' libtool=no @AMDEPBACKSLASH@ 
-@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_pthreads.o `test -f 'test_pthreads.c' || echo '$(srcdir)/'`test_pthreads.c - -test_tpi-test_pthreads.obj: test_pthreads.c -@am__fastdepCC_TRUE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -MT test_tpi-test_pthreads.obj -MD -MP -MF $(DEPDIR)/test_tpi-test_pthreads.Tpo -c -o test_tpi-test_pthreads.obj `if test -f 'test_pthreads.c'; then $(CYGPATH_W) 'test_pthreads.c'; else $(CYGPATH_W) '$(srcdir)/test_pthreads.c'; fi` -@am__fastdepCC_TRUE@ mv -f $(DEPDIR)/test_tpi-test_pthreads.Tpo $(DEPDIR)/test_tpi-test_pthreads.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ source='test_pthreads.c' object='test_tpi-test_pthreads.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_CFLAGS) $(CFLAGS) -c -o test_tpi-test_pthreads.obj `if test -f 'test_pthreads.c'; then $(CYGPATH_W) 'test_pthreads.c'; else $(CYGPATH_W) '$(srcdir)/test_pthreads.c'; fi` - -.cpp.o: -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $< -@am__fastdepCXX_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ $< - -.cpp.obj: -@am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'` -@am__fastdepCXX_TRUE@ mv -f $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='$<' object='$@' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXXCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'` - -test_tpi_cpp-test_tpi.o: test_tpi.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -MT test_tpi_cpp-test_tpi.o -MD -MP -MF $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo -c -o test_tpi_cpp-test_tpi.o `test -f 'test_tpi.cpp' || echo '$(srcdir)/'`test_tpi.cpp -@am__fastdepCXX_TRUE@ mv -f $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo $(DEPDIR)/test_tpi_cpp-test_tpi.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ source='test_tpi.cpp' object='test_tpi_cpp-test_tpi.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -c -o test_tpi_cpp-test_tpi.o `test -f 'test_tpi.cpp' || echo '$(srcdir)/'`test_tpi.cpp - -test_tpi_cpp-test_tpi.obj: test_tpi.cpp -@am__fastdepCXX_TRUE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -MT test_tpi_cpp-test_tpi.obj -MD -MP -MF $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo -c -o test_tpi_cpp-test_tpi.obj `if test -f 'test_tpi.cpp'; then $(CYGPATH_W) 'test_tpi.cpp'; else $(CYGPATH_W) '$(srcdir)/test_tpi.cpp'; fi` -@am__fastdepCXX_TRUE@ mv -f $(DEPDIR)/test_tpi_cpp-test_tpi.Tpo $(DEPDIR)/test_tpi_cpp-test_tpi.Po -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ 
source='test_tpi.cpp' object='test_tpi_cpp-test_tpi.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCXX_FALSE@ $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(test_tpi_cpp_CXXFLAGS) $(CXXFLAGS) -c -o test_tpi_cpp-test_tpi.obj `if test -f 'test_tpi.cpp'; then $(CYGPATH_W) 'test_tpi.cpp'; else $(CYGPATH_W) '$(srcdir)/test_tpi.cpp'; fi` - -# This directory's subdirectories are mostly independent; you can cd -# into them and run `make' without going through this Makefile. -# To change the values of `make' variables: instead of editing Makefiles, -# (1) if the variable is set in `config.status', edit `config.status' -# (which will cause the Makefiles to be regenerated when you run `make'); -# (2) otherwise, pass the desired values on the `make' command line. -$(RECURSIVE_TARGETS): - @failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - target=`echo $@ | sed s/-recursive//`; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - dot_seen=yes; \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done; \ - if test "$$dot_seen" = "no"; then \ - $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \ - fi; test -z "$$fail" - -$(RECURSIVE_CLEAN_TARGETS): - @failcom='exit 1'; \ - for f in x $$MAKEFLAGS; do \ - case $$f in \ - *=* | --[!k]*);; \ - *k*) failcom='fail=yes';; \ - esac; \ - done; \ - dot_seen=no; \ - case "$@" in \ - distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \ - *) list='$(SUBDIRS)' ;; \ - esac; \ - rev=''; for subdir in $$list; do \ - if test "$$subdir" = "."; then :; else \ - rev="$$subdir $$rev"; \ - fi; \ - done; \ - rev="$$rev ."; \ - target=`echo $@ | sed s/-recursive//`; \ - for subdir in $$rev; do \ - echo "Making $$target in $$subdir"; \ - if test "$$subdir" = "."; then \ - local_target="$$target-am"; \ - else \ - local_target="$$target"; \ - fi; \ - (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \ - || eval $$failcom; \ - done && test -z "$$fail" -tags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \ - done -ctags-recursive: - list='$(SUBDIRS)'; for subdir in $$list; do \ - test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \ - done - -ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES) - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - mkid -fID $$unique -tags: TAGS - -TAGS: tags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \ - include_option=--etags-include; \ - empty_fix=.; \ - else \ - include_option=--include; \ - empty_fix=; \ - fi; \ - list='$(SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test ! 
-f $$subdir/TAGS || \ - tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \ - fi; \ - done; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - if test -z "$(ETAGS_ARGS)$$tags$$unique"; then :; else \ - test -n "$$unique" || unique=$$empty_fix; \ - $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \ - $$tags $$unique; \ - fi -ctags: CTAGS -CTAGS: ctags-recursive $(HEADERS) $(SOURCES) $(TAGS_DEPENDENCIES) \ - $(TAGS_FILES) $(LISP) - tags=; \ - here=`pwd`; \ - list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \ - unique=`for i in $$list; do \ - if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \ - done | \ - $(AWK) ' { files[$$0] = 1; } \ - END { for (i in files) print i; }'`; \ - test -z "$(CTAGS_ARGS)$$tags$$unique" \ - || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \ - $$tags $$unique - -GTAGS: - here=`$(am__cd) $(top_builddir) && pwd` \ - && cd $(top_srcdir) \ - && gtags -i $(GTAGS_ARGS) $$here - -distclean-tags: - -rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags - -distdir: $(DISTFILES) - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - dist_files=`for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ - case $$dist_files in \ - */*) $(MKDIR_P) `echo "$$dist_files" | \ - sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ - sort -u` ;; \ - esac; \ - for file in $$dist_files; do \ - if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ - if test -d $$d/$$file; then \ - dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ - if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ - cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ - fi; \ - cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ - else \ - test -f $(distdir)/$$file \ - || cp -p $$d/$$file $(distdir)/$$file \ - || exit 1; \ - fi; \ - done - list='$(DIST_SUBDIRS)'; for subdir in $$list; do \ - if test "$$subdir" = .; then :; else \ - test -d "$(distdir)/$$subdir" \ - || $(MKDIR_P) "$(distdir)/$$subdir" \ - || exit 1; \ - distdir=`$(am__cd) $(distdir) && pwd`; \ - top_distdir=`$(am__cd) $(top_distdir) && pwd`; \ - (cd $$subdir && \ - $(MAKE) $(AM_MAKEFLAGS) \ - top_distdir="$$top_distdir" \ - distdir="$$distdir/$$subdir" \ - am__remove_distdir=: \ - am__skip_length_check=: \ - distdir) \ - || exit 1; \ - fi; \ - done -check-am: all-am -check: check-recursive -all-am: Makefile $(PROGRAMS) all-local -installdirs: installdirs-recursive -installdirs-am: -install: install-recursive -install-exec: install-exec-recursive -install-data: install-data-recursive -uninstall: uninstall-recursive - -install-am: all-am - @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am - -installcheck: installcheck-recursive -install-strip: - $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ - install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ - `test -z '$(STRIP)' || \ - echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install -mostlyclean-generic: - -clean-generic: - -distclean-generic: - -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) - -maintainer-clean-generic: - @echo "This command is intended for maintainers to use" - @echo "it deletes files that may require special tools to 
rebuild." -clean: clean-recursive - -clean-am: clean-generic clean-noinstPROGRAMS mostlyclean-am - -distclean: distclean-recursive - -rm -rf ./$(DEPDIR) - -rm -f Makefile -distclean-am: clean-am distclean-compile distclean-generic \ - distclean-tags - -dvi: dvi-recursive - -dvi-am: - -html: html-recursive - -info: info-recursive - -info-am: - -install-data-am: - -install-dvi: install-dvi-recursive - -install-exec-am: - -install-html: install-html-recursive - -install-info: install-info-recursive - -install-man: - -install-pdf: install-pdf-recursive - -install-ps: install-ps-recursive - -installcheck-am: - -maintainer-clean: maintainer-clean-recursive - -rm -rf ./$(DEPDIR) - -rm -f Makefile -maintainer-clean-am: distclean-am maintainer-clean-generic - -mostlyclean: mostlyclean-recursive - -mostlyclean-am: mostlyclean-compile mostlyclean-generic - -pdf: pdf-recursive - -pdf-am: - -ps: ps-recursive - -ps-am: - -uninstall-am: - -.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) install-am \ - install-strip - -.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \ - all all-am all-local check check-am clean clean-generic \ - clean-noinstPROGRAMS ctags ctags-recursive distclean \ - distclean-compile distclean-generic distclean-tags distdir dvi \ - dvi-am html html-am info info-am install install-am \ - install-data install-data-am install-dvi install-dvi-am \ - install-exec install-exec-am install-html install-html-am \ - install-info install-info-am install-man install-pdf \ - install-pdf-am install-ps install-ps-am install-strip \ - installcheck installcheck-am installdirs installdirs-am \ - maintainer-clean maintainer-clean-generic mostlyclean \ - mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \ - tags tags-recursive uninstall uninstall-am - - -# The following line helps the test harness recover from build errors. - -all-local: - -include $(top_builddir)/Makefile.export.threadpool -# Tell versions [3.59,3.63) of GNU make to not export all variables. -# Otherwise a system limit (for SysV at least) may be exceeded. -.NOEXPORT: diff --git a/kokkos/basic/optional/ThreadPool/test/build_gnu b/kokkos/basic/optional/ThreadPool/test/build_gnu deleted file mode 100755 index bba4b90..0000000 --- a/kokkos/basic/optional/ThreadPool/test/build_gnu +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash - -TEST_SRC="test_main.c test_c_dnax.c test_tpi_unit.c test_pthreads.c" - -LIB_SRC="../src/TPI.c ../src/TPI_Walltime.c" - -LIB_OBJ="TPI.o TPI_Walltime.o" - -# OPT="-O3" -OPT="-g" -# OPT="-O" - -#CFLAGS="${OPT} -std=c99 -Wall -Wextra" - -CFLAGS=" ${OPT} -std=c89 -Wall -Wextra" -CCFLAGS="${OPT} -std=c++98 -Wall -Wextra" - -echo build: gcc ${CFLAGS} - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "#define HAVE_PTHREAD 1" > ThreadPool_config.h - -gcc ${CFLAGS} -c \ - -I. -I../src ${LIB_SRC} - -gcc ${CFLAGS} \ - -o test_tpi.gnu.exe \ - -I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread -lm - -g++ ${CCFLAGS} \ - -o test_tpi_cpp.gnu.exe \ - -I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ -lm - -gcc ${CFLAGS} \ - -o test_sum.gnu.exe \ - -I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread -lm - -#----------------------------------------------------------------------- - -mpicc ${CFLAGS} \ - -o test_sum.mpi.gnu.exe \ - -I. 
-I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread -lm - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h - -gcc ${CFLAGS} -c \ - -I. -I../src ${LIB_SRC} - -gcc ${CFLAGS} \ - -o test_tpi.gnu.noth.exe \ - -I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread -lm - -g++ ${CCFLAGS} \ - -o test_tpi_cpp.gnu.noth.exe \ - -I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ -lm - -gcc ${CFLAGS} \ - -o test_sum.gnu.noth.exe \ - -I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread -lm - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h -echo "#define HAVE_MPI 1" >> ThreadPool_config.h - -mpicc ${CFLAGS} \ - -o test_sum.mpi.gnu.noth.exe \ - -I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread -lm - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h - diff --git a/kokkos/basic/optional/ThreadPool/test/build_intel b/kokkos/basic/optional/ThreadPool/test/build_intel deleted file mode 100755 index accb0a0..0000000 --- a/kokkos/basic/optional/ThreadPool/test/build_intel +++ /dev/null @@ -1,82 +0,0 @@ -#!/bin/bash - -# . /usr/local/modules/3.2.6/Modules/$MODULE_VERSION/bin/modulecmd tcsh \ -# load sierra-devel-desktop-intel-10.1ip - - -TEST_SRC="test_main.c test_c_dnax.c test_tpi_unit.c test_pthreads.c" - -LIB_SRC="../src/TPI.c ../src/TPI_Walltime.c" - -LIB_OBJ="TPI.o TPI_Walltime.o" - -#CFLAGS="-std=c99 -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419" -#CFLAGS="-std=c89 -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419" -CCFLAGS=" -strict-ansi -Wall -Wcheck -Werror -wd141 -wd869 -wd1418 -wd1419" - -OPT="-O3" -# OPT="-g" -# OPT="-O" - -echo build ${OPT} - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "#define HAVE_PTHREAD 1" > ThreadPool_config.h - -icc ${CFLAGS} ${OPT} -c \ - -I. -I../src ${LIB_SRC} - -icc ${CFLAGS} ${OPT} \ - -o test_tpi.intel.exe \ - -I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread - -icc ${CCFLAGS} ${OPT} \ - -o test_tpi_cpp.intel.exe \ - -I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ - -icc ${CFLAGS} ${OPT} \ - -o test_sum.intel.exe \ - -I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread - -#----------------------------------------------------------------------- - -mpicc ${CFLAGS} ${OPT} \ - -o test_sum.mpi.intel.exe \ - -I. -I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h - -icc ${CFLAGS} ${OPT} -c \ - -I. -I../src ${LIB_SRC} - -icc ${CFLAGS} ${OPT} \ - -o test_tpi.intel.noth.exe \ - -I. -I../src ${TEST_SRC} ${LIB_OBJ} -lpthread - -icc ${CCFLAGS} ${OPT} \ - -o test_tpi_cpp.intel.noth.exe \ - -I. -I../src test_tpi.cpp ${LIB_OBJ} -lpthread -lstdc++ - -icc ${CFLAGS} ${OPT} \ - -o test_sum.intel.noth.exe \ - -I. -I../src test_mpi_sum.c ${LIB_OBJ} -lpthread - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h -echo "/* #define HAVE_PTHREAD 1 */" > ThreadPool_config.h -echo "#define HAVE_MPI 1" >> ThreadPool_config.h - -mpicc ${CFLAGS} ${OPT} \ - -o test_sum.mpi.intel.noth.exe \ - -I. 
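
Across these deleted build scripts the configuration is switched purely by regenerating ThreadPool_config.h: `#define HAVE_PTHREAD 1` selects the pthread build, a commented-out define selects the serial "noth" fallback, and `#define HAVE_MPI 1` is appended for the MPI variants before TPI is rebuilt and every test relinked. The C fragment below is only an illustration of how such a generated header is typically consumed; the function name is invented and does not come from TPI.c.

#include "ThreadPool_config.h"

#ifdef HAVE_PTHREAD
#include <pthread.h>
/* pthread build: real worker threads are available */
static int config_has_threads( void ) { return 1 ; }
#else
/* "noth" build: the same entry points fall back to serial execution */
static int config_has_threads( void ) { return 0 ; }
#endif
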
-I../src -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread - -#----------------------------------------------------------------------- - -rm -f ThreadPool_config.h - diff --git a/kokkos/basic/optional/ThreadPool/test/build_pgi b/kokkos/basic/optional/ThreadPool/test/build_pgi deleted file mode 100755 index 85799cc..0000000 --- a/kokkos/basic/optional/ThreadPool/test/build_pgi +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -export LM_LICENSE_FILE=7500@reddish -PGI_HOME="/usr/local/pgi_64/linux86-64/7.0-7" -MPICH_HOME="/usr/local/mpi/mpich/64Bit/1.2.7/pgi-6.0" - -export PATH="${PGI_HOME}/bin:${PATH}" - -TEST_SRC="test_main.c test_c_dnax.c test_c_tpi.c test_pthreads.c" - -LIB_SRC="../src/TPI_pthreads.c ../src/TPI_Walltime.c ../src/TPI_Concurrency.c" - -LIB_OBJ="TPI_pthreads.o TPI_Walltime.o TPI_Concurrency.o" - -#----------------------------------------------------------------------- - -pgcc -O4 -c \ - -I../include ${LIB_SRC} -lpthread - -pgcc -O4 \ - -o test_tpi.pgi.exe \ - -I../include ${TEST_SRC} ${LIB_OBJ} -lpthread - -pgCC -O4 \ - -o test_tpi_cpp.pgi.exe \ - -I../include test_tpi.cpp ${LIB_OBJ} -lpthread - -#----------------------------------------------------------------------- -# Enable PGI-MPI installation to accept as large a message as possible, 200 Mb - -# export P4_GLOBMEMSIZE="268435456" - -export PATH="${MPICH_HOME}/bin:${PGI_HOME}/bin:${PATH}" - -mpicc -c99 \ - -O4 \ - -o test_sum.mpi.pgi.exe \ - -I../include -DTEST_WITH_MPI test_mpi_sum.c ${LIB_OBJ} -lpthread - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c b/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c deleted file mode 100644 index 5f2866f..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.c +++ /dev/null @@ -1,562 +0,0 @@ - - -#include -#include - -#include - -/*--------------------------------------------------------------------*/ -/* Recursively split a box into into (up-ip) sub-boxes */ - -typedef const int RangeInput[2] ; -typedef int RangeOutput[2] ; -typedef RangeInput * const BoxInput ; -typedef RangeOutput * const BoxOutput ; - -static -void box_partition( int ip , int up , int axis , - BoxInput box , - int (* const p_box)[3][2] ) -{ - const int np = up - ip ; - if ( 1 == np ) { - p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ; - p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ; - p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ; - } - else { - const int n = box[ axis ][1] - box[ axis ][0] ; - const int np_low = np / 2 ; /* Rounded down */ - const int np_upp = np - np_low ; - - const int n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np))); - const int n_low = n - n_upp ; - const int next_axis = ( axis + 2 ) % 3 ; - - if ( np_low ) { /* P = [ip,ip+np_low) */ - int dbox[3][2] ; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ; - - dbox[ axis ][1] = dbox[ axis ][0] + n_low ; - - box_partition( ip, ip + np_low, next_axis, - (const int (*)[2]) dbox, p_box ); - } - - if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */ - int dbox[3][2] ; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ; - - ip += np_low ; - dbox[ axis ][0] += n_low ; - dbox[ axis ][1] = dbox[ axis ][0] + n_upp ; - - box_partition( ip, ip + np_upp, next_axis, - (const int (*)[2]) dbox, p_box ); - } - } -} - -void box_partition_rcb( 
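
The deleted box_partition() above is a recursive coordinate bisection: the rank range [ip,up) is halved, the current axis is cut so that each half receives cells in proportion to its rank count, and the recursion continues on the next axis. A small standalone sketch of that per-level split arithmetic, mirroring the expressions in the deleted code:

#include <stdio.h>

/* Split n cells between the lower and upper halves of np ranks, in the
 * same proportion the deleted box_partition() uses at every level. */
static void split_axis( const int n , const int np ,
                        int * n_low , int * n_upp )
{
  const int np_low = np / 2 ;      /* ranks in the lower half, rounded down */
  const int np_upp = np - np_low ; /* ranks in the upper half */

  *n_upp = (int) (((double) n) * ( ((double) np_upp) / ((double) np) ));
  *n_low = n - *n_upp ;
}

int main( void )
{
  int n_low , n_upp ;
  split_axis( 64 , 5 , & n_low , & n_upp );  /* 64 cells over 5 ranks */
  printf( "lower half gets %d cells, upper half gets %d cells\n" ,
          n_low , n_upp );
  return 0 ;
}
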
const int np , - const int root_box[3][2] , - int pbox[][3][2] ) -{ - box_partition( 0 , np , 2 , root_box , pbox ); -} - -/*--------------------------------------------------------------------*/ - -static int box_intersect( BoxInput a , BoxInput b , BoxOutput c ) -{ - int i ; - for ( i = 0 ; i < 3 ; ++i ) { - c[i][0] = a[i][0] < b[i][0] ? b[i][0] : a[i][0] ; - c[i][1] = a[i][1] < b[i][1] ? a[i][1] : b[i][1] ; - } - - return c[0][0] < c[0][1] && c[1][0] < c[1][1] && c[2][0] < c[2][1] ; -} - - -/*--------------------------------------------------------------------*/ - -static void global_to_use_box( BoxInput gbox , - BoxInput pbox , - const int ghost , - BoxOutput interiorBox , - BoxOutput useBox ) -{ - int i = 0 ; - - for ( i = 0 ; i < 3 ; ++i ) { - const int n = pbox[i][1] - pbox[i][0] ; - - if ( n < 0 ) { - abort(); - } - - interiorBox[i][0] = gbox[i][0] == pbox[i][0] - ? gbox[i][0] : pbox[i][0] + ghost ; - - interiorBox[i][1] = gbox[i][1] == pbox[i][1] - ? gbox[i][1] : pbox[i][1] - ghost ; - - if ( interiorBox[i][1] < pbox[i][0] ) { - interiorBox[i][1] = pbox[i][0] ; - } - - if ( interiorBox[i][0] > pbox[i][1] ) { - interiorBox[i][0] = pbox[i][1] ; - } - - if ( interiorBox[i][1] < interiorBox[i][0] ) { - interiorBox[i][1] = interiorBox[i][0] ; - } - - useBox[i][0] = pbox[i][0] - ghost ; - useBox[i][1] = pbox[i][1] + ghost ; - - if ( useBox[i][0] < gbox[i][0] ) { useBox[i][0] = gbox[i][0] ; } - if ( useBox[i][1] > gbox[i][1] ) { useBox[i][1] = gbox[i][1] ; } - } -} - - -/* A use-box is the owned box plus the ghost layers. - * Map a global (x,y,z) to a local integer ordinate. - */ -static int map_global_to_use_box( BoxInput useBox , - const int global_x , - const int global_y , - const int global_z ) -{ - const int nx = useBox[0][1] - useBox[0][0] ; - const int ny = useBox[1][1] - useBox[1][0] ; - const int nz = useBox[2][1] - useBox[2][0] ; - const int ix = global_x - useBox[0][0] ; - const int iy = global_y - useBox[1][0] ; - const int iz = global_z - useBox[2][0] ; - - const int good = 0 <= ix && ix < nx && - 0 <= iy && iy < ny && - 0 <= iz && iz < nz ; - - if ( nx < 0 || ny < 0 || nz < 0 ) { - abort(); - } - if ( ! good ) { - abort(); - } - - return good ? 
ix + iy * nx + iz * nx * ny : -1 ; -} - -int box_map_local( const int local_uses[3][2] , - const int map_local_id[] , - const int global_x , - const int global_y , - const int global_z ) -{ - int i = map_global_to_use_box( local_uses , global_x , global_y , global_z ); - - if ( 0 <= i ) { i = map_local_id[i] ; } - - return i ; -} - - -/*--------------------------------------------------------------------*/ - -static void resize_int( int ** a , int * allocLen , int newLen ) -{ - int k = 32; - while ( k < newLen ) { k <<= 1 ; } - if ( NULL == *a ) - { *a = malloc( sizeof(int)*(*allocLen = k) ); } - else if ( *allocLen < k ) - { *a = realloc(*a , sizeof(int)*(*allocLen = k)); } -} - -void box_partition_map( - const int np , - const int my_p , - const int gbox[3][2] , - const int pbox[][3][2] , - const int ghost , - - int map_use_box[3][2] , - int map_local_id[] , - int * map_count_interior , - int * map_count_owns , - int * map_count_uses , - int ** map_recv_pc , - int ** map_send_pc , - int ** map_send_id ) -{ - int * recv_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - int * send_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - - int id_length = 0 ; - - int * send_id = NULL ; - int send_id_size = 0 ; - - int own_length , use_length , int_length ; - int count_interior , count_parallel ; - int iSend ; - int g_ix , g_iy , g_iz ; - int i ; - - int my_int_box[3][2] ; - - global_to_use_box( gbox , pbox[my_p] , ghost , my_int_box , map_use_box ); - - own_length = ( pbox[my_p][0][1] - pbox[my_p][0][0] ) * - ( pbox[my_p][1][1] - pbox[my_p][1][0] ) * - ( pbox[my_p][2][1] - pbox[my_p][2][0] ); - - use_length = ( map_use_box[0][1] - map_use_box[0][0] ) * - ( map_use_box[1][1] - map_use_box[1][0] ) * - ( map_use_box[2][1] - map_use_box[2][0] ); - - int_length = ( my_int_box[0][1] - my_int_box[0][0] ) * - ( my_int_box[1][1] - my_int_box[1][0] ) * - ( my_int_box[2][1] - my_int_box[2][0] ); - - for ( i = 0 ; i < id_length ; ++i ) { map_local_id[i] = -1 ; } - - /* Fill in locally owned portion: { interior , parallel } */ - - count_interior = 0 ; - count_parallel = int_length ; - - for ( g_iz = pbox[my_p][2][0] ; g_iz < pbox[my_p][2][1] ; ++g_iz ) { - for ( g_iy = pbox[my_p][1][0] ; g_iy < pbox[my_p][1][1] ; ++g_iy ) { - for ( g_ix = pbox[my_p][0][0] ; g_ix < pbox[my_p][0][1] ; ++g_ix ) { - - const int local = - map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz ); - - if ( local < 0 ) { - abort(); - } - - if ( my_int_box[2][0] <= g_iz && g_iz < my_int_box[2][1] && - my_int_box[1][0] <= g_iy && g_iy < my_int_box[1][1] && - my_int_box[0][0] <= g_ix && g_ix < my_int_box[0][1] ) { - /* Interior */ - map_local_id[ local ] = count_interior++ ; - } - else { - /* Parallel */ - map_local_id[ local ] = count_parallel++ ; - } - } - } - } - - if ( count_interior != int_length ) { abort(); } - if ( count_parallel != own_length ) { abort(); } - - /* Fill in off-process received portion: { ( i + my_p ) % np } */ - - recv_pc[0] = count_parallel ; - recv_pc[1] = count_parallel ; - send_pc[0] = 0 ; - send_pc[1] = 0 ; - iSend = 0 ; - - for ( i = 1 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - int recv_box[3][2] ; - int send_box[3][2] ; - int other_int_box[3][2] ; - int other_use_box[3][2] ; - - /* Received portions */ - - if ( box_intersect( (BoxInput) map_use_box , (BoxInput) pbox[ip] , recv_box ) ) { - - for ( g_iz = recv_box[2][0] ; g_iz < recv_box[2][1] ; ++g_iz ) { - for ( g_iy = recv_box[1][0] ; g_iy < recv_box[1][1] ; ++g_iy ) { - for ( g_ix = recv_box[0][0] ; g_ix < recv_box[0][1] ; 
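
box_partition_map() numbers the local grid points in three bands: interior points first, then owned boundary ("parallel") points, then ghost points appended neighbor by neighbor in the order (my_p + i) % np, with recv_pc[] recording where each neighbor's block starts. The sketch below shows only that prefix bookkeeping; ghosts_per_neighbor is a hypothetical stand-in for the counts the deleted loop accumulates while walking each intersection box.

/* Build the recv_pc prefix spans used by the deleted box_partition_map().
 * ghosts_per_neighbor[i] is assumed to hold the number of points received
 * from neighbor (my_p + i) % np ; entry 0 (self) is unused. */
static void build_recv_prefix( const int np ,
                               const int count_owns ,
                               const int * ghosts_per_neighbor ,
                               int * recv_pc /* length np + 1 */ )
{
  int i ;
  recv_pc[0] = count_owns ;   /* ghost ordinals start after owned points */
  recv_pc[1] = count_owns ;   /* nothing is received from my own rank */
  for ( i = 1 ; i < np ; ++i ) {
    recv_pc[i+1] = recv_pc[i] + ghosts_per_neighbor[i] ;
  }
}
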
++g_ix ) { - - const int local = map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz ); - - map_local_id[ local ] = count_parallel++ ; - } - } - } - } - recv_pc[i+1] = count_parallel ; - - /* Sent items */ - - global_to_use_box( gbox, pbox[ip], ghost, other_int_box, other_use_box ); - - if ( box_intersect( (BoxInput) other_use_box , (BoxInput) pbox[my_p] , send_box ) ) { - - int nSend = ( send_box[0][1] - send_box[0][0] ) * - ( send_box[1][1] - send_box[1][0] ) * - ( send_box[2][1] - send_box[2][0] ); - - resize_int( & send_id , & send_id_size , (iSend + nSend ) ); - - for ( g_iz = send_box[2][0] ; g_iz < send_box[2][1] ; ++g_iz ) { - for ( g_iy = send_box[1][0] ; g_iy < send_box[1][1] ; ++g_iy ) { - for ( g_ix = send_box[0][0] ; g_ix < send_box[0][1] ; ++g_ix ) { - - const int local = map_global_to_use_box( (BoxInput) map_use_box, g_ix, g_iy, g_iz ); - - if ( map_local_id[ local ] < count_interior ) { abort(); } - - send_id[ iSend ] = map_local_id[ local ] ; - ++iSend ; - } - } - } - } - send_pc[i+1] = iSend ; - } - - if ( count_parallel != use_length ) { abort(); } - - *map_count_interior = int_length ; - *map_count_owns = own_length ; - *map_count_uses = use_length ; - *map_recv_pc = recv_pc ; - *map_send_pc = send_pc ; - *map_send_id = send_id ; -} - -/*--------------------------------------------------------------------*/ - -#ifdef UNIT_TEST - -static int box_contain( const int a[3][2] , const int b[3][2] ) -{ - return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] && - a[1][0] <= b[1][0] && b[1][1] <= a[1][1] && - a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ; -} - -static void box_print( FILE * fp , const int a[][2] ) -{ - fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }", - a[0][0] , a[0][1] , - a[1][0] , a[1][1] , - a[2][0] , a[2][1] ); -} - -static int box_disjoint( BoxInput a , BoxInput b ) -{ - return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] || - a[1][1] <= b[1][0] || b[1][1] <= a[1][0] || - a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ; -} - - -static void test_box( const int box[3][2] , const int np ) -{ - const int ncell_box = box[0][1] * box[1][1] * box[2][1] ; - int ncell_total = 0 ; - int ncell_min = ncell_box ; - int ncell_max = 0 ; - int (*pbox)[3][2] ; - int i , j ; - - pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); - - box_partition( 0 , np , 2 , box , pbox ); - - for ( i = 0 ; i < np ; ++i ) { - const int ncell = ( pbox[i][0][1] - pbox[i][0][0] ) * - ( pbox[i][1][1] - pbox[i][1][0] ) * - ( pbox[i][2][1] - pbox[i][2][0] ); - - if ( ! box_contain( box , (const int (*)[2]) pbox[i] ) ) { - fprintf(stdout," OUT OF BOUNDS pbox[%d/%d] = ",i,np); - box_print(stdout,(const int (*)[2]) pbox[i]); - fprintf(stdout,"\n"); - abort(); - } - - for ( j = i + 1 ; j < np ; ++j ) { - if ( ! 
box_disjoint( (const int (*)[2]) pbox[i] , - (const int (*)[2]) pbox[j] ) ) { - fprintf(stdout," NOT DISJOINT pbox[%d/%d] = ",i,np); - box_print(stdout, (const int (*)[2]) pbox[i]); - fprintf(stdout,"\n"); - fprintf(stdout," pbox[%d/%d] = ",j,np); - box_print(stdout, (const int (*)[2]) pbox[j]); - fprintf(stdout,"\n"); - abort(); - } - } - ncell_total += ncell ; - - if ( ncell_max < ncell ) { ncell_max = ncell ; } - if ( ncell < ncell_min ) { ncell_min = ncell ; } - } - - if ( ncell_total != ncell_box ) { - fprintf(stdout," WRONG CELL COUNT NP = %d\n",np); - abort(); - } - fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n", - np,ncell_box,ncell_box/np,ncell_min,ncell_max); - - free( pbox ); -} - -/*--------------------------------------------------------------------*/ - -static void test_maps( const int root_box[][2] , const int np ) -{ - const int ghost = 1 ; - const int nx_global = root_box[0][1] - root_box[0][0] ; - const int ny_global = root_box[1][1] - root_box[1][0] ; - int map_count_interior , map_count_owns , map_count_uses ; - int map_use_box[3][2] ; - int ieq , i , j ; - int (*pbox)[3][2] ; - int **local_values ; - int **map_local_id ; - int **map_recv_pc ; - int **map_send_pc ; - int **map_send_id ; - - pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); - - box_partition( 0 , np , 2 , root_box , pbox ); - - local_values = (int **) malloc( sizeof(int*) * np ); - map_local_id = (int **) malloc( sizeof(int*) * np ); - map_recv_pc = (int **) malloc( sizeof(int*) * np ); - map_send_pc = (int **) malloc( sizeof(int*) * np ); - map_send_id = (int **) malloc( sizeof(int*) * np ); - - /* Set each local value to the global equation number */ - - for ( ieq = i = 0 ; i < np ; ++i ) { - const int (*mybox)[2] = (const int (*)[2]) pbox[i] ; - const int nx = mybox[0][1] - mybox[0][0] ; - const int ny = mybox[1][1] - mybox[1][0] ; - const int nz = mybox[2][1] - mybox[2][0] ; - int ix , iy , iz ; - - map_local_id[i] = (int *) malloc( sizeof(int) * - ( nx + 2 * ghost ) * - ( ny + 2 * ghost ) * - ( nz + 2 * ghost ) ); - - /* Generate the partition maps for this rank */ - box_partition_map( np , i , root_box , - (const int (*)[3][2]) pbox , ghost , - map_use_box , - map_local_id[i] , - & map_count_interior , - & map_count_owns , - & map_count_uses , - & map_recv_pc[i] , - & map_send_pc[i] , & map_send_id[i] ); - - if ( map_count_uses != map_recv_pc[i][np] ) { abort(); } - - local_values[i] = (int *) malloc( sizeof(int) * map_count_uses ); - - for ( iz = map_use_box[2][0] ; iz < map_use_box[2][1] ; ++iz ) { - for ( iy = map_use_box[1][0] ; iy < map_use_box[1][1] ; ++iy ) { - for ( ix = map_use_box[0][0] ; ix < map_use_box[0][1] ; ++ix ) { - - const int igrid = map_global_to_use_box((BoxInput)map_use_box,ix,iy,iz); - const int ieq = map_local_id[i][ igrid ]; - - if ( 0 <= ieq ) { - local_values[i][ ieq ] = - ix + iy * nx_global + iz * nx_global * ny_global ; - } - } - } - } - } - - /* Pair-wise compare the local values */ - /* i == receiving processor rank */ - /* ip == sending processor rank */ - /* j == receiving processor data entry for message from 'ip' */ - /* jp == sending processor data entry for message to 'i' */ - - for ( i = 0 ; i < np ; ++i ) { - for ( j = 1 ; j < np ; ++j ) { - const int ip = ( i + j ) % np ; - const int jp = ( i + np - ip ) % np ; - const int nrecv = map_recv_pc[i] [j+1] - map_recv_pc[i] [j] ; - const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ; - int k ; - if ( nrecv != nsend ) { - fprintf(stderr,"P%d recv %d from 
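
The pairwise check in the deleted test_maps() relies on a symmetry of the neighbor ordering: if rank i receives its j-th message from ip = (i + j) % np, then that same exchange is send slot jp = (i + np - ip) % np on rank ip, so the two prefix spans must have equal length and matching values. A tiny standalone check of that index identity (np here is an arbitrary example value):

#include <assert.h>

int main( void )
{
  const int np = 7 ;   /* any positive process count works */
  int i , j ;
  for ( i = 0 ; i < np ; ++i ) {
    for ( j = 1 ; j < np ; ++j ) {
      const int ip = ( i + j ) % np ;        /* rank i receives from ip */
      const int jp = ( i + np - ip ) % np ;  /* matching send slot on ip */
      assert( ( ip + jp ) % np == i );       /* ip's jp-th partner is i */
    }
  }
  return 0 ;
}
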
P%d\n",i,nrecv,ip); - fprintf(stderr,"P%d send %d to P%d\n",ip,nsend,i); - abort(); - } - for ( k = 0 ; k < nrecv ; ++k ) { - const int irecv = map_recv_pc[i][j] + k ; - const int isend = map_send_pc[ip][jp] + k ; - const int val_irecv = local_values[i][irecv] ; - const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ; - if ( val_irecv != val_isend ) { - fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip); - fprintf(stderr,"P%d send[%d] = %d , to P%d\n",ip,k,val_isend,i); - abort(); - } - } - } - } - - for ( i = 0 ; i < np ; ++i ) { - free( map_local_id[i] ); - free( map_recv_pc[i] ); - free( map_send_pc[i] ); - free( map_send_id[i] ); - free( local_values[i] ); - } - free( map_send_id ); - free( map_send_pc ); - free( map_recv_pc ); - free( map_local_id ); - free( local_values ); - free( pbox ); -} - -/*--------------------------------------------------------------------*/ - -int main( int argc , char * argv[] ) -{ - int np_max = 256 ; - int box[3][2] = { { 0 , 64 } , { 0 , 64 } , { 0 , 64 } }; - int np = 0 ; - - switch( argc ) { - case 3: - sscanf(argv[1],"%d",&np); - sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ); - if ( 0 < np ) { test_box( (const int (*)[2]) box , np ); } - if ( 0 < np ) { test_maps( (const int (*)[2]) box , np ); } - break ; - default: - for ( np = 1 ; np <= np_max ; ++np ) { - test_box( (const int (*)[2]) box , np ); - test_maps( (const int (*)[2]) box , np ); - } - break ; - } - return 0 ; -} - -#endif - - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h b/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h deleted file mode 100644 index 71d71f5..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/BoxPartitionIB.h +++ /dev/null @@ -1,88 +0,0 @@ - - -#ifndef BoxPartionIB_h -#define BoxPartionIB_h - -/** \brief Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box. - * - * Use recursive coordinate bisection to partition a box - * into np disjoint sub-boxes. Allocate (via malloc) and - * populate the sub-boxes, mapping the local (x,y,z) to - * a local ordinal, and mappings for the send-recv messages - * to update the ghost cells. 
- * - * Order local ordinates as follows: - * { - * interior , - * boundary , - * remote[ ( my_p + i ) % np ] - * } - * where i = 1..(np-1) - * - * usage: - * - * my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ; - * my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ; - * my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ; - * - * for ( x = -ghost ; x < my_nx + ghost ; ++x ) { - * for ( y = -ghost ; y < my_ny + ghost ; ++y ) { - * for ( z = -ghost ; z < my_nz + ghost ; ++z ) { - * const int x_global = x + pbox[my_p][0][0] ; - * const int y_global = y + pbox[my_p][1][0] ; - * const int z_global = z + pbox[my_p][2][0] ; - * - * const int local_ordinal = - * box_map_local( pbox[my_p], ghost, map_local_id, x, y, z ); - * - * if ( 0 <= local_ordinal ) { - * } - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int recv_processor = ( my_p + i ) % np ; - * const int recv_ordinal_begin = map_recv_pc[i]; - * const int recv_ordinal_end = map_recv_pc[i+1]; - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int send_processor = ( my_p + i ) % np ; - * const int send_map_begin = map_send_pc[i]; - * const int send_map_end = map_send_pc[i+1]; - * for ( j = send_map_begin ; j < send_map_end ; ++j ) { - * send_ordinal = map_send_id[j] ; - * } - * } - */ - - -void box_partition_rcb( - const int np /**< [in] Number of partitions */ , - const int root_box[3][2] /**< [in] Global 3D box to partition */ , - int pbox[][3][2] /**< [out] Partition of global 3D boxes */ ); - -void box_partition_map( - const int np /**< [in] Number of partitions */ , - const int my_p /**< [in] My partition */ , - const int gbox[3][2] /**< [in] Global 3D box */ , - const int pbox[][3][2] /**< [in] Partitions of global 3D box */ , - const int ghost /**< [in] Number of grid points to ghost */ , - - int map_uses_box[3][2] /**< [out] Local box expanded by ghosting */ , - int map_local_id[] /**< [out] Mapping for local points */ , - int * map_count_interior /**< [out] Number of my interior points */ , - int * map_count_owns /**< [out] Number of points I own */ , - int * map_count_uses /**< [out] Number of points I access */ , - int ** map_recv_pc /**< [out] Received prefix spans per process */ , - int ** map_send_pc /**< [out] Send prefix counts per process */ , - int ** map_send_id /**< [out] Send grid points */ ); - -/* \brief Map a global (x,y,z) to a local ordinal. 
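
The box_map_local() declared just below takes five parameters: the ghost-expanded use box produced by box_partition_map(), the local-id map, and the global (x,y,z) coordinates; a negative result means the point is not mapped locally. (The inline usage example above still passes pbox[my_p] and ghost, which does not match this declaration.) A hedged usage sketch against the declared form, assuming the deleted header is available for the prototype:

#include "BoxPartitionIB.h"

/* Visit every grid point of the ghost-expanded use box and translate the
 * global (x,y,z) coordinates into local ordinals. */
static void visit_use_box( const int use_box[3][2] ,
                           const int map_local_id[] )
{
  int x , y , z ;
  for ( z = use_box[2][0] ; z < use_box[2][1] ; ++z ) {
  for ( y = use_box[1][0] ; y < use_box[1][1] ; ++y ) {
  for ( x = use_box[0][0] ; x < use_box[0][1] ; ++x ) {
    const int local = box_map_local( use_box , map_local_id , x , y , z );
    if ( 0 <= local ) {
      /* 'local' indexes this grid point in the locally stored vectors */
    }
  }}}
}
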
*/ -int box_map_local( const int local_uses[3][2] , - const int map_local_id[] , - const int global_x , - const int global_y , - const int global_z ); - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.c b/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.c deleted file mode 100644 index 55f739d..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.c +++ /dev/null @@ -1,311 +0,0 @@ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -/*--------------------------------------------------------------------*/ - -void cgsolve_set_lhs( const struct distributed_crs_matrix * const matrix , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const b ) -{ - const int nRow = matrix->n_local_row ; - const int nVec = matrix->p_recv_pc[ matrix->p_size ] ; - - VECTOR_SCALAR * const p = - (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) ); - - tpi_copy( nRow , x , p ); - - dcrs_apply( matrix , p , b ); - - free( p ); -} - -/*--------------------------------------------------------------------*/ - -/* x += alpha * p ; - * r -= alpha * Ap ; - * return dot( r , r ); - */ -static -double cgsolver_update( const int length , - const VECTOR_SCALAR alpha , - const VECTOR_SCALAR * p , - const VECTOR_SCALAR * Ap , - VECTOR_SCALAR * x , - VECTOR_SCALAR * r ); - -/*--------------------------------------------------------------------*/ - -void cgsolve_blas( const struct distributed_crs_matrix * matrix , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - const VECTOR_SCALAR tolerance , - const int max_iter , - const int print_iter , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const solve_dt ) -{ - const int nRow = matrix->n_local_row ; - const int nVec = matrix->p_recv_pc[ matrix->p_size ] ; - - const VECTOR_SCALAR tol_2 = tolerance * tolerance ; - - VECTOR_SCALAR * const r = - (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const p = - (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const Ap = - (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - - VECTOR_SCALAR rtrans = 0.0 ; - VECTOR_SCALAR beta = 0.0 ; - VECTOR_SCALAR pAp = 0.0 ; - VECTOR_SCALAR alpha ; - double time_begin , time_end ; - - int k ; - - tpi_copy( nRow , b , r ); - tpi_copy( nRow , x , p ); - - /* Ap = matrix * p ; */ - dcrs_apply( matrix , p , Ap ); - - /* r -= Ap ; */ - tpi_axpy( nRow , -1.0 , Ap , r ); - - rtrans = tpi_dot( nRow , r , r ); - - time_begin = TPI_Walltime(); - - for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) { - - /* p = r + beta * p ; */ - tpi_xpby( nRow, r, beta, p ); /* parallel */ - - dcrs_apply( matrix , p , Ap ); - - pAp = tpi_dot( nRow , p , Ap ); - - /* If orthogonal then cannot update */ - alpha = 0 < fabs( pAp ) ? 
rtrans / pAp : 0.0 ; - - /* x += alpha * p ; - * r -= alpha * Ap ; - * return dot( r , r ); - */ - beta = rtrans ; - - tpi_axpy( nRow , alpha , p , x ); - tpi_axpy( nRow , -alpha , Ap , r ); - rtrans = tpi_dot( nRow , r , r ); - beta = rtrans / beta ; - } - - time_end = TPI_Walltime(); - -#ifdef HAVE_MPI - { - double tb = time_begin ; - double te = time_end ; - MPI_Allreduce(&tb, &time_begin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); - MPI_Allreduce(&te, &time_end, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - } -#endif - - *solve_dt += time_end - time_begin ; - - *norm_resid = sqrt( rtrans ); - *iter_count = k ; - - free( Ap ); - free( p ); - free( r ); -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -void cgsolve( const struct distributed_crs_matrix * matrix , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - const int overlap_comm , - const VECTOR_SCALAR tolerance , - const int max_iter , - const int print_iter , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const solve_dt ) -{ - const int nRow = matrix->n_local_row ; - const int nVec = matrix->p_recv_pc[ matrix->p_size ] ; - - const VECTOR_SCALAR tol_2 = tolerance * tolerance ; - - VECTOR_SCALAR * const r = - (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const p = - (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const Ap = - (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - - VECTOR_SCALAR rtrans = 0.0 ; - VECTOR_SCALAR beta = 0.0 ; - VECTOR_SCALAR pAp = 0.0 ; - VECTOR_SCALAR alpha ; - double time_begin , time_end ; - - int k ; - - tpi_copy( nRow , b , r ); - tpi_copy( nRow , x , p ); - - /* gather off-processor components of 'p'. - * Ap = matrix * p ; - * return dot( Ap , p ); - */ - pAp = dcrs_apply_and_dot( matrix , p , Ap , overlap_comm ); - - /* r -= 1.0 * Ap ; - * return dot( r , r ); - */ - alpha = 1.0 ; - rtrans = cgsolver_update( nRow, alpha, NULL, Ap, NULL, r ); /* parallel */ - - time_begin = TPI_Walltime(); - - for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) { - - /* p = r + beta * p ; */ - tpi_xpby( nRow, r, beta, p ); /* parallel */ - - /* gather off-processor components of 'p'. - * Ap = matrix * p ; - * return dot( Ap , p ); - */ - pAp = dcrs_apply_and_dot( matrix , p , Ap , overlap_comm ); /* parallel */ - - /* If orthogonal then cannot update */ - alpha = 0 < fabs( pAp ) ? 
rtrans / pAp : 0.0 ; - - /* x += alpha * p ; - * r -= alpha * Ap ; - * return dot( r , r ); - */ - beta = rtrans ; - rtrans = cgsolver_update( nRow , alpha , p , Ap , x , r ); /* parallel */ - beta = rtrans / beta ; - } - - time_end = TPI_Walltime(); - -#ifdef HAVE_MPI - { - double tb = time_begin ; - double te = time_end ; - MPI_Allreduce(&tb, &time_begin, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); - MPI_Allreduce(&te, &time_end, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - } -#endif - - *solve_dt += time_end - time_begin ; - - *norm_resid = sqrt( rtrans ); - *iter_count = k ; - - free( Ap ); - free( p ); - free( r ); -} - -/*--------------------------------------------------------------------*/ - -struct tpi_work_cgsolve { - const VECTOR_SCALAR * p ; - const VECTOR_SCALAR * Ap ; - VECTOR_SCALAR * x ; - VECTOR_SCALAR * r ; - VECTOR_SCALAR alpha ; - int length ; -}; - -static void tpi_work_dot_join( TPI_Work * work , const void * src ) -{ *((double *) work->reduce ) += *((const double *) src); } - -static void tpi_work_dot_init( TPI_Work * work ) -{ *((double *) work->reduce ) = 0 ; } - -static void tpi_work_update( TPI_Work * work ) -{ - const struct tpi_work_cgsolve * const cg_work = - (const struct tpi_work_cgsolve *) work->info ; - - const int length = cg_work->length ; - const VECTOR_SCALAR alpha = cg_work->alpha ; - const VECTOR_SCALAR * const p = cg_work->p ; - const VECTOR_SCALAR * const Ap = cg_work->Ap ; - VECTOR_SCALAR * const x = cg_work->x ; - VECTOR_SCALAR * const r = cg_work->r ; - - double mag = 0 ; - int iBeg , iEnd , i ; - - tpi_work_span( work , length , & iBeg , & iEnd ); - - if ( x ) { for ( i = iBeg ; i < iEnd ; ++i ) { x[i] += alpha * p[i]; } } - - for ( i = iBeg ; i < iEnd ; ++i ) { - const VECTOR_SCALAR val = ( r[i] -= alpha * Ap[i] ); - mag += val * val ; - } - - *((double*) work->reduce ) = mag ; -} - -double cgsolver_update( const int length , - const VECTOR_SCALAR alpha , - const VECTOR_SCALAR * p , - const VECTOR_SCALAR * Ap , - VECTOR_SCALAR * x , - VECTOR_SCALAR * r ) -{ - struct tpi_work_cgsolve work ; - - double result = 0.0 ; - - work.length = length ; - work.alpha = alpha ; - work.p = p ; - work.Ap = Ap ; - work.x = x ; - work.r = r ; - - TPI_Run_threads_reduce( tpi_work_update , & work , - tpi_work_dot_join , tpi_work_dot_init , - sizeof(result) , & result ); - -#ifdef HAVE_MPI - { - double local = result ; - MPI_Allreduce( & local, & result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); - } -#endif - - return result ; -} - -/*--------------------------------------------------------------------*/ - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.h b/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.h deleted file mode 100644 index f0ee6f6..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/CGSolver.h +++ /dev/null @@ -1,40 +0,0 @@ - -#ifndef CGSolver_h -#define CGSolver_h - -#include -#include - -/*--------------------------------------------------------------------*/ - -void cgsolve_set_lhs( const struct distributed_crs_matrix * matrix , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const b ); - -/* Solve with fused loops */ -void cgsolve( const struct distributed_crs_matrix * matrix , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - const int overlap_comm , - const VECTOR_SCALAR tolerance , - const int max_iter , - const int print_iter , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const solve_dt ); - -/* Solve with blas-like calls */ -void cgsolve_blas( const struct 
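
The heart of the deleted solver is cgsolver_update(), which fuses the two AXPY updates and the residual dot product into one sweep over the vectors (a single pass over p, Ap, x, and r) instead of three separate BLAS-1 calls; TPI splits that sweep across threads and the partial dot products are joined by tpi_work_dot_join. A plain serial version of the fused loop for reference, using double in place of the VECTOR_SCALAR typedef and assuming all vectors are provided:

/* Serial equivalent of the fused update parallelized by the deleted
 * cgsolver_update():  x += alpha * p ;  r -= alpha * Ap ;  return dot(r,r). */
static double fused_cg_update( const int length ,
                               const double alpha ,
                               const double * const p ,
                               const double * const Ap ,
                               double * const x ,
                               double * const r )
{
  double rtrans = 0.0 ;
  int i ;
  for ( i = 0 ; i < length ; ++i ) {
    x[i] += alpha * p[i] ;
    r[i] -= alpha * Ap[i] ;
    rtrans += r[i] * r[i] ;
  }
  return rtrans ;
}
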
distributed_crs_matrix * matrix , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - const VECTOR_SCALAR tolerance , - const int max_iter , - const int print_iter , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const solve_dt ); - -/*--------------------------------------------------------------------*/ - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt b/kokkos/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt deleted file mode 100644 index 0c652cd..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/CMakeLists.txt +++ /dev/null @@ -1,83 +0,0 @@ - -INCLUDE(PackageAddExecutableAndTest) -INCLUDE(PackageLibraryMacros) - -#################### - -SET(HEADERS "") -SET(SOURCES "") - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) - -SET(HEADERS ${HEADERS} - ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h - ) - -INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - -APPEND_SET(HEADERS - BoxPartition.h - CGSolver.h - tpi_vector.h - dcrs_matrix.h - ) - -#################### - - -PACKAGE_ADD_EXECUTABLE( - test_tpi_hhpccg - COMM serial mpi - SOURCES main.c CGSolver.c BoxPartitionIB.c tpi_vector.c dcrs_matrix.c - DEPLIBS pthread m - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_serial_1 - COMM serial - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_serial_2 - COMM serial - ARGS "threads=2" - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_serial_4 - COMM serial - ARGS "threads=4" - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_mpi_1 - COMM mpi - NUM_MPI_PROCS 1 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_mpi_2 - COMM mpi - NUM_MPI_PROCS 2 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hhpccg - NAME test_tpi_hhpccg_mpi_4 - COMM mpi - NUM_MPI_PROCS 4 - DIRECTORY . - ) - - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c b/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c deleted file mode 100644 index d61404f..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.c +++ /dev/null @@ -1,314 +0,0 @@ - -#include -#include - -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -#include - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -#if ! 
defined( HAVE_MPI ) - -static -double comm_sum( double v ) { return v ; } - -#define get_off_process_entries( M , V ) /* */ - -/*--------------------------------------------------------------------*/ -#else /* defined( HAVE_MPI ) */ -/*--------------------------------------------------------------------*/ - -static -double comm_sum( double v ) -{ - double result = 0 ; - MPI_Allreduce( & v , & result , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ); - return result ; -} - -static -void get_off_process_entries( - const struct distributed_crs_matrix * const matrix , - VECTOR_SCALAR * const vec ) -{ - const int np = matrix->p_size ; - const int my_p = matrix->p_rank ; - const int * const recv_pc = matrix->p_recv_pc ; - const int * const send_pc = matrix->p_send_pc ; - const int * const send_id = matrix->p_send_id ; - int i , irecv ; - - for ( irecv = 0 , i = 1 ; i < np ; ++i ) { - if ( recv_pc[i] < recv_pc[i+1] ) ++irecv ; - } - - { - VECTOR_SCALAR * const send_buf = - (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * send_pc[np] ); - - MPI_Request * const recv_request = - (MPI_Request *) malloc( sizeof(MPI_Request) * irecv ); - - MPI_Status * const recv_status = - (MPI_Status *) malloc( sizeof(MPI_Status) * irecv ); - - for ( irecv = 0 , i = 1 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - const int recv_beg = recv_pc[i]; - const int recv_length = recv_pc[i+1] - recv_beg ; - if ( recv_length ) { - MPI_Irecv( vec + recv_beg , - recv_length * sizeof(VECTOR_SCALAR), MPI_BYTE , - ip , 0 , MPI_COMM_WORLD , recv_request + irecv ); - ++irecv ; - } - } - - /* Gather components into send buffer */ - - for ( i = 0 ; i < send_pc[np] ; ++i ) { - send_buf[i] = vec[ send_id[i] ]; - } - - MPI_Barrier( MPI_COMM_WORLD ); - - for ( i = 1 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - const int send_beg = send_pc[i]; - const int send_length = send_pc[i+1] - send_beg ; - if ( send_length ) { /* Send to 'i' */ - MPI_Rsend( send_buf + send_beg , - send_length * sizeof(VECTOR_SCALAR), MPI_BYTE , - ip , 0 , MPI_COMM_WORLD ); - } - } - - MPI_Waitall( irecv , recv_request , recv_status ); - - free( recv_status ); - free( recv_request ); - free( send_buf ); - } -} - -#endif - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -static void dcrs_apply_and_dot_span( - const struct distributed_crs_matrix * const matrix , - const int span_begin , - const int span_end , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const y , - double * const result ) -{ - const int * const A_pc = matrix->A_pc ; - const int * const A_ia = matrix->A_ia ; - const MATRIX_SCALAR * const A_a = matrix->A_a ; - - double dot_x_y = *result ; - - int row = span_begin ; - - for ( ; row < span_end ; ++row ) { - const int pcBeg = A_pc[ row ]; - const int pcEnd = A_pc[ row + 1 ]; - - const int * ia = A_ia + pcBeg ; - const MATRIX_SCALAR * a = A_a + pcBeg ; - const MATRIX_SCALAR * const a_end = A_a + pcEnd ; - - VECTOR_SCALAR y_tmp = 0 ; - for ( ; a != a_end ; ++a , ++ia ) { - y_tmp += *a * x[ *ia ]; - } - dot_x_y += x[ row ] * y_tmp ; - y[ row ] = y_tmp ; - } - - *result = dot_x_y ; -} - -static void dcrs_apply_span( - const struct distributed_crs_matrix * const matrix , - const int span_begin , - const int span_end , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const y ) -{ - const int * const A_pc = matrix->A_pc ; - const int * const A_ia = matrix->A_ia ; - const MATRIX_SCALAR * const A_a = matrix->A_a ; - - int row = 
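
Stripped of the threading and MPI plumbing, the kernel inside the deleted dcrs_apply_and_dot_span() is a plain CRS matrix-vector product over a row span that also accumulates the dot product of the input with the rows it just produced. A standalone serial sketch, with double in place of the MATRIX_SCALAR/VECTOR_SCALAR typedefs:

/* y[row] = A * x for rows in [begin,end), returning the sum of x[row]*y[row].
 * A_pc holds row offsets into the column-index array A_ia and the
 * coefficient array A_a, exactly as in the deleted struct. */
static double crs_apply_rows( const int begin , const int end ,
                              const int * const A_pc ,
                              const int * const A_ia ,
                              const double * const A_a ,
                              const double * const x ,
                              double * const y )
{
  double dot_x_y = 0.0 ;
  int row ;
  for ( row = begin ; row < end ; ++row ) {
    double y_tmp = 0.0 ;
    int k ;
    for ( k = A_pc[row] ; k < A_pc[row+1] ; ++k ) {
      y_tmp += A_a[k] * x[ A_ia[k] ] ;   /* sparse dot of row with x */
    }
    dot_x_y += x[row] * y_tmp ;
    y[row] = y_tmp ;
  }
  return dot_x_y ;
}
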
span_begin ; - - for ( ; row < span_end ; ++row ) { - const int pcBeg = A_pc[ row ]; - const int pcEnd = A_pc[ row + 1 ]; - - const int * ia = A_ia + pcBeg ; - const MATRIX_SCALAR * a = A_a + pcBeg ; - const MATRIX_SCALAR * const a_end = A_a + pcEnd ; - - VECTOR_SCALAR y_tmp = 0 ; - for ( ; a != a_end ; ++a , ++ia ) { - y_tmp += *a * x[ *ia ]; - } - y[ row ] = y_tmp ; - } -} - -static void work_span( const int count , const int rank , - int * jBeg , int * jEnd ) -{ - const int length = *jEnd - *jBeg ; - const int chunk = ( length + count - 1 ) / count ; - const int begin = chunk * rank ; - int end = begin + chunk ; - - if ( length < end ) { end = length ; } - - *jEnd = *jBeg + end ; - *jBeg += begin ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -static void tpi_work_dot_join( TPI_Work * work , const void * src ) -{ *((double *) ( work->reduce) ) += *((const double *) src); } - -static void tpi_work_dot_init( TPI_Work * work ) -{ *((double *) ( work->reduce) ) = 0 ; } - -/*--------------------------------------------------------------------*/ - -struct work_dcrs { - const struct distributed_crs_matrix * matrix ; - const VECTOR_SCALAR * x ; - VECTOR_SCALAR * y ; - int jBeg ; - int jEnd ; -}; - -/*--------------------------------------------------------------------*/ - -static void tpi_work_dcrs_apply_and_dot( TPI_Work * work ) -{ - const struct work_dcrs * const info = (const struct work_dcrs *) work->info ; - - int local_begin = info->jBeg ; - int local_end = info->jEnd ; - - work_span( work->count , work->rank , & local_begin , & local_end ); - - dcrs_apply_and_dot_span( info->matrix , local_begin , local_end , - info->x , info->y , (double *) work->reduce ); -} - -double dcrs_apply_and_dot( - const struct distributed_crs_matrix * matrix , - VECTOR_SCALAR * x , - VECTOR_SCALAR * y , - const int overlap_communication ) -{ - struct work_dcrs info ; - - double result = 0.0 ; - - info.matrix = matrix ; - info.x = x ; - info.y = y ; - - if ( overlap_communication && - matrix->n_internal_row < matrix->n_local_row ) { - - double remote_result = 0 ; - - /* Start the internal matrix-vector multiply */ - /* result += dot( output = A * input , input ); */ - - info.jBeg = 0 ; - info.jEnd = matrix->n_internal_row ; - - /* Divide internal work evenly among worker threads. - * This leave the primary thread completely out of the computation. 
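
Two details of the deleted dcrs_apply_and_dot() are worth noting here. When overlap_communication is requested it starts the interior rows on TPI worker threads with TPI_Start_threads_reduce(), performs the MPI ghost exchange on the calling thread, joins with TPI_Wait(), and only then processes the boundary rows that need ghost values, so the exchange hides behind interior work. Each thread claims its share of a row span through work_span(); a standalone copy of that chunking with a small driver is below.

#include <stdio.h>

/* Same static chunking as the deleted work_span(): thread 'rank' of
 * 'count' takes one contiguous piece of the half-open range [*jBeg,*jEnd). */
static void work_span( const int count , const int rank ,
                       int * jBeg , int * jEnd )
{
  const int length = *jEnd - *jBeg ;
  const int chunk  = ( length + count - 1 ) / count ;  /* rounded up */
  const int begin  = chunk * rank ;
  int end = begin + chunk ;

  if ( length < end ) { end = length ; }

  *jEnd  = *jBeg + end ;
  *jBeg += begin ;
}

int main( void )
{
  int rank ;
  for ( rank = 0 ; rank < 4 ; ++rank ) {
    int beg = 10 , end = 23 ;          /* 13 rows shared by 4 threads */
    work_span( 4 , rank , & beg , & end );
    printf( "thread %d: rows [%d,%d)\n" , rank , beg , end );
  }
  return 0 ;
}
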
- */ - TPI_Start_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , - tpi_work_dot_join , - tpi_work_dot_init , - sizeof(result) , & result ); - - get_off_process_entries( matrix , x ); - - TPI_Wait(); /* Wait for internal result */ - - info.jBeg = matrix->n_internal_row ; - info.jEnd = matrix->n_local_row ; - - TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , - tpi_work_dot_join , - tpi_work_dot_init , - sizeof(remote_result) , & remote_result ); - - result += remote_result ; - } - else { - info.jBeg = 0 ; - info.jEnd = matrix->n_local_row ; - - get_off_process_entries( matrix , x ); - - TPI_Run_threads_reduce( tpi_work_dcrs_apply_and_dot , & info , - tpi_work_dot_join , - tpi_work_dot_init , - sizeof(result) , & result ); - } - - result = comm_sum( result ); - - return result ; -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_dcrs_apply( TPI_Work * work ) -{ - const struct work_dcrs * const info = (const struct work_dcrs *) work->info ; - - int local_begin = info->jBeg ; - int local_end = info->jEnd ; - - work_span( work->count , work->rank , & local_begin , & local_end ); - - dcrs_apply_span( info->matrix , local_begin , local_end , - info->x , info->y ); -} - -void dcrs_apply( - const struct distributed_crs_matrix * matrix , - VECTOR_SCALAR * x , - VECTOR_SCALAR * y ) -{ - struct work_dcrs info ; - - info.matrix = matrix ; - info.x = x ; - info.y = y ; - info.jBeg = 0 ; - info.jEnd = matrix->n_local_row ; - - get_off_process_entries( matrix , x ); - - TPI_Run_threads( tpi_work_dcrs_apply , & info , 0 ); -} - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h b/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h deleted file mode 100644 index 61f2032..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/dcrs_matrix.h +++ /dev/null @@ -1,41 +0,0 @@ - -#ifndef dcrs_matrix_h -#define dcrs_matrix_h - -#include - -struct distributed_crs_matrix { - /* Global parallel */ - int p_size ; - int p_rank ; - int * p_recv_pc ; /* [np+1], span of received off-processor elements */ - int * p_send_pc ; /* [np+1], span of sent off-processor elements */ - int * p_send_id ; /* [send_pc[np]], indices of sent elements */ - - /* Local and local parallel */ - int n_local_column ; /* Number of local columns */ - int n_local_row ; /* Number of local rows */ - int n_internal_row ; /* Number of local rows with internal columns */ - int * A_pc ; /* Offsets into A_ia array for column indices */ - int * A_ia ; - MATRIX_SCALAR * A_a ; -}; - -/* 1) communicate off-processor portions of input. - * 2) apply: output = matrix * input ; - * 3) return: dot( output , input ); - */ -double dcrs_apply_and_dot( const struct distributed_crs_matrix * matrix , - VECTOR_SCALAR * input , - VECTOR_SCALAR * output , - const int overlap_communication ); - -/* 1) communicate off-processor portions of input. 
- * 2) apply: output = matrix * input ; - */ -void dcrs_apply( const struct distributed_crs_matrix * matrix , - VECTOR_SCALAR * input , - VECTOR_SCALAR * output ); - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/main.c b/kokkos/basic/optional/ThreadPool/test/hhpccg/main.c deleted file mode 100644 index 57bb80a..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/main.c +++ /dev/null @@ -1,422 +0,0 @@ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -/*--------------------------------------------------------------------*/ -static -void hpccg_alloc_and_fill( const int np , - const int my_p , - const int gbox[][2] , - const int ghost , - struct distributed_crs_matrix * const matrix ); - -/*--------------------------------------------------------------------*/ - -int main( int argc , char ** argv ) -{ - const int ghost = 1 ; - const int max_cube = 20 ; - int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; - - FILE * print_file = stdout ; - int print_iter = 500 ; - int max_iter = 50 ; - int overlap_comm = 0 ; - - float tolerance = 0.0 ; /* Force max iterations */ - - int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } }; - int nt = 0 ; - int trials = 6 ; - int ntest ; - int np = 1; - int my_p = 0 ; - -#ifdef HAVE_MPI - MPI_Init( & argc , & argv ); - MPI_Comm_size( MPI_COMM_WORLD , & np ); - MPI_Comm_rank( MPI_COMM_WORLD , & my_p ); -#endif - - if ( ! my_p ) { - const char arg_threads[] = "threads=" ; - const char arg_cube[] = "cube=" ; - const char arg_box[] = "box=" ; - const char arg_max[] = "max_iter=" ; - const char arg_trials[] = "trials=" ; - const char arg_print[] = "print_iter=" ; - const char arg_file[] = "print_file=" ; - const char arg_comm[] = "overlap_comm=" ; - const char arg_tolerance[] = "tolerance=" ; - int i ; - for ( i = 1 ; i < argc ; ++i ) { - if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) { - sscanf(argv[i]+strlen(arg_threads),"%d",&nt); - } - else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) { - sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d", - & gbox[0][1] , & gbox[1][1] , & gbox[2][1] ); - } - else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) { - sscanf(argv[i]+strlen(arg_cube), - "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d", - ncube+0, ncube+1, ncube+2, ncube+3, ncube+4, - ncube+5, ncube+6, ncube+7, ncube+8, ncube+9, - ncube+10, ncube+11, ncube+12, ncube+13, ncube+14, - ncube+15, ncube+16, ncube+17, ncube+18, ncube+19); - } - else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) { - sscanf(argv[i]+strlen(arg_max),"%d",&max_iter); - } - else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) { - sscanf(argv[i]+strlen(arg_trials),"%d",&trials); - } - else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) { - sscanf(argv[i]+strlen(arg_print),"%d",&print_iter); - } - else if ( ! strncmp(argv[i],arg_comm,strlen(arg_comm)) ) { - sscanf(argv[i]+strlen(arg_print),"%d",&overlap_comm); - } - else if ( ! strncmp(argv[i],arg_tolerance,strlen(arg_tolerance)) ) { - sscanf(argv[i]+strlen(arg_print),"%f",&tolerance); - } - else if ( ! 
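/* Options are parsed on rank 0 only, as "name=value" words matched by prefix
 * (strncmp) and read with sscanf, then broadcast to the other ranks below.
 * A typical invocation might look like this (the binary name is illustrative
 * and depends on the build):
 *
 *   mpirun -np 4 ./test_tpi_hhpccg threads=4 box=32x32x32 trials=6 overlap_comm=1
 *
 * Note that the overlap_comm= and tolerance= branches above advance the value
 * pointer by strlen(arg_print) ("print_iter=", 11 characters) rather than by
 * their own prefix lengths, so those two values are scanned from the wrong
 * offset.
 */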
strncmp(argv[i],arg_file,strlen(arg_file)) ) { - char buffer[256] ; - sscanf(argv[i]+strlen(arg_file),"%s",buffer); - print_file = fopen(buffer,"a"); - } - } - } - -#ifdef HAVE_MPI - { - MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & overlap_comm , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & tolerance , 1 , MPI_FLOAT , 0 , MPI_COMM_WORLD ); - } -#endif - - if ( nt ) { - TPI_Init( nt ); - TPI_Block(); - TPI_Unblock(); - } - - if ( ! my_p ) { - fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"FUSED-AVG\", \"FUSED-MAX\", \"BLAS-AVG\", \"BLAS-MAX\", \"FUSED\", \"BLAS\" , \"Iter\"\n"); - fprintf(print_file,"\"COUNT\", \"COUNT\" , \"COUNT\" , \"COUNT\" , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"error\", \"error\" , \"COUNT\"\n"); - } - - for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) { - struct distributed_crs_matrix matrix ; - - if ( ncube[ntest] ) { - gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ; - } - - hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &matrix); - - { - const int nRow = matrix.n_local_row ; - - double solve_dt[2] = { 0 , 0 }; - double solve_blas_dt[2] = { 0 , 0 }; - VECTOR_SCALAR norm_resid = 0.0 ; - VECTOR_SCALAR norm_resid_blas = 0.0 ; - int iter_count = 0 ; - int iter_count_blas = 0 ; - int k ; - - VECTOR_SCALAR * const b = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow ); - VECTOR_SCALAR * const x = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow ); - VECTOR_SCALAR * const x_blas = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow ); - VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * nRow ); - - { - const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ; - int i ; - for ( i = 0 ; i < nRow ; ++i ) xexact[i] = value ; - } - - for ( k = 0 ; k < trials ; ++k ) { - double dt = 0 ; - int i ; - - for ( i = 0 ; i < nRow ; ++i ) { x_blas[i] = 0.0 ; } - - cgsolve_set_lhs( & matrix , xexact , b ); - - cgsolve_blas( & matrix, b, x_blas, - tolerance , max_iter , print_iter , - & iter_count_blas, & norm_resid_blas, & dt ); - - solve_blas_dt[0] += dt ; - if ( ! k || dt < solve_blas_dt[1] ) { solve_blas_dt[1] = dt ; } - } - - for ( k = 0 ; k < trials ; ++k ) { - double dt = 0 ; - int i ; - - for ( i = 0 ; i < nRow ; ++i ) { x[i] = 0.0 ; } - - cgsolve_set_lhs( & matrix , xexact , b ); - - cgsolve( & matrix, b, x, overlap_comm, - tolerance , max_iter , print_iter , - & iter_count, & norm_resid, & dt ); - - solve_dt[0] += dt ; - if ( ! 
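/* Each configuration is solved 'trials' times: solve_dt[0] accumulates the
 * total solve time and solve_dt[1] keeps the fastest trial.  The FUSED-AVG /
 * FUSED-MAX (and BLAS-AVG / BLAS-MAX) columns printed later take the
 * per-iteration flop count (2*nnz for the sparse apply plus 2 flops per row
 * for each of the three AXPBYs and two dot products), multiply it by the
 * iteration count, and divide by the mean and minimum trial times respectively.
 */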
k || dt < solve_dt[1] ) { solve_dt[1] = dt ; } - } - - { - int nnzGlobal = matrix.A_pc[ nRow ]; - double error[3] = { 0 , 0 , 0 }; - - for ( k = 0 ; k < nRow ; ++k ) { - error[0] += xexact[k] * xexact[k] ; - error[1] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] ); - error[2] += ( x_blas[k] - xexact[k] ) * ( x_blas[k] - xexact[k] ); - } - -#ifdef HAVE_MPI - { - double error_global[3] = { 0.0 , 0.0 , 0.0 }; - int nnz = nnzGlobal ; - - MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM , - MPI_COMM_WORLD ); - - MPI_Allreduce( error , error_global , 3 , MPI_DOUBLE , MPI_SUM , - MPI_COMM_WORLD ); - - error[0] = error_global[0]; - error[1] = error_global[1]; - error[2] = error_global[2]; - } -#endif - - error[0] = sqrt( error[0] ); - error[1] = sqrt( error[1] ); - error[2] = sqrt( error[2] ); - - if ( ! my_p ) { - const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) * - ( gbox[1][1] - gbox[1][0] ) * - ( gbox[2][1] - gbox[2][0] ); - - const double dt_mean_fuse_step = 1.0e6 * solve_dt[0] / (double) trials ; - const double dt_mean_blas_step = 1.0e6 * solve_blas_dt[0] / (double) trials ; - const double dt_min_fuse_step = 1.0e6 * solve_dt[1] ; - const double dt_min_blas_step = 1.0e6 * solve_blas_dt[1] ; - - const double Mflop_step = 2 * nnzGlobal - + 3 * 2 * nRowGlobal - + 2 * 2 * nRowGlobal ; - - const double Mflop_mean_fuse = Mflop_step * iter_count / dt_mean_fuse_step ; - const double Mflop_mean_blas = Mflop_step * iter_count_blas / dt_mean_blas_step ; - - const double Mflop_max_fuse = Mflop_step * iter_count / dt_min_fuse_step ; - const double Mflop_max_blas = Mflop_step * iter_count_blas / dt_min_blas_step ; - - fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %10g , %10g , %10g , %d\n", - np , nt , nRowGlobal , nnzGlobal , - Mflop_mean_fuse , Mflop_max_fuse , - Mflop_mean_blas , Mflop_max_blas , - error[1] / error[0] , error[2] / error[0] , iter_count ); - fflush(print_file); - } - } - - free( xexact ); - free( x_blas ); - free( x ); - free( b ); - } - free( matrix.A_a ); - free( matrix.A_ia ); - free( matrix.A_pc ); - free( matrix.p_recv_pc ); - free( matrix.p_send_pc ); - free( matrix.p_send_id ); - } - - if ( nt ) { TPI_Finalize(); } - -#ifdef HAVE_MPI - MPI_Finalize(); -#endif - - return 0 ; -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -static -void hpccg_alloc_and_fill( const int np , - const int my_p , - const int gbox[][2] , - const int ghost , - struct distributed_crs_matrix * const matrix ) -{ - int (* const pbox)[3][2] = (int (*)[3][2]) malloc( sizeof(int)*np*3*2 ); - - const int (* const my_box)[2] = (const int (*)[2]) pbox[my_p] ; - - int my_uses_box[3][2] ; - int * map_local_ord = NULL; - - matrix->n_local_row = 0 ; - matrix->n_internal_row = 0 ; - matrix->A_pc = NULL ; - matrix->A_ia = NULL ; - matrix->A_a = NULL ; - - matrix->p_size = np ; - matrix->p_rank = my_p ; - matrix->p_recv_pc = NULL ; - matrix->p_send_pc = NULL ; - matrix->p_send_id = NULL ; - - /* Partition the global box */ - box_partition_rcb( np , gbox , pbox ); - - /* Upper bound */ - map_local_ord = (int *) malloc( sizeof(int) * - ( 2 * ghost + my_box[0][1]- my_box[0][0] ) * - ( 2 * ghost + my_box[1][1]- my_box[1][0] ) * - ( 2 * ghost + my_box[2][1]- my_box[2][0] ) ); - - /* Generate local layout with ghosting. 
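 * The resulting local ordering places owned rows first: rows whose columns are
 * all locally owned (the ones the overlapped solver multiplies before
 * communication finishes) occupy [0, n_internal_row), the remaining owned
 * boundary rows follow up to n_local_row, and off-processor ghost cells are
 * appended, grouped by owning rank, up to n_local_column, so the p_recv_pc
 * spans index directly into the ghost tail of a length n_local_column vector.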
*/ - box_partition_map( np, my_p, gbox, - (const int (* const)[3][2]) pbox, - ghost, - my_uses_box , map_local_ord , - & matrix->n_internal_row , - & matrix->n_local_row , - & matrix->n_local_column , - & matrix->p_recv_pc , - & matrix->p_send_pc , - & matrix->p_send_id ); - - { - const int nrow = matrix->n_local_row ; - int * const pc = (int *) malloc( sizeof(int) * ( nrow + 1 ) ); - int * ia = NULL ; - MATRIX_SCALAR * a = NULL ; - - int ix , iy , iz ; - int sx , sy , sz ; - - /* Number of non zeros in each matrix row, - * then prefix the array for offsets. - */ - pc[0] = 0 ; - - for ( iz = my_box[2][0] ; iz < my_box[2][1] ; ++iz ) { - for ( iy = my_box[1][0] ; iy < my_box[1][1] ; ++iy ) { - for ( ix = my_box[0][0] ; ix < my_box[0][1] ; ++ix ) { - const int irow = box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, ix, iy, iz ); - int count = 1 ; /* Count the diagonal */ - - /* Count the off-diagonal terms to follow */ - for ( sz = -1 ; sz <= 1 ; ++sz ) { - for ( sy = -1 ; sy <= 1 ; ++sy ) { - for ( sx = -1 ; sx <= 1 ; ++sx ) { - const int g_ix = ix + sx ; - const int g_iy = iy + sy ; - const int g_iz = iz + sz ; - - if ( my_uses_box[0][0] <= g_ix && g_ix < my_uses_box[0][1] && - my_uses_box[1][0] <= g_iy && g_iy < my_uses_box[1][1] && - my_uses_box[2][0] <= g_iz && g_iz < my_uses_box[2][1] && - ! ( sz == 0 && sy == 0 && sx == 0 ) ) { - /* This column is within global bounds and is not a diagonal */ - ++count ; - } - } - } - } - pc[ irow + 1 ] = count ; - } - } - } - - for ( ix = 0 ; ix < nrow ; ++ix ) { pc[ix+1] += pc[ix] ; } - - ia = (int *) malloc( sizeof(int) * pc[ nrow ] ); - a = (MATRIX_SCALAR *) malloc( sizeof(MATRIX_SCALAR) * pc[ nrow ] ); - - for ( iz = my_box[2][0] ; iz < my_box[2][1] ; ++iz ) { - for ( iy = my_box[1][0] ; iy < my_box[1][1] ; ++iy ) { - for ( ix = my_box[0][0] ; ix < my_box[0][1] ; ++ix ) { - const int irow = box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, ix, iy, iz ); - int ipc = pc[ irow ]; - - /* Diagonal term first */ - ia[ ipc ] = irow ; - a[ ipc ] = 27.0f ; - ++ipc ; - - /* Off-diagonal terms to follow */ - for ( sz = -1 ; sz <= 1 ; ++sz ) { - for ( sy = -1 ; sy <= 1 ; ++sy ) { - for ( sx = -1 ; sx <= 1 ; ++sx ) { - const int g_ix = ix + sx ; - const int g_iy = iy + sy ; - const int g_iz = iz + sz ; - - if ( my_uses_box[0][0] <= g_ix && g_ix < my_uses_box[0][1] && - my_uses_box[1][0] <= g_iy && g_iy < my_uses_box[1][1] && - my_uses_box[2][0] <= g_iz && g_iz < my_uses_box[2][1] && - ! 
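/* The assembled operator is the standard 27-point stencil on the structured
 * grid: 27.0 on the diagonal and -1.0 for each stencil neighbor (at most 26)
 * that passes the bounds check above.  Every row therefore has at most 26 unit
 * off-diagonal entries against a diagonal of 27, so the matrix is symmetric
 * and strictly diagonally dominant with a positive diagonal; by Gershgorin its
 * eigenvalues are at least 1, the system is positive definite, and CG applies.
 */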
( sz == 0 && sy == 0 && sx == 0 ) ) { - /* Column is within global bounds and is not a diagonal */ - /* 'icol' is mapped for communication */ - - const int icol = - box_map_local( (const int (*const)[2]) my_uses_box, map_local_ord, g_ix, g_iy, g_iz ); - - if ( icol < 0 ) { abort(); } - - ia[ ipc ] = icol ; - a[ ipc ] = -1.0f ; - ++ipc ; - } - } - } - } - if ( ipc != pc[ irow + 1 ] ) { abort(); } - } - } - } - - matrix->A_pc = pc ; - matrix->A_ia = ia ; - matrix->A_a = a ; - } - - free( map_local_ord ); - free( pbox ); -} - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c b/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c deleted file mode 100644 index e5cc365..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.c +++ /dev/null @@ -1,277 +0,0 @@ -#include -#include - -#include -#include -#include - -#if defined( HAVE_MPI ) -#include -#endif - -/*--------------------------------------------------------------------*/ - -struct tpi_work_vector { - VECTOR_SCALAR alpha ; - VECTOR_SCALAR beta ; - const VECTOR_SCALAR * x ; - const VECTOR_SCALAR * y ; - VECTOR_SCALAR * w ; - int n ; -}; - -void tpi_work_span( TPI_Work * const work , const int n , - int * const iBeg , int * const iEnd ) -{ - const int chunk = ( n + work->count - 1 ) / work->count ; - const int i_end = chunk + ( *iBeg = chunk * work->rank ); - - *iEnd = n < i_end ? n : i_end ; -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_fill( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR alpha = h->alpha ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = alpha ; } -} - -void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.w = x ; - tmp.n = n ; - TPI_Run_threads( tpi_work_fill , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_scale( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR beta = h->beta ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] *= beta ; } -} - -void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.w = x ; - tmp.n = n ; - TPI_Run_threads( tpi_work_scale , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_copy( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = x[i] ; } -} - -void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - TPI_Run_threads( tpi_work_copy , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_axpby( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct 
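/* Every kernel in this file derives its own contiguous slice from the thread
 * rank: tpi_work_span() uses chunk = ceil(n / work->count) and hands rank r
 * the half-open range [r*chunk, min(n, (r+1)*chunk)).  For example, n = 10
 * with 4 threads gives chunk = 3 and the slices [0,3), [3,6), [6,9), [9,10).
 */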
tpi_work_vector *) work->info ; - - const VECTOR_SCALAR alpha = h->alpha ; - const VECTOR_SCALAR beta = h->beta ; - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = alpha * x[i] + beta * w[i] ; } -} - -void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR beta , VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.beta = beta ; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - - TPI_Run_threads( tpi_work_axpby , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_axpy( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR alpha = h->alpha ; - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] += alpha * x[i] ; } -} - -void tpi_axpy( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - - TPI_Run_threads( tpi_work_axpy , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_xpby( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR beta = h->beta ; - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = x[i] + beta * w[i] ; } -} - -void tpi_xpby( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR beta , - VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.beta = beta ; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - - TPI_Run_threads( tpi_work_xpby , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_dot_partial( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR * const x = h->x ; - const VECTOR_SCALAR * const y = h->y ; - double * const s = (double *) work->reduce ; - double tmp = *s ; - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { tmp += x[i] * y[i] ; } - - *s = tmp ; -} - -static void tpi_work_dot_partial_self( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR * const x = h->x ; - double * const s = (double *) work->reduce ; - double tmp = *s ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { const VECTOR_SCALAR d = x[i] ; tmp += d * d ; } - - *s = tmp ; -} - -static void tpi_work_dot_join( TPI_Work * work , const void * src ) -{ - *((double *) ( work->reduce) ) += *((const double *) src); -} - -static void tpi_work_dot_init( TPI_Work * work ) -{ - *((double *) ( work->reduce) ) = 0 ; -} - -double tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - double result = 0.0 ; - tmp.x = x ; - tmp.y = y ; - tmp.n = n ; - if ( x != y ) { - 
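/* TPI_Run_threads_reduce() gives every thread a private reduction buffer of
 * sizeof(result) bytes, visible as work->reduce: the init callback zeroes it,
 * the work callback accumulates its slice into it, and the join callback folds
 * the per-thread buffers into the caller's 'result'.  The MPI_Allreduce below
 * then combines the per-process sums.  Note that this version accumulates in
 * double even though VECTOR_SCALAR is float, which limits round-off.  A
 * minimal illustrative use of the same hooks (not part of the original source):
 *
 *   static void sum_work( TPI_Work * w )
 *   {
 *     const int n = *(const int *) w->info ;
 *     int i , end ;
 *     tpi_work_span( w , n , & i , & end );
 *     for ( ; i < end ; ++i ) { *(double *) w->reduce += 1.0 + i ; }
 *   }
 *   static void sum_join( TPI_Work * w , const void * src )
 *     { *(double *) w->reduce += *(const double *) src ; }
 *   static void sum_init( TPI_Work * w ) { *(double *) w->reduce = 0 ; }
 *
 *   double total = 0 ; int n = 1000 ;
 *   TPI_Run_threads_reduce( sum_work , & n , sum_join , sum_init ,
 *                           sizeof(total) , & total );
 *   after which total == 500500 == n*(n+1)/2.
 */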
TPI_Run_threads_reduce( tpi_work_dot_partial , & tmp , - tpi_work_dot_join , tpi_work_dot_init , - sizeof(result) , & result ); - } - else { - TPI_Run_threads_reduce( tpi_work_dot_partial_self , & tmp , - tpi_work_dot_join , tpi_work_dot_init , - sizeof(result) , & result ); - } -#if defined HAVE_MPI - { - double tmp = result ; - MPI_Allreduce( & tmp , & result , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ); - } -#endif - return result ; -} - -/*--------------------------------------------------------------------*/ - diff --git a/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h b/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h deleted file mode 100644 index fba628f..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hhpccg/tpi_vector.h +++ /dev/null @@ -1,30 +0,0 @@ - -#ifndef tpi_vector_h -#define tpi_vector_h - -#define VECTOR_SCALAR float -#define MATRIX_SCALAR float - -void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x ); - -void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x ); - -void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y ); - -void tpi_xpby( int n , const VECTOR_SCALAR * x , - VECTOR_SCALAR beta , VECTOR_SCALAR * y ); - -void tpi_axpy( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR * y ); - -void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR beta , VECTOR_SCALAR * y ); - -double tpi_dot( int n , const VECTOR_SCALAR * x , - const VECTOR_SCALAR * y ); - -void tpi_work_span( TPI_Work * const work , const int n , - int * const iBeg , int * const iEnd ); - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.c b/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.c deleted file mode 100644 index ef860ae..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.c +++ /dev/null @@ -1,487 +0,0 @@ - -#include -#include - -#include - -/*--------------------------------------------------------------------*/ - -static int box_map_local_entry( const int box[][2] , - const int ghost , - int local_x , - int local_y , - int local_z ) -{ - const int nx = 2 * ghost + box[0][1] - box[0][0] ; - const int ny = 2 * ghost + box[1][1] - box[1][0] ; - const int nz = 2 * ghost + box[2][1] - box[2][0] ; - int result = -1 ; - - local_x += ghost ; - local_y += ghost ; - local_z += ghost ; - - if ( 0 <= local_x && local_x < nx && - 0 <= local_y && local_y < ny && - 0 <= local_z && local_z < nz ) { - - result = local_z * ny * nx + local_y * nx + local_x ; - } - return result ; -} - -int box_map_local( const int box_local[][2] , - const int ghost , - const int box_local_map[] , - const int local_x , - const int local_y , - const int local_z ) -{ - int result = box_map_local_entry(box_local,ghost,local_x,local_y,local_z); - - if ( 0 <= result ) { - result = box_local_map[ result ]; - } - - return result ; -} - -/*--------------------------------------------------------------------*/ -/* Recursively split a box into into (up-ip) sub-boxes */ - -static -void box_partition( int ip , int up , int axis , - const int box[3][2] , - int p_box[][3][2] ) -{ - const int np = up - ip ; - if ( 1 == np ) { - p_box[ip][0][0] = box[0][0] ; p_box[ip][0][1] = box[0][1] ; - p_box[ip][1][0] = box[1][0] ; p_box[ip][1][1] = box[1][1] ; - p_box[ip][2][0] = box[2][0] ; p_box[ip][2][1] = box[2][1] ; - } - else { - const int n = box[ axis ][1] - box[ axis ][0] ; - const int np_low = np / 2 ; /* Rounded down */ - const int np_upp = np - np_low ; - - const int 
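/* Recursive coordinate bisection: the processor range [ip,up) is split into
 * np_low = np/2 and np_upp = np - np_low, the box extent n along the current
 * axis is divided in the same proportion (n_upp rounded from n*np_upp/np), and
 * each half recurses with next_axis = (axis+2)%3, so the cuts cycle z, y, x
 * starting from the axis=2 call in box_partition_rcb().  For example, np = 5
 * over an extent of n = 20 gives np_low = 2, np_upp = 3, n_upp = 12, n_low = 8:
 * two ranks share 8 planes along that axis and three ranks share 12.
 */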
n_upp = (int) (((double) n) * ( ((double)np_upp) / ((double)np))); - const int n_low = n - n_upp ; - const int next_axis = ( axis + 2 ) % 3 ; - - if ( np_low ) { /* P = [ip,ip+np_low) */ - int dbox[3][2] ; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ; - - dbox[ axis ][1] = dbox[ axis ][0] + n_low ; - - box_partition( ip, ip + np_low, next_axis, - (const int (*)[2]) dbox, p_box ); - } - - if ( np_upp ) { /* P = [ip+np_low,ip+np_low+np_upp) */ - int dbox[3][2] ; - dbox[0][0] = box[0][0] ; dbox[0][1] = box[0][1] ; - dbox[1][0] = box[1][0] ; dbox[1][1] = box[1][1] ; - dbox[2][0] = box[2][0] ; dbox[2][1] = box[2][1] ; - - ip += np_low ; - dbox[ axis ][0] += n_low ; - dbox[ axis ][1] = dbox[ axis ][0] + n_upp ; - - box_partition( ip, ip + np_upp, next_axis, - (const int (*)[2]) dbox, p_box ); - } - } -} - -/*--------------------------------------------------------------------*/ - -static int box_disjoint( const int a[3][2] , const int b[3][2] ) -{ - return a[0][1] <= b[0][0] || b[0][1] <= a[0][0] || - a[1][1] <= b[1][0] || b[1][1] <= a[1][0] || - a[2][1] <= b[2][0] || b[2][1] <= a[2][0] ; -} - -static void resize_int( int ** a , int * allocLen , int newLen ) -{ - int k = 32; - while ( k < newLen ) { k <<= 1 ; } - if ( NULL == *a ) - { *a = malloc( sizeof(int)*(*allocLen = k) ); } - else if ( *allocLen < k ) - { *a = realloc(*a , sizeof(int)*(*allocLen = k)); } -} - -static void box_partition_maps( - const int np , - const int my_p , - const int pbox[][3][2] , - const int ghost , - int ** map_local_id , - int ** map_recv_pc , - int ** map_send_pc , - int ** map_send_id ) -{ - const int (*my_box)[2] = pbox[my_p] ; - - const int my_ix = my_box[0][0] ; - const int my_iy = my_box[1][0] ; - const int my_iz = my_box[2][0] ; - const int my_nx = my_box[0][1] - my_box[0][0] ; - const int my_ny = my_box[1][1] - my_box[1][0] ; - const int my_nz = my_box[2][1] - my_box[2][0] ; - - const int my_use_nx = 2 * ghost + my_nx ; - const int my_use_ny = 2 * ghost + my_ny ; - const int my_use_nz = 2 * ghost + my_nz ; - - const int id_length = my_use_nx * my_use_ny * my_use_nz ; - - int * local_id = (int *) malloc( id_length * sizeof(int) ); - int * recv_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - int * send_pc = (int *) malloc( ( np + 1 ) * sizeof(int) ); - - int * send_id = NULL ; - int send_id_size = 0 ; - - int iLocal , iSend ; - int i ; - - int my_use_box[3][2] ; - - my_use_box[0][0] = my_box[0][0] - ghost ; - my_use_box[0][1] = my_box[0][1] + ghost ; - my_use_box[1][0] = my_box[1][0] - ghost ; - my_use_box[1][1] = my_box[1][1] + ghost ; - my_use_box[2][0] = my_box[2][0] - ghost ; - my_use_box[2][1] = my_box[2][1] + ghost ; - - for ( i = 0 ; i < id_length ; ++i ) { local_id[i] = -1 ; } - - iSend = 0 ; - iLocal = 0 ; - - /* The vector space is partitioned by processors */ - - for ( i = 0 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - recv_pc[i] = iLocal ; - send_pc[i] = iSend ; - - if ( ! 
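/* my_use_box is the owned box grown by 'ghost' in every direction.  For each
 * rank ip whose owned box overlaps it (starting with ip == my_p, so my own
 * cells are numbered first), the loops below assign increasing local ordinals
 * to the used cells owned by ip; recv_pc[i] therefore delimits the block of
 * ghost ordinals received from the i-th neighbor.  Symmetrically, every owned
 * cell that falls inside ip's grown box has its local ordinal appended to
 * send_id, with send_pc[i] marking the per-neighbor spans used when packing
 * the send buffer.
 */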
box_disjoint( (const int (*)[2]) my_use_box , pbox[ip] ) ) { - const int p_ix = pbox[ip][0][0] ; - const int p_iy = pbox[ip][1][0] ; - const int p_iz = pbox[ip][2][0] ; - const int p_ex = pbox[ip][0][1] ; - const int p_ey = pbox[ip][1][1] ; - const int p_ez = pbox[ip][2][1] ; - - int local_x , local_y , local_z ; - - /* Run the span of global cells that my processor uses */ - - for ( local_z = -ghost ; local_z < my_nz + ghost ; ++local_z ) { - for ( local_y = -ghost ; local_y < my_ny + ghost ; ++local_y ) { - for ( local_x = -ghost ; local_x < my_nx + ghost ; ++local_x ) { - - const int global_z = local_z + my_iz ; - const int global_y = local_y + my_iy ; - const int global_x = local_x + my_ix ; - - const int entry = - box_map_local_entry(my_box,ghost,local_x,local_y,local_z); - - if ( entry < 0 ) { abort(); } - - if ( p_iz <= global_z && global_z < p_ez && - p_iy <= global_y && global_y < p_ey && - p_ix <= global_x && global_x < p_ex ) { - - /* This ordinal is owned by processor 'ip' */ - - local_id[ entry ] = iLocal++ ; - -#if defined(DEBUG_PRINT) -if ( my_p != ip ) { - fprintf(stdout," (%d,%d,%d) : P%d recv at local %d from P%d\n", - global_x,global_y,global_z,my_p,local_id[entry],ip); - fflush(stdout); -} -#endif - } - - /* If in my ownership and used by the other processor */ - if ( my_p != ip && - /* In my ownership: */ - ( 0 <= local_z && local_z < my_nz && - 0 <= local_y && local_y < my_ny && - 0 <= local_x && local_x < my_nx ) && - /* In other processors usage: */ - ( p_iz - ghost <= global_z && global_z < p_ez + ghost && - p_iy - ghost <= global_y && global_y < p_ey + ghost && - p_ix - ghost <= global_x && global_x < p_ex + ghost ) ) { - - resize_int( & send_id , & send_id_size , (iSend + 1) ); - send_id[ iSend ] = local_id[ entry ] ; - ++iSend ; - -#if defined(DEBUG_PRINT) -{ - fprintf(stdout," (%d,%d,%d) : P%d send at local %d to P%d\n", - global_x,global_y,global_z,my_p,local_id[entry],ip); - fflush(stdout); -} -#endif - } - } - } - } - } - } - recv_pc[np] = iLocal ; - send_pc[np] = iSend ; - - *map_local_id = local_id ; - *map_recv_pc = recv_pc ; - *map_send_pc = send_pc ; - *map_send_id = send_id ; -} - -void box_partition_rcb( const int np , - const int my_p , - const int root_box[][2] , - const int ghost , - int (**pbox)[3][2] , - int ** map_local_id , - int ** map_recv_pc , - int ** map_send_pc , - int ** map_send_id ) -{ - *pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); - - box_partition( 0 , np , 2 , root_box , *pbox ); - - box_partition_maps( np , my_p , (const int (*)[3][2]) *pbox , ghost , - map_local_id , map_recv_pc , - map_send_pc , map_send_id ); -} - -/*--------------------------------------------------------------------*/ - -#ifdef UNIT_TEST - -static int box_contain( const int a[3][2] , const int b[3][2] ) -{ - return a[0][0] <= b[0][0] && b[0][1] <= a[0][1] && - a[1][0] <= b[1][0] && b[1][1] <= a[1][1] && - a[2][0] <= b[2][0] && b[2][1] <= a[2][1] ; -} - -static void box_print( FILE * fp , const int a[][2] ) -{ - fprintf(fp,"{ [ %d , %d ) , [ %d , %d ) , [ %d , %d ) }", - a[0][0] , a[0][1] , - a[1][0] , a[1][1] , - a[2][0] , a[2][1] ); -} - -static void test_box( const int box[3][2] , const int np ) -{ - const int ncell_box = box[0][1] * box[1][1] * box[2][1] ; - int ncell_total = 0 ; - int ncell_min = ncell_box ; - int ncell_max = 0 ; - int (*pbox)[3][2] ; - int i , j ; - - pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); - - box_partition( 0 , np , 2 , box , pbox ); - - for ( i = 0 ; i < np ; ++i ) { - const int ncell = ( pbox[i][0][1] 
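/* The unit test below verifies the partition: every sub-box must lie inside
 * the parent box, all pairs of sub-boxes must be disjoint, and the cell counts
 * must sum to exactly the parent volume; it also reports the min/max cells per
 * rank as a rough load-balance measure.
 */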
- pbox[i][0][0] ) * - ( pbox[i][1][1] - pbox[i][1][0] ) * - ( pbox[i][2][1] - pbox[i][2][0] ); - - if ( ! box_contain( box , (const int (*)[2]) pbox[i] ) ) { - fprintf(stdout," OUT OF BOUNDS pbox[%d/%d] = ",i,np); - box_print(stdout,(const int (*)[2]) pbox[i]); - fprintf(stdout,"\n"); - abort(); - } - - for ( j = i + 1 ; j < np ; ++j ) { - if ( ! box_disjoint( (const int (*)[2]) pbox[i] , - (const int (*)[2]) pbox[j] ) ) { - fprintf(stdout," NOT DISJOINT pbox[%d/%d] = ",i,np); - box_print(stdout, (const int (*)[2]) pbox[i]); - fprintf(stdout,"\n"); - fprintf(stdout," pbox[%d/%d] = ",j,np); - box_print(stdout, (const int (*)[2]) pbox[j]); - fprintf(stdout,"\n"); - abort(); - } - } - ncell_total += ncell ; - - if ( ncell_max < ncell ) { ncell_max = ncell ; } - if ( ncell < ncell_min ) { ncell_min = ncell ; } - } - - if ( ncell_total != ncell_box ) { - fprintf(stdout," WRONG CELL COUNT NP = %d\n",np); - abort(); - } - fprintf(stdout,"NP = %d, total = %d, avg = %d, min = %d, max = %d\n", - np,ncell_box,ncell_box/np,ncell_min,ncell_max); - - free( pbox ); -} - -/*--------------------------------------------------------------------*/ - -static void test_maps( const int root_box[][2] , const int np ) -{ - const int ghost = 1 ; - const int nx_global = root_box[0][1] - root_box[0][0] ; - const int ny_global = root_box[1][1] - root_box[1][0] ; - int ieq , i , j ; - int (*pbox)[3][2] ; - int **local_values ; - int **map_local_id ; - int **map_recv_pc ; - int **map_send_pc ; - int **map_send_id ; - - pbox = (int (*)[3][2]) malloc( sizeof(int) * np * 3 * 2 ); - - box_partition( 0 , np , 2 , root_box , pbox ); - - local_values = (int **) malloc( sizeof(int*) * np ); - map_local_id = (int **) malloc( sizeof(int*) * np ); - map_recv_pc = (int **) malloc( sizeof(int*) * np ); - map_send_pc = (int **) malloc( sizeof(int*) * np ); - map_send_id = (int **) malloc( sizeof(int*) * np ); - - /* Set each local value to the global equation number */ - - for ( ieq = i = 0 ; i < np ; ++i ) { - const int (*mybox)[2] = (const int (*)[2]) pbox[i] ; - const int nx = mybox[0][1] - mybox[0][0] ; - const int ny = mybox[1][1] - mybox[1][0] ; - const int nz = mybox[2][1] - mybox[2][0] ; - int ix , iy , iz ; - - /* Generate the partition maps for this rank */ - box_partition_maps( np , i , (const int (*)[3][2]) pbox , ghost , - & map_local_id[i] , & map_recv_pc[i] , - & map_send_pc[i] , & map_send_id[i] ); - - local_values[i] = (int *) malloc( sizeof(int) * map_recv_pc[i][np] ); - - for ( iz = -ghost ; iz < nz + ghost ; ++iz ) { - for ( iy = -ghost ; iy < ny + ghost ; ++iy ) { - for ( ix = -ghost ; ix < nx + ghost ; ++ix ) { - const int ieq = box_map_local(mybox,ghost,map_local_id[i],ix,iy,iz); - - if ( 0 <= ieq ) { - const int ix_global = ix + mybox[0][0] ; - const int iy_global = iy + mybox[1][0] ; - const int iz_global = iz + mybox[2][0] ; - - if ( root_box[0][0] <= ix_global && ix_global < root_box[0][1] && - root_box[1][0] <= iy_global && iy_global < root_box[1][1] && - root_box[2][0] <= iz_global && iz_global < root_box[2][1] ) { - - local_values[i][ ieq ] = ix_global + - iy_global * nx_global + - iz_global * nx_global * ny_global ; - } - else { - local_values[i][ ieq ] = -1 ; - } - } - } - } - } - } - - /* Pair-wise compare the local values */ - /* i == receiving processor rank */ - /* ip == sending processor rank */ - /* j == receiving processor data entry for message from 'ip' */ - /* jp == sending processor data entry for message to 'i' */ - - for ( i = 0 ; i < np ; ++i ) { - for ( j = 1 ; j < np ; ++j ) { - const 
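/* Pairwise consistency check of the communication maps: for receiving rank i
 * and message slot j, the sender is ip = (i + j) % np, and the sender's slot
 * toward i is jp = (i + np - ip) % np, the inverse of the (rank + slot) % np
 * ordering.  The test requires matching message lengths and that each value
 * landing in the receiver's recv span equals the sender's value at the
 * corresponding send_id ordinal, i.e. that recv_pc / send_pc / send_id
 * describe the same exchange from both ends.
 */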
int ip = ( i + j ) % np ; - const int jp = ( i + np - ip ) % np ; - const int nrecv = map_recv_pc[i] [j+1] - map_recv_pc[i] [j] ; - const int nsend = map_send_pc[ip][jp+1] - map_send_pc[ip][jp] ; - int k ; - if ( nrecv != nsend ) { - fprintf(stderr,"P%d recv %d from P%d\n",i,nrecv,ip); - fprintf(stderr,"P%d send %d to P%d\n",ip,nsend,i); - abort(); - } - for ( k = 0 ; k < nrecv ; ++k ) { - const int irecv = map_recv_pc[i][j] + k ; - const int isend = map_send_pc[ip][jp] + k ; - const int val_irecv = local_values[i][irecv] ; - const int val_isend = local_values[ip][ map_send_id[ip][isend] ] ; - if ( val_irecv != val_isend ) { - fprintf(stderr,"P%d recv[%d] = %d , from P%d\n",i,k,val_irecv,ip); - fprintf(stderr,"P%d send[%d] = %d , to P%d\n",ip,k,val_isend,i); - abort(); - } - } - } - } - - for ( i = 0 ; i < np ; ++i ) { - free( map_local_id[i] ); - free( map_recv_pc[i] ); - free( map_send_pc[i] ); - free( map_send_id[i] ); - free( local_values[i] ); - } - free( map_send_id ); - free( map_send_pc ); - free( map_recv_pc ); - free( map_local_id ); - free( local_values ); - free( pbox ); -} - -/*--------------------------------------------------------------------*/ - -int main( int argc , char * argv[] ) -{ - int np_max = 256 ; - int box[3][2] = { { 0 , 64 } , { 0 , 64 } , { 0 , 64 } }; - int np = 0 ; - - switch( argc ) { - case 3: - sscanf(argv[1],"%d",&np); - sscanf(argv[2],"%dx%dx%d",& box[0][1] , & box[1][1] , & box[2][1] ); - if ( 0 < np ) { test_box( (const int (*)[2]) box , np ); } - if ( 0 < np ) { test_maps( (const int (*)[2]) box , np ); } - break ; - default: - for ( np = 1 ; np <= np_max ; ++np ) { - test_box( (const int (*)[2]) box , np ); - test_maps( (const int (*)[2]) box , np ); - } - break ; - } - return 0 ; -} - -#endif - - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.h b/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.h deleted file mode 100644 index 3dfd839..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/BoxPartition.h +++ /dev/null @@ -1,64 +0,0 @@ - -/** \brief Partition a { [ix,jx) X [iy,jy) X [iz,jz) } box. - * - * Use recursive coordinate bisection to partition a box - * into np disjoint sub-boxes. Allocate (via malloc) and - * populate the sub-boxes, mapping the local (x,y,z) to - * a local ordinal, and mappings for the send-recv messages - * to update the ghost cells. 
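 * On return, pbox[p] holds rank p's owned sub-box, map_local_id maps a ghosted
 * local (x,y,z) to a local ordinal (or -1 outside the used region),
 * map_recv_pc and map_send_pc are per-processor prefix counts, and map_send_id
 * lists the owned ordinals to pack for each neighbor.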
- * - * usage: - * - * my_nx = pbox[my_p][0][1] - pbox[my_p][0][0] ; - * my_ny = pbox[my_p][1][1] - pbox[my_p][1][0] ; - * my_nz = pbox[my_p][2][1] - pbox[my_p][2][0] ; - * - * for ( x = -ghost ; x < my_nx + ghost ; ++x ) { - * for ( y = -ghost ; y < my_ny + ghost ; ++y ) { - * for ( z = -ghost ; z < my_nz + ghost ; ++z ) { - * const int x_global = x + pbox[my_p][0][0] ; - * const int y_global = y + pbox[my_p][1][0] ; - * const int z_global = z + pbox[my_p][2][0] ; - * - * const int local_ordinal = - * box_map_local( pbox[my_p], ghost, map_local_id, x, y, z ); - * - * if ( 0 <= local_ordinal ) { - * } - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int recv_processor = ( my_p + i ) % np ; - * const int recv_ordinal_begin = map_recv_pc[i]; - * const int recv_ordinal_end = map_recv_pc[i+1]; - * } - * - * for ( i = 1 ; i < np ; ++i ) { - * const int send_processor = ( my_p + i ) % np ; - * const int send_map_begin = map_send_pc[i]; - * const int send_map_end = map_send_pc[i+1]; - * for ( j = send_map_begin ; j < send_map_end ; ++j ) { - * send_ordinal = map_send_id[j] ; - * } - * } - */ -void box_partition_rcb( - const int np /**< [in] Number of partitions */ , - const int my_p /**< [in] My partition rank */ , - const int root_box[][2] /**< [in] 3D Box to partition */ , - const int ghost /**< [in] Ghost cell boundary */ , - int (**pbox)[3][2] /**< [out] Partition's 3D boxes */ , - int ** map_local_id /**< [out] Map local cells */ , - int ** map_recv_pc /**< [out] Receive spans per processor */ , - int ** map_send_pc /**< [out] Send prefix counts per processor */ , - int ** map_send_id /**< [out] Send message ordinals */ ); - -/* \brief Map a local (x,y,z) to a local ordinal. - */ -int box_map_local( const int box_local[][2] , - const int ghost , - const int map_local_id[] , - const int local_x , - const int local_y , - const int local_z ); - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.c b/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.c deleted file mode 100644 index 2670bf7..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.c +++ /dev/null @@ -1,248 +0,0 @@ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -/*--------------------------------------------------------------------*/ - -#ifdef HAVE_MPI - -#define TIMER( DT , F ) \ - { double tb , te , tbg , teg , dt ; \ - tb = TPI_Walltime(); \ - F ; \ - te = TPI_Walltime(); \ - MPI_Allreduce(&tb, &tbg, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); \ - MPI_Allreduce(&te, &teg, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); \ - DT[0] += dt = teg - tbg ; \ - DT[1] += dt * dt ; } - -#else - -#define TIMER( DT , F ) \ - { const double tb = TPI_Walltime(); double dt ; \ - F ; \ - DT[0] += dt = TPI_Walltime() - tb ; \ - DT[1] += dt * dt ; } - -#endif - -/*--------------------------------------------------------------------*/ - -static -VECTOR_SCALAR comm_sum( VECTOR_SCALAR v ) -{ -#ifdef HAVE_MPI - VECTOR_SCALAR result = 0 ; - if ( sizeof(VECTOR_SCALAR) == sizeof(double) ) { - MPI_Allreduce( & v , & result , 1 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ); - } - else { - MPI_Allreduce( & v , & result , 1 , MPI_FLOAT , MPI_SUM , MPI_COMM_WORLD ); - } - return result ; -#else - return v ; -#endif -} - -#ifdef HAVE_MPI -static -void comm_rhs_vector( const struct cgsolve_data * const data , - VECTOR_SCALAR * const vec ) -{ - const int np = data->np ; - const int my_p = data->ip ; - const int * const recv_pc = data->recv_pc ; - const int * const 
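/* Halo exchange: nonblocking receives are posted first, landing directly in
 * the ghost tail of 'vec' at the recv_pc spans; the values each neighbor needs
 * are then gathered through send_id into a contiguous send_buf.  The
 * MPI_Barrier guarantees every rank has posted its receives, which is what
 * makes the ready-send (MPI_Rsend) legal, and MPI_Waitall completes the
 * receives before the vector is used.  Messages travel as
 * length*sizeof(VECTOR_SCALAR) bytes of MPI_BYTE, so the same code serves
 * float or double scalars.
 */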
send_pc = data->send_pc ; - const int * const send_id = data->send_id ; - int i , irecv ; - - for ( irecv = 0 , i = 1 ; i < np ; ++i ) { - if ( recv_pc[i] < recv_pc[i+1] ) ++irecv ; - } - -#ifdef DEBUG_PRINT - fflush(stdout); - MPI_Barrier( MPI_COMM_WORLD ); - fflush(stdout); -#endif - - { - VECTOR_SCALAR * const send_buf = - (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * send_pc[np] ); - - MPI_Request * const recv_request = - (MPI_Request *) malloc( sizeof(MPI_Request) * irecv ); - - MPI_Status * const recv_status = - (MPI_Status *) malloc( sizeof(MPI_Status) * irecv ); - - for ( irecv = 0 , i = 1 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - const int recv_beg = recv_pc[i]; - const int recv_length = recv_pc[i+1] - recv_beg ; - if ( recv_length ) { -#ifdef DEBUG_PRINT - fprintf(stdout," comm_rhs_vector P%d Irecv P%d : %d\n", - my_p, ip, recv_length ); - fflush(stdout); -#endif - MPI_Irecv( vec + recv_beg , - recv_length * sizeof(VECTOR_SCALAR), MPI_BYTE , - ip , 0 , MPI_COMM_WORLD , recv_request + irecv ); - ++irecv ; - } - } - - /* Gather components into send buffer */ - - for ( i = 0 ; i < send_pc[np] ; ++i ) { - send_buf[i] = vec[ send_id[i] ]; - } - - MPI_Barrier( MPI_COMM_WORLD ); - - for ( i = 1 ; i < np ; ++i ) { - const int ip = ( i + my_p ) % np ; - const int send_beg = send_pc[i]; - const int send_length = send_pc[i+1] - send_beg ; - if ( send_length ) { /* Send to 'i' */ -#ifdef DEBUG_PRINT - fprintf(stdout," comm_rhs_vector P%d Rsend P%d : %d\n", - my_p, ip, send_length ); - fflush(stdout); -#endif - MPI_Rsend( send_buf + send_beg , - send_length * sizeof(VECTOR_SCALAR), MPI_BYTE , - ip , 0 , MPI_COMM_WORLD ); - } - } - - MPI_Waitall( irecv , recv_request , recv_status ); - - free( recv_status ); - free( recv_request ); - free( send_buf ); - } -} -#else -#define comm_rhs_vector( D , V ) /* */ -#endif - -/*--------------------------------------------------------------------*/ - -void cgsolve_set_lhs( const struct cgsolve_data * const data , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const b ) -{ - const int nRow = data->nRow ; - const int nVec = data->recv_pc[ data->np ] ; - const int * const A_pc = data->A_pc ; - const int * const A_ia = data->A_ia ; - const MATRIX_SCALAR * const A_a = data->A_a ; - - VECTOR_SCALAR * const p = (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) ); - - tpi_copy( nRow , x , p ); - - comm_rhs_vector( data , p ); - - tpi_crs_matrix_apply( nRow, A_pc, A_ia, A_a, p, b ); - - free( p ); -} - -/*--------------------------------------------------------------------*/ - -void cgsolve( const struct cgsolve_data * const data , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const dt_mxv , - double * const dt_axpby , - double * const dt_dot ) -{ - const int nRow = data->nRow ; - const int nVec = data->recv_pc[ data->np ] ; - const int max_iter = data->max_iter ; - const int print_iter = data->print_iter ; - const int * const A_pc = data->A_pc ; - const int * const A_ia = data->A_ia ; - const MATRIX_SCALAR * const A_a = data->A_a ; - const VECTOR_SCALAR tolerance = data->tolerance ; - - const VECTOR_SCALAR tol_2 = tolerance * tolerance ; - - VECTOR_SCALAR * const r = (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const p = (VECTOR_SCALAR *) malloc( nVec * sizeof(VECTOR_SCALAR) ); - VECTOR_SCALAR * const Ap = (VECTOR_SCALAR *) malloc( nRow * sizeof(VECTOR_SCALAR) ); - - VECTOR_SCALAR rtrans = 0.0 ; - - int k ; - - 
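/* The loop below is the textbook conjugate-gradient recurrence; rtrans is
 * |r|^2, so convergence is tested against tolerance^2:
 *
 *   r = b - A*x ; rtrans = dot(r,r)
 *   for k = 0,1,... while k < max_iter and rtrans > tol^2 :
 *     if k > 0 : rtrans_old = rtrans ; rtrans = dot(r,r)
 *                beta = rtrans / rtrans_old          (beta = 0 on first pass)
 *     p  = r + beta*p
 *     Ap = A*p                                       (halo exchange + CRS apply)
 *     alpha = rtrans / dot(p,Ap)
 *     x += alpha*p ;  r -= alpha*Ap
 *
 * Each step is wrapped in TIMER(...), so dt_mxv, dt_axpby, and dt_dot
 * accumulate both the total time and the sum of squared times per operation
 * class.
 */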
tpi_copy( nRow , b , r ); - tpi_copy( nRow , x , p ); - - comm_rhs_vector( data , p ); tpi_crs_matrix_apply( nRow, A_pc, A_ia, A_a, p, Ap ); - - tpi_axpby( nRow , -1.0, Ap, 1.0 , r ); - - /* Include timing dot product for 2 * #iter dot products */ - TIMER( dt_dot , rtrans = comm_sum( tpi_dot( nRow , r , r ) ) ); - - for ( k = 0 ; k < max_iter && tol_2 < rtrans ; ++k ) { - VECTOR_SCALAR alpha ; - VECTOR_SCALAR beta = 0.0 ; - VECTOR_SCALAR pAp = 0.0 ; - - if ( k ) { - const VECTOR_SCALAR oldrtrans = rtrans ; - TIMER( dt_dot , rtrans = comm_sum( tpi_dot( nRow , r , r ) ) ); - beta = rtrans / oldrtrans ; - } - - TIMER( dt_axpby , tpi_axpby( nRow, 1.0, r, beta, p ) ); - - TIMER( dt_mxv , comm_rhs_vector( data , p ); tpi_crs_matrix_apply( nRow, A_pc, A_ia, A_a, p, Ap ) ); - - TIMER( dt_dot , pAp = comm_sum( tpi_dot( nRow , p , Ap ) ) ); - - if ( 0 < fabs( pAp ) ) { - alpha = rtrans / pAp ; - } - else { - alpha = rtrans = 0.0 ; /* Orthogonal, cannot continue */ - } - - if ( ! ( ( k + 1 ) % print_iter ) ) { - fprintf(stdout," cgsolve | r(%d) | = %g\n",k,sqrt(rtrans)); - fflush(stdout); - } - - TIMER( dt_axpby , tpi_axpby( nRow , alpha, p, 1.0, x) ); - TIMER( dt_axpby , tpi_axpby( nRow , -alpha, Ap, 1.0, r) ); - } - - *norm_resid = sqrt( rtrans ); - *iter_count = k ; - - free( Ap ); - free( p ); - free( r ); -} - -/*--------------------------------------------------------------------*/ - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.h b/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.h deleted file mode 100644 index 0660a01..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/CGSolver.h +++ /dev/null @@ -1,32 +0,0 @@ - -#include - -struct cgsolve_data { - int nRow ; - int * A_pc ; - int * A_ia ; - MATRIX_SCALAR * A_a ; - int max_iter ; - int print_iter ; - VECTOR_SCALAR tolerance ; - - int np ; - int ip ; - int * recv_pc ; - int * send_pc ; - int * send_id ; -}; - -void cgsolve_set_lhs( const struct cgsolve_data * data , - const VECTOR_SCALAR * const x , - VECTOR_SCALAR * const b ); - -void cgsolve( const struct cgsolve_data * data , - const VECTOR_SCALAR * const b , - VECTOR_SCALAR * const x , - int * const iter_count , - VECTOR_SCALAR * const norm_resid , - double * const dt_mxv , - double * const dt_axpby , - double * const dt_dot ); - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt b/kokkos/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt deleted file mode 100644 index bfba897..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/CMakeLists.txt +++ /dev/null @@ -1,83 +0,0 @@ - -INCLUDE(PackageAddExecutableAndTest) -INCLUDE(PackageLibraryMacros) - -#################### - -SET(HEADERS "") -SET(SOURCES "") - -INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) - -SET(HEADERS ${HEADERS} - ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h - ) - -INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - -APPEND_SET(HEADERS - BoxPartition.h - CGSolver.h - tpi_vector.h - ) - -#################### - - -PACKAGE_ADD_EXECUTABLE( - test_tpi_hpccg - COMM serial mpi - SOURCES main.c CGSolver.c BoxPartition.c tpi_vector.c - DEPLIBS pthread m - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_serial_1 - COMM serial - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_serial_2 - COMM serial - ARGS "threads=2" - DIRECTORY . - XHOSTTYPE AIX - ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_serial_4 - COMM serial - ARGS "threads=4" - DIRECTORY . 
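# The serial variants above exercise 1, 2, and 4 TPI threads; the MPI variants
# below run the same driver on 1, 2, and 4 ranks.  With no extra ARGS, every
# test uses the driver defaults: a 16^3 global box and 50 CG iterations.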
- ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_mpi_1 - COMM mpi - NUM_MPI_PROCS 1 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_mpi_2 - COMM mpi - NUM_MPI_PROCS 2 - DIRECTORY . - ) - -PACKAGE_ADD_TEST( - test_tpi_hpccg - NAME test_tpi_hpccg_mpi_4 - COMM mpi - NUM_MPI_PROCS 4 - DIRECTORY . - ) - - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/main.c b/kokkos/basic/optional/ThreadPool/test/hpccg/main.c deleted file mode 100644 index 676a02d..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/main.c +++ /dev/null @@ -1,340 +0,0 @@ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -/*--------------------------------------------------------------------*/ - -static -void hpccg_alloc_and_fill( const int np , - const int my_p , - const int gbox[][2] , - const int ghost , - struct cgsolve_data * const data ) -{ - int (*pbox)[3][2] = NULL ; - int * map_local_ord = NULL; - - data->nRow = 0 ; - data->A_pc = NULL ; - data->A_ia = NULL ; - data->A_a = NULL ; - - data->np = np ; - data->ip = my_p ; - data->recv_pc = NULL ; - data->send_pc = NULL ; - data->send_id = NULL ; - - box_partition_rcb( np, my_p, - (const int (*)[2]) gbox, ghost, - & pbox , - & map_local_ord , - & data->recv_pc , - & data->send_pc , - & data->send_id ); - - { - const int (* const my_box)[2] = (const int (*)[2]) pbox[my_p] ; - const int bx = my_box[0][0] ; - const int by = my_box[1][0] ; - const int bz = my_box[2][0] ; - const int nx = my_box[0][1] - bx ; - const int ny = my_box[1][1] - by ; - const int nz = my_box[2][1] - bz ; - const int n = nx * ny * nz ; - const int nnz = 27 * n ; /* Upper bound */ - int * const pc = (int *) malloc( sizeof(int) * ( n + 1 ) ); - int * const ia = (int *) malloc( sizeof(int) * nnz ); - MATRIX_SCALAR * const a = (MATRIX_SCALAR *) malloc( sizeof(MATRIX_SCALAR) * nnz ); - - int irow = 0 ; - int ipc = 0 ; - int ix , iy , iz ; - int sx , sy , sz ; - - for ( iz = 0 ; iz < nz ; ++iz ) { - for ( iy = 0 ; iy < ny ; ++iy ) { - for ( ix = 0 ; ix < nx ; ++ix , ++irow ) { - - if ( irow != box_map_local( my_box, ghost, map_local_ord,ix,iy,iz) ) { - fprintf(stderr,"P%d: irow[%d] != box_map_local(%d,%d,%d) = %d\n", - my_p,irow,ix,iy,iz, - box_map_local( my_box, ghost, map_local_ord, ix, iy, iz) ); - } - - pc[ irow ] = ipc ; /* Beginning of row coefficients */ - /* Diagonal term first */ - ia[ ipc ] = irow ; - a[ ipc ] = 27.0f ; - ++ipc ; - - /* Off-diagonal terms to follow */ - for ( sz = -1 ; sz <= 1 ; ++sz ) { - for ( sy = -1 ; sy <= 1 ; ++sy ) { - for ( sx = -1 ; sx <= 1 ; ++sx ) { - const int dx = ix + sx ; - const int dy = iy + sy ; - const int dz = iz + sz ; - const int global_x = dx + bx ; - const int global_y = dy + by ; - const int global_z = dz + bz ; - - if ( gbox[0][0] <= global_x && global_x < gbox[0][1] && - gbox[1][0] <= global_y && global_y < gbox[1][1] && - gbox[2][0] <= global_z && global_z < gbox[2][1] && - ! 
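/* box_map_local() may return an ordinal at or beyond nRow here: columns owned
 * by neighboring ranks are mapped into the ghost tail [nRow, recv_pc[np]).
 * That is why cgsolve() sizes its work vector p with nVec = recv_pc[np]
 * entries and calls comm_rhs_vector() before every matrix apply.
 */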
( sz == 0 && sy == 0 && sx == 0 ) ) { - /* 'icol' is mapped for communication */ - - const int icol = - box_map_local(my_box,ghost,map_local_ord,dx,dy,dz); - - if ( icol < 0 ) { - fprintf(stderr,"P%d : bad column at local (%d,%d,%d) global(%d,%d,%d)\n", - my_p, dx,dy,dz,global_x,global_y,global_z); - fflush(stderr); - abort(); - } - - ia[ ipc ] = icol ; - a[ ipc ] = -1.0f ; - ++ipc ; - } - } - } - } - } - } - } - - pc[irow] = ipc ; - - data->nRow = irow ; - data->A_pc = pc ; - data->A_ia = ia ; - data->A_a = a ; - } - - free( map_local_ord ); - free( pbox ); -} - -/*--------------------------------------------------------------------*/ - -int main( int argc , char ** argv ) -{ - const int ghost = 1 ; - const int max_cube = 20 ; - int ncube[20] = { 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; - - FILE * print_file = stdout ; - int print_iter = 500 ; - int max_iter = 50 ; - - VECTOR_SCALAR tolerance = 0.0 ; /* Force max iterations */ - - int gbox[3][2] = { { 0 , 16 } , { 0 , 16 } , { 0 , 16 } }; - int nt = 0 ; - int trials = 5 ; - int ntest ; - int np = 1; - int my_p = 0 ; - -#ifdef HAVE_MPI - MPI_Init( & argc , & argv ); - MPI_Comm_size( MPI_COMM_WORLD , & np ); - MPI_Comm_rank( MPI_COMM_WORLD , & my_p ); -#endif - - if ( ! my_p ) { - const char arg_threads[] = "threads=" ; - const char arg_cube[] = "cube=" ; - const char arg_box[] = "box=" ; - const char arg_max[] = "max_iter=" ; - const char arg_trials[] = "trials=" ; - const char arg_print[] = "print_iter=" ; - const char arg_file[] = "print_file=" ; - int i ; - for ( i = 1 ; i < argc ; ++i ) { - if ( ! strncmp(argv[i],arg_threads,strlen(arg_threads)) ) { - sscanf(argv[i]+strlen(arg_threads),"%d",&nt); - } - else if ( ! strncmp(argv[i],arg_box,strlen(arg_box)) ) { - sscanf(argv[i]+strlen(arg_box),"%d%*[x]%d%*[x]%d", - & gbox[0][1] , & gbox[1][1] , & gbox[2][1] ); - } - else if ( ! strncmp(argv[i],arg_cube,strlen(arg_cube)) ) { - sscanf(argv[i]+strlen(arg_cube), - "%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d", - ncube+0, ncube+1, ncube+2, ncube+3, ncube+4, - ncube+5, ncube+6, ncube+7, ncube+8, ncube+9, - ncube+10, ncube+11, ncube+12, ncube+13, ncube+14, - ncube+15, ncube+16, ncube+17, ncube+18, ncube+19); - } - else if ( ! strncmp(argv[i],arg_max,strlen(arg_max)) ) { - sscanf(argv[i]+strlen(arg_max),"%d",&max_iter); - } - else if ( ! strncmp(argv[i],arg_trials,strlen(arg_trials)) ) { - sscanf(argv[i]+strlen(arg_trials),"%d",&trials); - } - else if ( ! strncmp(argv[i],arg_print,strlen(arg_print)) ) { - sscanf(argv[i]+strlen(arg_print),"%d",&print_iter); - } - else if ( ! strncmp(argv[i],arg_file,strlen(arg_file)) ) { - char buffer[256] ; - sscanf(argv[i]+strlen(arg_file),"%s",buffer); - print_file = fopen(buffer,"a"); - } - } - } - -#ifdef HAVE_MPI - { - MPI_Bcast( & nt , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & gbox[0][0] , 6 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( ncube , max_cube , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & max_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & print_iter , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - MPI_Bcast( & trials , 1 , MPI_INT , 0 , MPI_COMM_WORLD ); - } -#endif - - if ( nt ) { - TPI_Init( nt ); - TPI_Block(); - TPI_Unblock(); - } - - if ( ! 
my_p ) { - fprintf(print_file,"\"PROC\" , \"THREAD\" , \"EQUATION\" , \"NON-ZERO\" , \"MXV\" , \"AXPBY\" , \"DOT\" , \"Xerror\" , \"Iter\"\n"); - fprintf(print_file,"\"COUNT\" , \"COUNT\" , \"COUNT\" , \"COUNT\" , \"Mflops\" , \"Mflops\" , \"Mflops\" , \"L2norm\" , \"COUNT\"\n"); - } - - for ( ntest = 0 ; ! ntest || ( ntest < max_cube && ncube[ntest] ) ; ++ntest ) { - struct cgsolve_data cgdata ; - - if ( ncube[ntest] ) { - gbox[0][1] = gbox[1][1] = gbox[2][1] = ncube[ntest] ; - } - - hpccg_alloc_and_fill( np, my_p, (const int (*)[2]) gbox, ghost, &cgdata); - - cgdata.max_iter = max_iter ; - cgdata.print_iter = print_iter ; - cgdata.tolerance = tolerance ; - - { - double dt_mxv[2] = { 0 , 0 }; - double dt_axpby[2] = { 0 , 0 }; - double dt_dot[2] = { 0 , 0 }; - VECTOR_SCALAR norm_resid = 0.0 ; - int iter_count = 0 ; - int iter_total = 0 ; - int k ; - - VECTOR_SCALAR * const b = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); - VECTOR_SCALAR * const x = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); - VECTOR_SCALAR * const xexact = (VECTOR_SCALAR *) malloc( sizeof(VECTOR_SCALAR) * cgdata.nRow ); - - { - const VECTOR_SCALAR value = 1.0 /* 1.0 / 3.0 */ ; - int i ; - for ( i = 0 ; i < cgdata.nRow ; ++i ) xexact[i] = value ; - } - - for ( k = 0 ; k < trials ; ++k ) { - int i ; - - for ( i = 0 ; i < cgdata.nRow ; ++i ) { x[i] = 0.0 ; } - - cgsolve_set_lhs( & cgdata , xexact , b ); - - cgsolve( & cgdata, b, x, - & iter_count, & norm_resid, - dt_mxv , dt_axpby , dt_dot ); - - iter_total += iter_count ; - } - - { - int nnzGlobal = cgdata.A_pc[ cgdata.nRow ]; - double error[2] = { 0 , 0 }; - - for ( k = 0 ; k < cgdata.nRow ; ++k ) { - error[0] += ( x[k] - xexact[k] ) * ( x[k] - xexact[k] ); - error[1] += xexact[k] * xexact[k] ; - } - -#ifdef HAVE_MPI - { - double error_global[2] = { 0.0 , 0.0 }; - int nnz = nnzGlobal ; - - MPI_Allreduce( & nnz , & nnzGlobal , 1 , MPI_INT , MPI_SUM , - MPI_COMM_WORLD ); - - MPI_Allreduce( error , error_global , 2 , MPI_DOUBLE , MPI_SUM , - MPI_COMM_WORLD ); - - error[0] = error_global[0]; - error[1] = error_global[1]; - } -#endif - - error[0] = sqrt( error[0] ); - error[1] = sqrt( error[1] ); - - if ( ! 
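/* Reported rates: per CG iteration the sparse apply costs about 2*nnz flops,
 * each of the three AXPBYs 3 flops per row, and each of the two dot products
 * 2 flops per row; the MXV/AXPBY/DOT columns divide those totals, summed over
 * iter_total iterations across all trials, by the corresponding accumulated
 * timers.  Xerror is the relative error ||x - xexact|| / ||xexact|| reduced
 * over all ranks.
 */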
my_p ) { - const int nRowGlobal = ( gbox[0][1] - gbox[0][0] ) * - ( gbox[1][1] - gbox[1][0] ) * - ( gbox[2][1] - gbox[2][0] ); - - const double mflop_mxv = - 1.0e-6 * ( iter_total ) * 2 * nnzGlobal / dt_mxv[0] ; - - const double mflop_axpby = - 1.0e-6 * ( iter_total * 3 ) * 3 * nRowGlobal / dt_axpby[0] ; - - const double mflop_dot = - 1.0e-6 * ( iter_total * 2 ) * 2 * nRowGlobal / dt_dot[0] ; - - fprintf(print_file,"%8d , %8d , %8d , %8d , %10g , %10g , %10g , %g , %d\n", - np , nt , nRowGlobal , nnzGlobal , - mflop_mxv , mflop_axpby , mflop_dot , - error[0] / error[1] , iter_total ); - fflush(print_file); - } - } - - free( xexact ); - free( x ); - free( b ); - } - free( cgdata.A_a ); - free( cgdata.A_ia ); - free( cgdata.A_pc ); - free( cgdata.recv_pc ); - free( cgdata.send_pc ); - free( cgdata.send_id ); - } - - if ( nt ) { TPI_Finalize(); } - -#ifdef HAVE_MPI - MPI_Finalize(); -#endif - - return 0 ; -} - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.c b/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.c deleted file mode 100644 index 1b8a26c..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.c +++ /dev/null @@ -1,273 +0,0 @@ -#include - -#include - -#include -#include - -/*--------------------------------------------------------------------*/ - -struct tpi_work_vector { - VECTOR_SCALAR alpha ; - VECTOR_SCALAR beta ; - const VECTOR_SCALAR * x ; - const VECTOR_SCALAR * y ; - VECTOR_SCALAR * w ; - int n ; -}; - -static void tpi_work_span( TPI_Work * const work , const int n , - int * const iBeg , int * const iEnd ) -{ - const int chunk = ( n + work->count - 1 ) / work->count ; - const int i_end = chunk + ( *iBeg = chunk * work->rank ); - - *iEnd = n < i_end ? n : i_end ; -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_fill( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR alpha = h->alpha ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = alpha ; } -} - -void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.w = x ; - tmp.n = n ; - TPI_Run_threads( tpi_work_fill , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_scale( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR beta = h->beta ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] *= beta ; } -} - -void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.w = x ; - tmp.n = n ; - TPI_Run_threads( tpi_work_scale , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_copy( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = x[i] ; } -} - -void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y ) -{ - struct 
tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - TPI_Run_threads( tpi_work_copy , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_axpby( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - const VECTOR_SCALAR alpha = h->alpha ; - const VECTOR_SCALAR beta = h->beta ; - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR * const w = h->w ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { w[i] = alpha * x[i] + beta * w[i] ; } -} - -void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR beta , VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - tmp.alpha = alpha ; - tmp.beta = beta ; - tmp.x = x ; - tmp.w = y ; - tmp.n = n ; - - TPI_Run_threads( tpi_work_axpby , & tmp , 0 ); -} - -/*--------------------------------------------------------------------*/ - -static void tpi_work_dot_partial( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - VECTOR_SCALAR * const s = (VECTOR_SCALAR *) work->reduce ; - const VECTOR_SCALAR * const x = h->x ; - const VECTOR_SCALAR * const y = h->y ; - VECTOR_SCALAR tmp = *s ; - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { tmp += x[i] * y[i] ; } - - *s = tmp ; -} - -static void tpi_work_dot_partial_self( TPI_Work * work ) -{ - const struct tpi_work_vector * const h = - (struct tpi_work_vector *) work->info ; - - VECTOR_SCALAR * const s = (VECTOR_SCALAR *) work->reduce ; - const VECTOR_SCALAR * const x = h->x ; - VECTOR_SCALAR tmp = *s ; - - int i , iEnd ; - - tpi_work_span( work , h->n , & i , & iEnd ); - - for ( ; i < iEnd ; ++i ) { const VECTOR_SCALAR d = x[i] ; tmp += d * d ; } - - *s = tmp ; -} - -static void tpi_work_dot_join( TPI_Work * work , const void * src ) -{ - *((VECTOR_SCALAR *) ( work->reduce) ) += *((const VECTOR_SCALAR *) src); -} - -static void tpi_work_dot_init( TPI_Work * work ) -{ - *((VECTOR_SCALAR *) ( work->reduce) ) = 0 ; -} - -VECTOR_SCALAR tpi_dot( int n , const VECTOR_SCALAR * x , const VECTOR_SCALAR * y ) -{ - struct tpi_work_vector tmp = { 0.0 , 0.0 , NULL , NULL , NULL , 0 }; - VECTOR_SCALAR result = 0.0 ; - tmp.x = x ; - tmp.y = y ; - tmp.n = n ; - if ( x != y ) { - TPI_Run_threads_reduce( tpi_work_dot_partial , & tmp , - tpi_work_dot_join , tpi_work_dot_init , - sizeof(result) , & result ); - } - else { - TPI_Run_threads_reduce( tpi_work_dot_partial_self , & tmp , - tpi_work_dot_join , tpi_work_dot_init , - sizeof(result) , & result ); - } - return result ; -} - -/*--------------------------------------------------------------------*/ - -struct tpi_crs_matrix { - int nRow ; - const int * A_pc ; - const int * A_ia ; - const MATRIX_SCALAR * A_a ; - const VECTOR_SCALAR * x ; - VECTOR_SCALAR * y ; -}; - -static void tpi_work_crs_matrix_apply( TPI_Work * work ) -{ - const struct tpi_crs_matrix * const h = - (struct tpi_crs_matrix *) work->info ; - - const int * const A_pc = h->A_pc ; - const int * const A_ia = h->A_ia ; - const MATRIX_SCALAR * const A_a = h->A_a ; - const VECTOR_SCALAR * const x = h->x ; - - const int nRow = h->nRow ; - const int chunk = ( nRow + work->count - 1 ) / work->count ; - - int row = chunk * work->rank ; - int rowEnd = chunk + row ; - - if ( nRow < rowEnd ) { rowEnd = nRow ; } - - { - const int * 
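/* Row-parallel CSR matrix-vector product: each thread owns the contiguous
 * row block [ chunk * rank , rowEnd ) given by the same ceil(nRow/count)
 * chunking as tpi_work_span, and for each of its rows r computes
 * y[r] = sum over k in [ A_pc[r] , A_pc[r+1] ) of A_a[k] * x[ A_ia[k] ].
 * Threads write disjoint ranges of y, so no locking is required.
 */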
const pc_end = A_pc + rowEnd ; - const int * pc = A_pc + row ; - VECTOR_SCALAR * y = h->y + row ; - - for ( ; pc != pc_end ; ++pc , ++y ) { - const int * ia = A_ia + *pc ; - const MATRIX_SCALAR * a = A_a + *pc ; - const MATRIX_SCALAR * const a_end = A_a + pc[1] ; - VECTOR_SCALAR tmp = 0 ; - for ( ; a != a_end ; ++a , ++ia ) { - tmp += *a * x[ *ia ]; - } - *y = tmp ; - } - } -} - -/*--------------------------------------------------------------------*/ - -void tpi_crs_matrix_apply( - const int nRow , - const int * A_pc , - const int * A_ia , - const MATRIX_SCALAR * A_a , - const VECTOR_SCALAR * x , - VECTOR_SCALAR * y ) -{ - struct tpi_crs_matrix h = { 0 , NULL , NULL , NULL , NULL , NULL }; - h.nRow = nRow ; - h.A_pc = A_pc ; - h.A_ia = A_ia ; - h.A_a = A_a ; - h.x = x ; - h.y = y ; - TPI_Run_threads( tpi_work_crs_matrix_apply , & h , 0 ); -} - - diff --git a/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.h b/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.h deleted file mode 100644 index bcd514e..0000000 --- a/kokkos/basic/optional/ThreadPool/test/hpccg/tpi_vector.h +++ /dev/null @@ -1,31 +0,0 @@ - -#include - -#ifndef tpi_vector_h -#define tpi_vector_h - -#define VECTOR_SCALAR float -#define MATRIX_SCALAR float - -void tpi_fill( int n , VECTOR_SCALAR alpha , VECTOR_SCALAR * x ); - -void tpi_scale( int n , const VECTOR_SCALAR alpha , VECTOR_SCALAR * x ); - -void tpi_copy( int n , const VECTOR_SCALAR * x , VECTOR_SCALAR * y ); - -void tpi_axpby( int n , VECTOR_SCALAR alpha , const VECTOR_SCALAR * x , - VECTOR_SCALAR beta , VECTOR_SCALAR * y ); - -VECTOR_SCALAR tpi_dot( int n , const VECTOR_SCALAR * x , - const VECTOR_SCALAR * y ); - -void tpi_crs_matrix_apply( - const int nRow , - const int * A_pc , - const int * A_ia , - const MATRIX_SCALAR * A_a , - const VECTOR_SCALAR * x , - VECTOR_SCALAR * y ); - -#endif - diff --git a/kokkos/basic/optional/ThreadPool/test/test_c_dnax.c b/kokkos/basic/optional/ThreadPool/test/test_c_dnax.c deleted file mode 100644 index 4f6ab9b..0000000 --- a/kokkos/basic/optional/ThreadPool/test/test_c_dnax.c +++ /dev/null @@ -1,414 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. 
Carter Edwards - * - * Multi-array 'axpby' - */ - -#include -#include -#include -#include -#include - -#if defined( HAVE_MPI ) -#include -#endif - -int test_c_tpi_dnax( int , int ); - -int main( int argc , char ** argv ) -{ - int num_thread[] = { 1 , 2 , 4 , 6 , 8 , 12 , 16 }; - int num_test = sizeof(num_thread) / sizeof(int); - - const int ntrial = 1 < argc ? atoi( argv[1] ) : 2 ; - int i ; - -#if defined( HAVE_MPI ) - int rank ; - - MPI_Init( & argc , & argv ); - MPI_Comm_rank( MPI_COMM_WORLD , & rank ); - if ( 0 == rank ) { -#endif - - - fprintf( stdout , "\"TESTING Multiarray 'axpby' with: %s\"\n" , - TPI_Version() ); - - for ( i = 0 ; i < num_test ; ++i ) { - test_c_tpi_dnax( num_thread[i] , ntrial ); - } - -#if defined( HAVE_MPI ) - } - MPI_Finalize(); -#endif - - return 0 ; -} - -/*------------------------------------------------------------------------*/ - -typedef double SCALAR ; - -/*------------------------------------------------------------------------*/ - -struct TestTPI_DNAX { - SCALAR * coef ; - SCALAR * array ; - unsigned number ; - unsigned length ; - unsigned stride ; - unsigned chunk_length ; -}; - -/*------------------------------------------------------------------------*/ - -static -void test_dnax_column( const unsigned num_array , - const unsigned stride , - const unsigned length , - const SCALAR * const coef , - SCALAR * const array ) -{ - unsigned i = 0 ; - for ( ; i < length ; ++i ) { - SCALAR * const a = array + i ; - SCALAR tmp = 0 ; - unsigned j = 0 ; - for ( ; j < num_array ; ++j ) { tmp += coef[j] * a[ j * stride ] ; } - a[0] = tmp ; - } -} - -static -void test_dnax_row( const unsigned num_array , - const unsigned stride , - const unsigned length , - const SCALAR * const coef , - SCALAR * const array ) -{ - unsigned i = 0 ; - for ( ; i < length ; ++i ) { - SCALAR * const a = array + i * stride ; - SCALAR tmp = 0 ; - unsigned j = 0 ; - for ( ; j < num_array ; ++j ) { tmp += coef[j] * a[j] ; } - a[0] = tmp ; - } -} - -/*------------------------------------------------------------------------*/ -/* The multi-array storage is flat: every array is fully contiguous. - * Work corresponds to a span of the array. - */ -static -void test_dnax_flat_work( TPI_Work * work ) -{ - const struct TestTPI_DNAX * const info = - (struct TestTPI_DNAX *) work->info ; - - const unsigned which_chunk = work->rank ; - const unsigned beg_local = info->chunk_length * which_chunk ; - const unsigned max_local = info->length - beg_local ; - const unsigned len_local = info->chunk_length < max_local ? - info->chunk_length : max_local ; - - test_dnax_column( info->number , - info->stride , - len_local , - info->coef , - info->array + beg_local ); - - return ; -} - -/* The multi-array storage is chunked: each array has a contiguous chunk; - * but chunk-subarrays are contiguously grouped. - */ -static -void test_dnax_column_work( TPI_Work * work ) -{ - const struct TestTPI_DNAX * const info = - (struct TestTPI_DNAX *) work->info ; - - const unsigned which_chunk = work->rank ; - const unsigned beg_local = info->chunk_length * which_chunk ; - const unsigned max_local = info->length - beg_local ; - const unsigned len_local = info->chunk_length < max_local ? 
- info->chunk_length : max_local ; - - const unsigned chunk_size = info->chunk_length * info->number ; - - test_dnax_column( info->number , - info->chunk_length , - len_local , - info->coef , - info->array + which_chunk * chunk_size ); - - return ; -} - -static -void test_dnax_row_work( TPI_Work * work ) -{ - const struct TestTPI_DNAX * const info = - (struct TestTPI_DNAX *) work->info ; - - const unsigned which_chunk = work->rank ; - const unsigned beg_local = info->chunk_length * which_chunk ; - const unsigned max_local = info->length - beg_local ; - const unsigned len_local = info->chunk_length < max_local ? - info->chunk_length : max_local ; - - const unsigned chunk_size = info->chunk_length * info->number ; - - test_dnax_row( info->number , - info->number , - len_local , - info->coef , - info->array + which_chunk * chunk_size ); - - return ; -} - -/*------------------------------------------------------------------------*/ -/* Process identical block of allocated memory as a - * as a flat array, chunked-column, and chunked-row. - */ - -static -void test_tpi_dnax_driver( const int nthread , - const unsigned Mflop_target , - const unsigned num_trials , - const unsigned num_test , - const unsigned num_test_array[] , - const unsigned length_array , - const unsigned length_chunk ) -{ - const unsigned max_array = num_test_array[ num_test - 1 ]; - - const unsigned num_chunk = - ( length_array + length_chunk - 1 ) / length_chunk ; - - const unsigned stride_array = num_chunk * length_chunk ; - const unsigned size_alloc = max_array * stride_array ; - - SCALAR * const coef = (SCALAR *) malloc( max_array * sizeof(SCALAR) ); - SCALAR * const array = (SCALAR *) malloc( size_alloc * sizeof(SCALAR) ); - - struct TestTPI_DNAX data = { NULL , NULL , 0 , 0 , 0 , 0 }; - - unsigned i_test , i , j ; - - data.coef = coef ; - - if ( NULL == array ) { - fprintf(stderr,"allocation failure for %u\n",size_alloc); - abort(); - } - - for ( i = 0 ; i < max_array ; ++i ) { coef[i] = 0 ; } - - printf("\n\"test_tpi_dnax[%d]( length_array = %u , stride_array = %u )\"\n", - nthread , length_array , stride_array ); - printf("\"NUMBER OF THREADS\" , %d\n" , nthread ); - printf("\"NUMBER OF CHUNKS\" , %u\n" , num_chunk ); - printf("\"NUMBER OF TRIALS\" , %u \n", num_trials ); - - printf("\"TEST\" , \"#ARRAY\" \"DT-MEAN\" , \"DT-STDDEV\" , \"MFLOP-MEAN\" , \"MFLOP-STDDEV\"\n"); - - /*----------------------------------------------------------------------*/ - - for ( i_test = 0 ; i_test < num_test ; ++i_test ) { - const unsigned num_array = num_test_array[ i_test ]; - const unsigned num_sets = max_array / num_array ; - - const double mflop_cycle = - ((double)( 2 * num_array * length_array )) / 1.0e6 ; - - const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle ); - - double dt_sum = 0 ; - double dt_sum_2 = 0 ; - - data.length = length_array ; - data.number = num_array ; - data.stride = stride_array ; - data.chunk_length = length_chunk ; - - for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; } - - for ( j = 0 ; j < num_trials ; ++j ) { - - double dt_tmp = TPI_Walltime(); - for ( i = 0 ; i < ncycle ; ++i ) { - data.array = array + stride_array * num_array * ( i % num_sets ); - TPI_Run( & test_dnax_flat_work , & data , num_chunk , 0 ); - } - dt_tmp = TPI_Walltime() - dt_tmp ; - - dt_sum += dt_tmp ; - dt_sum_2 += dt_tmp * dt_tmp ; - } - - { - const double dt_mean = dt_sum / num_trials ; - const double dt_sdev = sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) ); - const double 
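/* Trial statistics: dt_mean is the sample mean of the per-trial times and
 * dt_sdev the sample standard deviation, obtained from the running sums as
 * sqrt( ( n * sum(dt^2) - sum(dt)^2 ) / ( n * ( n - 1 ) ) ) with
 * n = num_trials.  The Mflop rate below divides one trial's work
 * ( mflop_cycle * ncycle ) by dt_mean, and its spread is reported as
 * mflop_mean * dt_sdev / ( dt_mean + dt_sdev ).
 */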
mflop_mean = mflop_cycle * ncycle / dt_mean ; - const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev ); - - printf("\"FLAT ARRAY\" , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n", - num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev ); - } - } - - /*----------------------------------------------------------------------*/ - - for ( i_test = 0 ; i_test < num_test ; ++i_test ) { - - const unsigned num_array = num_test_array[ i_test ]; - const unsigned num_sets = max_array / num_array ; - - const double mflop_cycle = - ((double)( 2 * num_array * length_array )) / 1.0e6 ; - - const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle ); - - double dt_sum = 0 ; - double dt_sum_2 = 0 ; - - data.length = length_array ; - data.number = num_array ; - data.stride = stride_array ; - data.chunk_length = length_chunk ; - - for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; } - - for ( j = 0 ; j < num_trials ; ++j ) { - - double dt_tmp = TPI_Walltime(); - for ( i = 0 ; i < ncycle ; ++i ) { - data.array = array + stride_array * num_array * ( i % num_sets ); - TPI_Run( & test_dnax_column_work , & data , num_chunk , 0 ); - } - dt_tmp = TPI_Walltime() - dt_tmp ; - - dt_sum += dt_tmp ; - dt_sum_2 += dt_tmp * dt_tmp ; - } - - { - const double dt_mean = dt_sum / num_trials ; - const double dt_sdev = sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) ); - const double mflop_mean = mflop_cycle * ncycle / dt_mean ; - const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev ); - - printf("\"CHUNK COLUMN\" , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n", - num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev ); - } - } - - /*----------------------------------------------------------------------*/ - - for ( i_test = 0 ; i_test < num_test ; ++i_test ) { - - const unsigned num_array = num_test_array[ i_test ]; - const unsigned num_sets = max_array / num_array ; - - const double mflop_cycle = - ((double)( 2 * num_array * length_array )) / 1.0e6 ; - - const unsigned ncycle = 1 + (unsigned)( Mflop_target / mflop_cycle ); - - double dt_sum = 0 ; - double dt_sum_2 = 0 ; - - data.length = length_array ; - data.number = num_array ; - data.stride = stride_array ; - data.chunk_length = length_chunk ; - - for ( i = 0 ; i < size_alloc ; ++i ) { array[i] = 0 ; } - - for ( j = 0 ; j < num_trials ; ++j ) { - - double dt_tmp = TPI_Walltime(); - - for ( i = 0 ; i < ncycle ; ++i ) { - data.array = array + stride_array * num_array * ( i % num_sets ); - TPI_Run( & test_dnax_row_work , & data , num_chunk , 0 ); - } - dt_tmp = TPI_Walltime() - dt_tmp ; - - dt_sum += dt_tmp ; - dt_sum_2 += dt_tmp * dt_tmp ; - } - - { - const double dt_mean = dt_sum / num_trials ; - const double dt_sdev = sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / ( num_trials * ( num_trials - 1 ) ) ); - const double mflop_mean = mflop_cycle * ncycle / dt_mean ; - const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev ); - - printf("\"CHUNK ROW\" , %6u , %9.5g , %9.3g , %9.5g , %9.3g\n", - num_array, dt_mean, dt_sdev, mflop_mean, mflop_sdev ); - } - } - - /*----------------------------------------------------------------------*/ - - free( array ); - free( coef ); -} - -/*------------------------------------------------------------------------*/ - -int test_c_tpi_dnax( int nthread , int ntrial ) -{ - const unsigned Mflop_target = 10 ; - const unsigned num_array[6] = { 2 , 5 , 10 , 20 , 50 , 100 }; - const unsigned ntest = sizeof(num_array) / sizeof(unsigned); - - if ( ntrial <= 0 ) { ntrial = 
7 ; } - - TPI_Init( nthread ); - - test_tpi_dnax_driver( nthread , - Mflop_target * nthread , - ntrial /* number trials */ , - ntest /* number of tests */ , - num_array /* number of arrays for each test */ , - 1e6 /* array computation length */ , - 1000 /* chunk length */ ); - - TPI_Finalize(); - - return 0 ; -} - - - diff --git a/kokkos/basic/optional/ThreadPool/test/test_mpi_sum.c b/kokkos/basic/optional/ThreadPool/test/test_mpi_sum.c deleted file mode 100644 index 51d6b9e..0000000 --- a/kokkos/basic/optional/ThreadPool/test/test_mpi_sum.c +++ /dev/null @@ -1,764 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -#include -#include -#include -#include -#include - -int rand_r( unsigned int * ); - -/*--------------------------------------------------------------------*/ - -#if defined(HAVE_MPI) - -#include - -typedef MPI_Comm COMM ; - -#else - -typedef int COMM ; - -#endif - -static int comm_size( COMM ); -static int comm_rank( COMM ); -static void comm_reduce_dmax( COMM , double * ); -static void comm_reduce_dsum( COMM , double * ); -static void comm_reduce_d4_sum( COMM , double * ); - -/*--------------------------------------------------------------------*/ - -static void my_span( const unsigned count , const unsigned rank , - const unsigned size , - unsigned * begin , unsigned * length ) -{ - const unsigned int max = ( size + count - 1 ) / count ; - const unsigned int end = size - max * ( count - ( rank + 1 ) ); - if ( rank ) { - *begin = end - max ; - *length = max ; - } - else { - *begin = 0 ; - *length = end ; - } -} - -/*--------------------------------------------------------------------*/ - -#define LESS_ABS( X , Y ) ( ( X < 0 ? -X : X ) < ( Y < 0 ? -Y : Y ) ) - -static void d2_add_d( double v[] , const double a ) -{ - const int AltV = a < 0 ? ( - a < ( v[0] < 0 ? - v[0] : v[0] ) ) - : ( a < ( v[0] < 0 ? - v[0] : v[0] ) ); - - const double VpA = v[0] + a ; - - v[1] += AltV ? 
( a - ( VpA - v[0] ) ) : ( v[0] - ( VpA - a ) ); - v[0] = VpA + v[1] ; - v[1] += VpA - v[0] ; -} - -void d4_dot( double v[] , unsigned n , const double * x , const double * y ) -{ - double * pos = v ; - double * neg = v + 2 ; - const double * const x_end = x + n ; - for ( ; x < x_end ; ++x , ++y ) { - const double a = *x * *y ; - if ( a < 0 ) { d2_add_d( neg , a ); } - else { d2_add_d( pos , a ); } - } -} - -double ddot( unsigned n , const double * x , const double * y ) -{ - double val = 0 ; - const double * const x_end = x + n ; - for ( ; x < x_end ; ++x , ++y ) { val += *x * *y ; } - return val ; -} - -/*--------------------------------------------------------------------*/ - -struct TaskXY { - unsigned int nreduce ; - unsigned int n ; - const double * x ; - const double * y ; -}; - -static -void reduce_init( TPI_Work * work ) -{ - struct TaskXY * const info = (struct TaskXY *) work->info ; - double * const dst = (double *) work->reduce ; - - if ( info->nreduce == 4 ) { - dst[0] = 0 ; - dst[1] = 0 ; - dst[2] = 0 ; - dst[3] = 0 ; - } - else if ( info->nreduce == 1 ) { - dst[0] = 0 ; - } -} - -static -void reduce_join( TPI_Work * work , const void * arg_src ) -{ - struct TaskXY * const info = (struct TaskXY *) work->info ; - double * const dst = (double *) work->reduce ; - const double * const src = (const double *) arg_src ; - - if ( info->nreduce == 4 ) { - d2_add_d( dst , src[0] ); - d2_add_d( dst , src[1] ); - d2_add_d( dst + 2 , src[2] ); - d2_add_d( dst + 2 , src[3] ); - } - else if ( info->nreduce == 1 ) { - dst[0] += src[0] ; - } -} - -/*--------------------------------------------------------------------*/ - -static -void work_d4_dot_tp( TPI_Work * work ) -{ - struct TaskXY * const info = (struct TaskXY *) work->info ; - double * const dst = (double *) work->reduce ; - - unsigned int begin , length ; - - my_span( work->count , work->rank , info->n , & begin , & length ); - - d4_dot( dst , length , info->x + begin , info->y + begin ); -} - -double d4_dot_tp( COMM comm, unsigned nwork, unsigned n, - const double * x, const double * y ) -{ - struct TaskXY info = { 4 , 0 , NULL , NULL }; - double result[4] = { 0 , 0 , 0 , 0 }; - info.n = n ; - info.x = x ; - info.y = y ; - - if ( nwork ) { - TPI_Run_reduce( work_d4_dot_tp , & info , nwork , - reduce_join, reduce_init, sizeof(result) , result ); - } - else { - TPI_Run_threads_reduce( work_d4_dot_tp , & info , - reduce_join, reduce_init, sizeof(result), result); - } - - comm_reduce_d4_sum( comm , result ); - - d2_add_d( result , result[2] ); - d2_add_d( result , result[3] ); - - return result[0] ; -} - -static -void task_ddot_tp( TPI_Work * work ) -{ - struct TaskXY * const info = (struct TaskXY *) work->info ; - double * const dst = (double *) work->reduce ; - unsigned int begin , length ; - - my_span( work->count , work->rank , info->n , & begin , & length ); - - *dst += ddot( length , info->x + begin , info->y + begin ); - - return ; -} - -double ddot_tp( COMM comm, unsigned nwork, unsigned n, - const double * x, const double * y ) -{ - struct TaskXY info = { 1 , 0 , NULL , NULL }; - double result = 0 ; - info.n = n ; - info.x = x ; - info.y = y ; - - if ( nwork ) { - TPI_Run_reduce( task_ddot_tp , & info , nwork , - reduce_join, reduce_init, sizeof(result), & result); - } - else { - TPI_Run_threads_reduce( task_ddot_tp , & info , - reduce_join, reduce_init, sizeof(result), & result); - } - - comm_reduce_dsum( comm , & result ); - - return result ; -} - -/*--------------------------------------------------------------------*/ - 
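/* Illustrative sketch, not part of the original TPI sources (the helper
 * names dd_add and dd_value are invented here): d2_add_d and d4_dot above
 * implement a compensated, sign-split accumulation.  Each running sum is a
 * ( value , correction ) pair so the low-order bits lost by every addition
 * are recovered and re-added, and positive and negative products go into
 * separate pairs to limit cancellation; d4_dot_tp folds the two pairs
 * together after the global reduction.  The minimal helper below shows the
 * same value-plus-correction idea in isolation.
 */
static void dd_add( double acc[2] /* acc[0] = value , acc[1] = correction */ ,
                    const double a )
{
  const double s = acc[0] + a ;
  /* Recover the rounding error of the addition exactly, branching on which
   * operand has the larger magnitude (Fast2Sum).
   */
  const double err = ( ( acc[0] < 0 ? -acc[0] : acc[0] ) <
                       ( a      < 0 ? -a      : a      ) )
                   ? ( acc[0] - ( s - a ) )
                   : ( a      - ( s - acc[0] ) );
  acc[0] = s ;
  acc[1] += err ;
}

static double dd_value( const double acc[2] ) { return acc[0] + acc[1] ; }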
-void dfill_rand( unsigned seed , unsigned n , double * x , double mag ) -{ - const double scale = 2.0 * mag / (double) RAND_MAX ; - double * const xe = x + n ; - for ( ; xe != x ; ++x , ++seed ) { - unsigned s = seed ; - *x = scale * ((double) rand_r( & s )) - mag ; - } -} - -struct FillWork { - double mag ; - double * beg ; - unsigned length ; - unsigned seed ; -}; - -static void task_dfill_rand( TPI_Work * work ) -{ - struct FillWork * const w = (struct FillWork *) work->info ; - - unsigned int begin , length ; - - my_span( work->count, work->rank, w->length, & begin , & length ); - - dfill_rand( w->seed + begin , length , w->beg + begin , w->mag ); -} - -void dfill_rand_tp( unsigned nblock , unsigned seed , - unsigned n , double * x , double mag ) -{ - struct FillWork data ; - data.mag = mag ; - data.beg = x ; - data.length = n ; - data.seed = seed ; - if ( nblock ) { - const int nwork = ( n + nblock - 1 ) / nblock ; - TPI_Run( & task_dfill_rand , & data , nwork , 0 ); - } - else { - TPI_Run_threads( & task_dfill_rand , & data , 0 ); - } -} - -/*--------------------------------------------------------------------*/ - -static -void test_ddot_performance( - COMM comm , - const int nthreads , - const int nblock , - const unsigned int num_trials , - const unsigned int num_tests , - const unsigned int length_array[] /* Global array length for each test */ , - const double mag ) -{ - const unsigned int ddot_flop = 2 ; /* 1 mult, 1 sum */ - const unsigned int d4_dot_flop = 12 ; /* 1 mult, 7 sum, 4 compare */ - - const unsigned int p_rank = comm_rank( comm ); - const unsigned int p_size = comm_size( comm ); - - const unsigned int max_array = length_array[ num_tests - 1 ]; - - unsigned int local_max_size = 0 ; - unsigned int i_test ; - - TPI_Init( nthreads ); - - if ( 0 == p_rank ) { - fprintf(stdout,"\n\"DDOT and D4DOT Performance testing\"\n"); - fprintf(stdout,"\"MPI size = %u , TPI size = %d , BlockSize = %d , #Trials = %u\"\n",p_size,nthreads,nblock,num_trials); - fprintf(stdout,"\"TEST\" , \"LENGTH\" , \"#CYCLE\" , \"DT-MEAN\" , \"DT-STDDEV\" , \"MFLOP-MEAN\" , \"MFLOP-STDDEV\"\n"); - } - - for ( i_test = 0 ; i_test < num_tests ; ++i_test ) { - const unsigned length = length_array[ i_test ]; /* Global */ - const unsigned ncycle = 2 * max_array / length ; - const unsigned local_max = ncycle * ( ( length + p_size - 1 ) / p_size ); - if ( local_max_size < local_max ) { local_max_size = local_max ; } - } - - { - double * const x = (double*) malloc(local_max_size * 2 * sizeof(double)); - double * const y = x + local_max_size ; - - unsigned int i , j ; - - dfill_rand_tp( nblock, 0, local_max_size, x, mag ); - dfill_rand_tp( nblock, local_max_size, local_max_size, y, mag ); - - for ( i_test = 0 ; i_test < num_tests ; ++i_test ) { - const unsigned length = length_array[ i_test ]; /* Global */ - const unsigned ncycle = 2 * max_array / length ; - - unsigned int local_begin , local_length , local_nwork ; - - double dt_sum = 0.0 ; - double dt_sum_2 = 0.0 ; - - my_span( p_size, p_rank, length, & local_begin , & local_length ); - - local_nwork = nblock ? 
( local_length + nblock - 1 ) / nblock : 0 ; - - /*--------------------------------------------------------------*/ - - for ( i = 0 ; i < num_trials ; ++i ) { - double dt = TPI_Walltime(); - for ( j = 0 ; j < ncycle ; ++j ) { - ddot_tp( comm, local_nwork, local_length, - x + j * local_length , - y + j * local_length ); - } - dt = TPI_Walltime() - dt ; - comm_reduce_dmax( comm , & dt ); - dt_sum += dt ; - dt_sum_2 += dt * dt ; - } - - if ( 0 == p_rank ) { - const double mflop = ((double)( ddot_flop * length * ncycle ) ) / ((double) 1e6 ); - - const double dt_mean = dt_sum / num_trials ; - const double dt_sdev = sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / - ( num_trials * ( num_trials - 1 ) ) ); - const double mflop_mean = mflop / dt_mean ; - const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev ); - - fprintf(stdout,"\"DDOT\" , %8u , %8u , %9.5g , %9.5g , %9.5g , %9.5g\n", - length, ncycle, dt_mean, dt_sdev, mflop_mean, mflop_sdev ); - fflush(stdout); - } - } - - for ( i_test = 0 ; i_test < num_tests ; ++i_test ) { - const unsigned length = length_array[ i_test ]; /* Global */ - const unsigned ncycle = 2 * max_array / length ; - - unsigned int local_begin , local_length , local_nwork ; - - double dt_sum = 0 ; - double dt_sum_2 = 0 ; - - my_span( p_size, p_rank, length, & local_begin , & local_length ); - - local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ; - - /*--------------------------------------------------------------*/ - - for ( i = 0 ; i < num_trials ; ++i ) { - double dt = TPI_Walltime(); - for ( j = 0 ; j < ncycle ; ++j ) { - d4_dot_tp( comm, local_nwork, local_length, - x + j * local_length , - y + j * local_length ); - } - dt = TPI_Walltime() - dt ; - comm_reduce_dmax( comm , & dt ); - dt_sum += dt ; - dt_sum_2 += dt * dt ; - } - - if ( 0 == p_rank ) { - const double mflop = ((double)( d4_dot_flop * length * ncycle ) ) / ((double) 1e6 ); - - const double dt_mean = dt_sum / num_trials ; - const double dt_sdev = sqrt( ( num_trials * dt_sum_2 - dt_sum * dt_sum ) / - ( num_trials * ( num_trials - 1 ) ) ); - const double mflop_mean = mflop / dt_mean ; - const double mflop_sdev = mflop_mean * dt_sdev / ( dt_mean + dt_sdev ); - - fprintf(stdout,"\"D4DOT\" , %8u , %8u , %9.5g , %9.5g , %9.5g , %9.5g\n", - length, ncycle, dt_mean, dt_sdev, mflop_mean, mflop_sdev ); - fflush(stdout); - } - } - - /*--------------------------------------------------------------*/ - - free( x ); - } - - TPI_Finalize(); - - return ; -} - -/*--------------------------------------------------------------------*/ - -static -void test_ddot_accuracy( - COMM comm , - const int nthreads , - const int nblock , - const unsigned int num_tests , - const unsigned int length_array[] /* Global array length for each test */ , - const double mag ) -{ - const unsigned int p_rank = comm_rank( comm ); - const unsigned int p_size = comm_size( comm ); - - const unsigned int max_array = length_array[ num_tests - 1 ]; - const unsigned int local_max_size = ( max_array + p_size - 1 ) / p_size ; - - unsigned int i_test ; - - TPI_Init( nthreads ); - - if ( 0 == p_rank ) { - fprintf(stdout,"\n\"DDOT and D4DOT Accuracy testing\"\n"); - fprintf(stdout,"\"MPI size = %u , TPI size = %d , BlockSize = %d\"\n",p_size,nthreads,nblock); - fprintf(stdout,"\"TEST\" , \"LENGTH\" , \"VALUE\"\n"); - } - - { - double * const x = (double*) malloc(local_max_size * 2 * sizeof(double)); - double * const y = x + local_max_size ; - - for ( i_test = 0 ; i_test < num_tests ; ++i_test ) { - const unsigned length = 
length_array[ i_test ]; /* Global */ - const unsigned length_half = length / 2 ; - - unsigned local_begin , local_length , local_nwork ; - - double val_ddot ; - - my_span( p_size, p_rank, length, & local_begin , & local_length ); - - local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ; - - /*--------------------------------------------------------------*/ - - if ( local_begin < length_half ) { - const unsigned len = local_length < length_half - local_begin - ? local_length : length_half - local_begin ; - - dfill_rand_tp( nblock, local_begin, len, x, mag ); - dfill_rand_tp( nblock, length + local_begin, len, y, mag ); - } - - if ( length_half < local_begin + local_length ) { - const unsigned beg = length_half > local_begin - ? length_half : local_begin ; - const unsigned off = beg - local_begin ; - const unsigned len = local_length - off ; - - dfill_rand_tp( nblock, beg - length_half, len, x + off, mag ); - dfill_rand_tp( nblock, length + beg - length_half, len, y + off, - mag ); - } - - /*--------------------------------------------------------------*/ - - val_ddot = ddot_tp( comm, local_nwork, local_length, x, y ); - - if ( 0 == p_rank ) { - fprintf(stdout,"\"DDOT\" , %8u , %9.3g\n", length , val_ddot ); - fflush(stdout); - } - } - - for ( i_test = 0 ; i_test < num_tests ; ++i_test ) { - const unsigned length = length_array[ i_test ]; /* Global */ - const unsigned length_half = length / 2 ; - - unsigned local_begin , local_length , local_nwork ; - - double val_d4_dot ; - - my_span( p_size, p_rank, length, & local_begin , & local_length ); - - local_nwork = nblock ? ( local_length + nblock - 1 ) / nblock : 0 ; - - /*--------------------------------------------------------------*/ - - if ( local_begin < length_half ) { - const unsigned len = local_length < length_half - local_begin - ? local_length : length_half - local_begin ; - - dfill_rand_tp( nblock, local_begin, len, x, mag ); - dfill_rand_tp( nblock, length + local_begin, len, y, mag ); - } - - if ( length_half < local_begin + local_length ) { - const unsigned beg = length_half > local_begin - ? 
length_half : local_begin ; - const unsigned off = beg - local_begin ; - const unsigned len = local_length - off ; - - dfill_rand_tp( nblock, beg - length_half, len, x + off, mag ); - dfill_rand_tp( nblock, length + beg - length_half, len, y + off, - mag ); - } - - /*--------------------------------------------------------------*/ - - val_d4_dot = d4_dot_tp( comm, local_nwork, local_length, x , y ); - - if ( 0 == p_rank ) { - fprintf(stdout,"\"DDOT\" , %8u , %9.3g\n", length , val_d4_dot ); - fflush(stdout); - } - } - - /*--------------------------------------------------------------*/ - - free( x ); - } - - TPI_Finalize(); - - return ; -} - -/*--------------------------------------------------------------------*/ - -const unsigned test_lengths[] = - { 1e4 , 2e4 , 5e4 , - 1e5 , 2e5 , 5e5 , - 1e6 , 2e6 , 5e6 , 1e7 }; - -const unsigned test_count = sizeof(test_lengths) / sizeof(unsigned); -const unsigned nblock = 2500 ; - -const double test_mag = 1e4 ; - -static void test_performance( - COMM comm , const int test_thread_count , const int test_thread[] ) -{ - const unsigned num_trials = 11 ; - - int i ; - - for ( i = 0 ; i < test_thread_count ; ++i ) { - - test_ddot_performance( comm , test_thread[i] , nblock, - num_trials , test_count , test_lengths , test_mag ); - - test_ddot_performance( comm , test_thread[i] , 0, - num_trials , test_count , test_lengths , test_mag ); - } -} - -static void test_accuracy( - COMM comm , const int test_thread_count , const int test_thread[] , - unsigned test_do ) -{ - int i ; - - if ( test_count < test_do ) { test_do = test_count ; } - - for ( i = 0 ; i < test_thread_count ; ++i ) { - - test_ddot_accuracy( comm, test_thread[i], nblock, - test_do, test_lengths, test_mag ); - - test_ddot_accuracy( comm, test_thread[i], 0, - test_do, test_lengths, test_mag ); - } -} - -/*--------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -#define TEST_THREAD_MAX 128 - -#if defined(HAVE_MPI) - -int main( int argc , char **argv ) -{ - int nthread[ TEST_THREAD_MAX ]; - int i ; - - MPI_Init( & argc , & argv ); - - for ( i = 0 ; i < TEST_THREAD_MAX ; ++i ) { nthread[i] = 0 ; } - - if ( 0 == comm_rank( MPI_COMM_WORLD ) ) { - if ( 1 < argc && argc < TEST_THREAD_MAX ) { - nthread[0] = 1 ; - nthread[1] = argc - 1 ; - for ( i = 1 ; i < argc ; ++i ) { nthread[i+1] = atoi( argv[i] ); } - } - else { - nthread[0] = 0 ; - nthread[1] = 1 ; - nthread[2] = 1 ; - } - } - - MPI_Bcast( nthread , TEST_THREAD_MAX , MPI_INT , 0 , MPI_COMM_WORLD ); - - if ( nthread[0] ) { - test_accuracy( MPI_COMM_WORLD , nthread[1] , nthread + 2 , test_count ); - test_performance( MPI_COMM_WORLD , nthread[1] , nthread + 2 ); - } - else { - test_accuracy( MPI_COMM_WORLD , nthread[1] , nthread + 2 , 3 ); - } - - MPI_Finalize(); - - return 0 ; -} - -static int comm_size( COMM comm ) -{ - int size = 0 ; - MPI_Comm_size( comm , & size ); - return size ; -} - -static int comm_rank( COMM comm ) -{ - int rank = 0 ; - MPI_Comm_rank( comm , & rank ); - return rank ; -} - -static void comm_reduce_dmax( COMM comm , double * val ) -{ - double tmp ; - if ( MPI_SUCCESS == - MPI_Allreduce( val , & tmp , 1 , MPI_DOUBLE , MPI_MAX , comm ) ) { - *val = tmp ; - } - else { - *val = 0 ; - } -} - -static void comm_reduce_dsum( COMM comm , double * val ) -{ - double tmp ; - if ( MPI_SUCCESS == - MPI_Allreduce( val , & tmp , 1 , MPI_DOUBLE , MPI_SUM , comm ) ) { - *val = tmp ; - } - else { - *val = 0 ; - } -} - -static void comm_reduce_d4_op( void * 
argin , - void * argout , - int * n , - MPI_Datatype * d ) -{ - if ( d && n && *n == 4 ) { - double * const in = (double*) argin ; - double * const out = (double*) argout ; - d2_add_d( out , in[0] ); - d2_add_d( out , in[1] ); - d2_add_d( out + 2 , in[2] ); - d2_add_d( out + 2 , in[3] ); - } - return ; -} - -static void comm_reduce_d4_sum( COMM comm , double * val ) -{ - double tmp[4] ; - MPI_Op mpi_op = MPI_OP_NULL ; - - /* Use Reduce->Bcast instead of Allreduce due to a bug with the SUN MPI. */ - - MPI_Op_create( comm_reduce_d4_op , 0 , & mpi_op ); - MPI_Reduce( val , tmp , 4 , MPI_DOUBLE , mpi_op , 0 , comm ); - MPI_Bcast( tmp , 4 , MPI_DOUBLE , 0 , comm ); - MPI_Op_free( & mpi_op ); - - val[0] = tmp[0] ; - val[1] = tmp[1] ; - val[2] = tmp[2] ; - val[3] = tmp[3] ; -} - -#else - -int main( int argc , char **argv ) -{ - int nthread[ TEST_THREAD_MAX ]; - int i ; - - for ( i = 0 ; i < TEST_THREAD_MAX ; ++i ) { nthread[i] = 0 ; } - - if ( 1 < argc && argc < TEST_THREAD_MAX ) { - nthread[0] = 1 ; - nthread[1] = argc - 1 ; - for ( i = 1 ; i < argc ; ++i ) { nthread[i+1] = atoi( argv[i] ); } - } - else { - nthread[0] = 0 ; - nthread[1] = 4 ; - nthread[2] = 1 ; - nthread[3] = 2 ; - nthread[4] = 4 ; - nthread[5] = 8 ; - } - - if ( nthread[0] ) { - test_accuracy( 0 , nthread[1] , nthread + 2 , test_count ); - test_performance( 0 , nthread[1] , nthread + 2 ); - } - else { - test_accuracy( 0 , nthread[1] , nthread + 2 , 3 ); - } - - return 0 ; -} - -static int comm_size( COMM comm ) { return comm ? -1 : 1 ; } -static int comm_rank( COMM comm ) { return comm ? -1 : 0 ; } -static void comm_reduce_dmax( COMM comm , double * val ) -{ - if ( comm ) { *val = 0 ; } - return ; -} -static void comm_reduce_dsum( COMM comm , double * val ) -{ - if ( comm ) { *val = 0 ; } - return ; -} -static void comm_reduce_d4_sum( COMM comm , double * val ) -{ - if ( comm ) { val[0] = val[1] = val[2] = val[3] = 0 ; } - return ; -} - -#endif - -/*--------------------------------------------------------------------*/ - diff --git a/kokkos/basic/optional/ThreadPool/test/test_pthreads.c b/kokkos/basic/optional/ThreadPool/test/test_pthreads.c deleted file mode 100644 index 235eb41..0000000 --- a/kokkos/basic/optional/ThreadPool/test/test_pthreads.c +++ /dev/null @@ -1,279 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. 
Carter Edwards - */ - -#include -#include -#include -#include - -/*------------------------------------------------------------------------*/ -/* Test various ways of controling worker threads */ - -typedef struct TestPthreads_struct { - pthread_mutex_t m_lock ; - pthread_cond_t m_cond ; - int m_thread_rank ; - int m_thread_count ; -} TestPthreads ; - -/*------------------------------------------------------------------------*/ -/*------------------------------------------------------------------------*/ - -static void * test_driver( void * arg ) -{ - TestPthreads * const data = (TestPthreads*) arg ; - TestPthreads * const root = data - data->m_thread_rank ; - - /*------------------------------*/ - /* Initializing */ - - pthread_mutex_lock( & data->m_lock ); - - pthread_mutex_lock( & root->m_lock ); - pthread_cond_signal( & root->m_cond ); - pthread_mutex_unlock( & root->m_lock ); - - /*------------------------------*/ - - while ( data->m_thread_rank ) { - pthread_cond_wait( & data->m_cond , & data->m_lock ); - } - pthread_mutex_unlock( & data->m_lock ); - - /*------------------------------*/ - /* Terminating */ - - pthread_mutex_lock( & root->m_lock ); - if ( 0 == --( root->m_thread_count ) ) { - pthread_cond_signal( & root->m_cond ); - } - pthread_mutex_unlock( & root->m_lock ); - - return NULL ; -} - - -static void test_run( pthread_attr_t * const thread_attr , - const int number_threads , - const int number_trials , - const int number_loops , - double * const dt_start_stop , - double * const dt_loop ) -{ - TestPthreads data[ number_threads ]; - double dt_total ; - double dt_run = 0 ; - int j ; - - dt_total = TPI_Walltime(); - - for ( j = 0 ; j < number_trials ; ++j ) { - int i ; - - for ( i = 0 ; i < number_threads ; ++i ) { - pthread_cond_init( & data[i].m_cond , NULL ); - pthread_mutex_init( & data[i].m_lock , NULL ); - data[i].m_thread_rank = i ; - data[i].m_thread_count = number_threads ; - } - - pthread_mutex_lock( & data->m_lock ); - - for ( i = 1 ; i < number_threads ; ++i ) { - pthread_t pt ; - pthread_create( & pt, thread_attr, & test_driver , data + i ); - pthread_cond_wait( & data->m_cond , & data->m_lock ); - pthread_mutex_lock( & data[i].m_lock ); - } - - /* Running */ - - { - double dt = TPI_Walltime(); - int k ; - - for ( k = 1 ; k < number_loops ; ++k ) { - for ( i = 1 ; i < number_threads ; ++i ) { - pthread_cond_signal( & data[i].m_cond ); - pthread_mutex_unlock( & data[i].m_lock ); - } - - /* Work goes here */ - - for ( i = 1 ; i < number_threads ; ++i ) { - pthread_mutex_lock( & data[i].m_lock ); - } - } - - dt_run += TPI_Walltime() - dt ; - } - - /* Termination */ - - --( data->m_thread_count ); - - if ( data->m_thread_count ) { - for ( i = 1 ; i < number_threads ; ++i ) { - data[i].m_thread_rank = 0 ; - pthread_cond_signal( & data[i].m_cond ); - pthread_mutex_unlock( & data[i].m_lock ); - } - - pthread_cond_wait( & data->m_cond , & data->m_lock ); - } - - pthread_mutex_unlock( & data->m_lock ); - - for ( i = 0 ; i < number_threads ; ++i ) { - pthread_cond_destroy( & data[i].m_cond ); - pthread_mutex_destroy( & data[i].m_lock ); - } - } - - dt_total = TPI_Walltime() - dt_total ; - - *dt_loop = 1.0e6 * dt_run / (double) ( number_trials * number_loops ); - *dt_start_stop = 1.0e6 * ( dt_total - dt_run ) / (double) number_trials ; -} - -/*------------------------------------------------------------------------*/ -/*------------------------------------------------------------------------*/ - -static double test_mutex_init_destroy( const int number ) -{ - pthread_mutex_t 
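/* Microbenchmark: time 'number' init/destroy pairs on a single mutex and
 * return the cost per pair in seconds (the caller scales to microseconds);
 * test_mutex_lock_unlock below does the same for lock/unlock pairs.
 */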
mutex ; - double dt ; - int i ; - dt = TPI_Walltime(); - for ( i = 0 ; i < number ; ++i ) { - pthread_mutex_init( & mutex , NULL ); - pthread_mutex_destroy( & mutex ); - } - dt = ( TPI_Walltime() - dt ) / (double) number ; - return dt ; -} - -static double test_mutex_lock_unlock( const int number ) -{ - pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER ; - double dt ; - int i ; - - dt = TPI_Walltime(); - for ( i = 0 ; i < number ; ++i ) { - pthread_mutex_lock( & mutex ); - pthread_mutex_unlock( & mutex ); - } - dt = ( TPI_Walltime() - dt ) / (double) number ; - - pthread_mutex_destroy( & mutex ); - return dt ; -} - -/*------------------------------------------------------------------------*/ - -void test_pthreads_performance( int n_test , int * n_concurrent ) -{ - const int n_mutex = 1e4 /* 1e8 */ ; - const int n_trial = 1e2 /* 1e4 */ ; - const int n_loop = 1e3 /* 1e4 */ ; - - { - const double dt = 1e6 * test_mutex_init_destroy( n_mutex ); - fprintf(stdout,"\n\"test pthreads mutex init/destroy (microsec)\" , %g\n",dt); - } - - { - const double dt = 1e6 * test_mutex_lock_unlock( n_mutex ); - fprintf(stdout,"\n\"test pthreads mutex lock/unlock (microsec)\" , %g\n",dt); - } - - /*------------------------------------------------------------------*/ - - { - int i ; - - pthread_attr_t thread_attr ; - - fprintf(stdout,"\n\"test pthreads SCOPE_SYSTEM run-blocking\"\n"); - fprintf(stdout,"\"#Threads\" , \"#Spawned\" \"Spawn (microsec)\" , \"Loop (microsec)\"\n"); - - pthread_attr_init( & thread_attr ); - pthread_attr_setscope( & thread_attr, PTHREAD_SCOPE_SYSTEM ); - pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_DETACHED ); - - for ( i = 0 ; i < n_test ; ++i ) { - const int nthread = n_concurrent[i] ; - double dt_start_stop , dt_loop ; - - test_run( & thread_attr, nthread, n_trial, n_loop, - & dt_start_stop , & dt_loop ); - - fprintf( stdout, "%d , %d , %g , %g\n", - nthread , nthread - 1 , dt_start_stop , dt_loop ); - fflush( stdout ); - } - - pthread_attr_destroy( & thread_attr ); - } - - /*------------------------------------------------------------------*/ - - { - int i ; - - pthread_attr_t thread_attr ; - - fprintf(stdout,"\n\"test pthreads SCOPE_PROCESS run-blocking\"\n"); - fprintf(stdout,"\"#Threads\" , \"#Spawned\" \"Spawn (microsec)\" , \"Loop (microsec)\"\n"); - - pthread_attr_init( & thread_attr ); - pthread_attr_setscope( & thread_attr, PTHREAD_SCOPE_PROCESS ); - pthread_attr_setdetachstate( & thread_attr, PTHREAD_CREATE_DETACHED ); - - for ( i = 0 ; i < n_test ; ++i ) { - const int nthread = n_concurrent[i] ; - double dt_start_stop , dt_loop ; - - test_run( & thread_attr, nthread, n_trial, n_loop, - & dt_start_stop , & dt_loop ); - - fprintf( stdout, "%d , %d , %g , %g\n", - nthread , nthread - 1 , dt_start_stop , dt_loop ); - fflush( stdout ); - } - - pthread_attr_destroy( & thread_attr ); - } - - /*------------------------------------------------------------------*/ - - fflush( stdout ); -} - -/*------------------------------------------------------------------------*/ - - diff --git a/kokkos/basic/optional/ThreadPool/test/test_tpi.cpp b/kokkos/basic/optional/ThreadPool/test/test_tpi.cpp deleted file mode 100644 index cf5a649..0000000 --- a/kokkos/basic/optional/ThreadPool/test/test_tpi.cpp +++ /dev/null @@ -1,123 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for 
use of this work by or on behalf of the U.S. Government. */ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -#include -#include -#include - -/*------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------*/ - -template class TEST ; - -template -class TEST { -public: - int m_flag[N] ; - ~TEST() {} - TEST(); - void flag( TPI::Work & ); - void verify(); -private: - TEST( const TEST & ); - TEST & operator = ( const TEST & ); -}; - -template -TEST::TEST() -{ - for ( unsigned i = 0 ; i < N ; ++i ) { m_flag[i] = 0 ; } -} - -template -void TEST::flag( TPI::Work & work ) -{ - static const char method[] = "TEST::flag" ; - if ( work.count != (int) N ) { - std::cerr << method - << "<" << N << "> count(" << work.count << ") failed" - << std::endl ; - throw std::exception(); - } - m_flag[ work.rank ] = 1 ; -} - -template -void TEST::verify() -{ - static const char method[] = "TEST::verify" ; - - for ( unsigned i = 0 ; i < N ; ++i ) { - if ( ! m_flag[i] ) { - std::cerr << method - << "<" << N << "> m_flag[" << i << "] failed" - << std::endl ; - throw std::exception(); - } - else { - m_flag[i] = 0 ; - } - } -} - -void test_tpi_cpp( int np ) -{ - TEST<1> test_1 ; - TEST<2> test_2 ; - TEST<4> test_4 ; - TEST<8> test_8 ; - TEST<16> test_16 ; - - TPI::Init( np ); - - TPI::Run( test_1 , & TEST<1>::flag , 1 ); - TPI::Run( test_2 , & TEST<2>::flag , 2 ); - TPI::Run( test_4 , & TEST<4>::flag , 4 ); - TPI::Run( test_8 , & TEST<8>::flag , 8 ); - TPI::Run( test_16 , & TEST<16>::flag , 16 ); - - test_1.verify(); - test_2.verify(); - test_4.verify(); - test_8.verify(); - test_16.verify(); - - TPI::Finalize(); -} - -int main( int argc , char ** argv ) -{ - if ( argc ) { std::cout << argv[0] ; } - else { std::cout << "test" ; } - test_tpi_cpp(1); std::cout << " 1 " ; - test_tpi_cpp(2); std::cout << " 2 " ; - test_tpi_cpp(4); std::cout << " 4 " ; - test_tpi_cpp(8); std::cout << " 8 " ; - test_tpi_cpp(16); std::cout << " 16 " ; - std::cout << " passed" << std::endl ; - return 0 ; -} - diff --git a/kokkos/basic/optional/ThreadPool/test/test_tpi_unit.c b/kokkos/basic/optional/ThreadPool/test/test_tpi_unit.c deleted file mode 100644 index 34faef8..0000000 --- a/kokkos/basic/optional/ThreadPool/test/test_tpi_unit.c +++ /dev/null @@ -1,505 +0,0 @@ -/*------------------------------------------------------------------------*/ -/* TPI: Thread Pool Interface */ -/* Copyright (2008) Sandia Corporation */ -/* */ -/* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ -/* license for use of this work by or on behalf of the U.S. Government. 
*/ -/* */ -/* This library is free software; you can redistribute it and/or modify */ -/* it under the terms of the GNU Lesser General Public License as */ -/* published by the Free Software Foundation; either version 2.1 of the */ -/* License, or (at your option) any later version. */ -/* */ -/* This library is distributed in the hope that it will be useful, */ -/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ -/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU */ -/* Lesser General Public License for more details. */ -/* */ -/* You should have received a copy of the GNU Lesser General Public */ -/* License along with this library; if not, write to the Free Software */ -/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ -/* USA */ -/*------------------------------------------------------------------------*/ -/** - * @author H. Carter Edwards - */ - -#include -#include -#include -#include - -#if defined( HAVE_MPI ) -#include -#endif - -/*--------------------------------------------------------------------*/ - -static void test_work( TPI_Work * ); -static void test_reduce_work( TPI_Work * ); -static void test_reduce_init( TPI_Work * ); -static void test_reduce_join( TPI_Work * , const void * ); -static void test_reduce_via_lock( TPI_Work * ); -static void test_reduce_via_nolock( TPI_Work * ); - -void test_tpi_init( const int ntest, const int nthread[], const int ntrial); -void test_tpi_block( const int ntest, const int nthread[], const int ntrial); -void test_tpi_reduce( const int ntest, const int nthread[], const int ntrial); -void test_tpi_work( const int ntest, const int nthread[], - const int nwork , const int ntrial ); -void test_tpi_work_async( - const int ntest , const int nthread[] , const int nwork , const int ntrial ); - -int main( int argc , char ** argv ) -{ - int num_thread[] = { 1 , 2 , 4 , 6 , 8 , 12 , 16 }; - int num_test = sizeof(num_thread) / sizeof(int); - -#if defined( HAVE_MPI ) - int rank ; - - MPI_Init( & argc , & argv ); - MPI_Comm_rank( MPI_COMM_WORLD , & rank ); - if ( 0 == rank ) { -#endif - - const int ntrial = 1 < argc ? atoi( argv[1] ) : 5 ; - const int nwork = 2 < argc ? atoi( argv[2] ) : 100 ; - - /* Get the configuration print message out. 
*/ - fprintf( stdout , "\"%s\"\n" , TPI_Version() ); - fprintf( stdout , "\"Unit Testing: ntrial = %d , nwork = %d\"\n" , ntrial , nwork ); - - test_tpi_init( num_test , num_thread , ntrial ); - test_tpi_block( num_test , num_thread , ntrial ); - test_tpi_reduce( num_test , num_thread , ntrial ); - test_tpi_work( num_test , num_thread , nwork , ntrial ); - test_tpi_work_async( num_test , num_thread , nwork , ntrial ); - -#if defined( HAVE_MPI ) - } - MPI_Finalize(); -#endif - - return 0 ; -} - -/*--------------------------------------------------------------------*/ - -void test_tpi_init( const int ntest , const int nthread[] , const int ntrial ) -{ - int j ; - - fprintf( stdout , "\n\"TEST TPI_Init / TPI_Finalize\"\n" ); - fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Init(avg-msec)\" , \"TPI_Init(stddev-msec)\" , \"TPI_Finalize(avg-msec)\" , \"TPI_Finalize(stddev-msec)\"\n"); - - for ( j = 0 ; j < ntest ; ++j ) { - const int nth = nthread[j]; - double dt_init_total = 0.0 ; - double dt_init_total_2 = 0.0 ; - double dt_fin_total = 0.0 ; - double dt_fin_total_2 = 0.0 ; - int i ; - int result ; - - for ( i = 0 ; i < ntrial ; ++i ) { - double t , dt ; - - t = TPI_Walltime(); - result = TPI_Init( nth ); - dt = TPI_Walltime() - t ; - dt_init_total += dt ; - dt_init_total_2 += dt * dt ; - - if ( result != nth ) { - fprintf(stderr,"%d != TPI_Init(%d) : FAILED at trial %d\n", - result , nth , i ); - abort(); - } - - t = TPI_Walltime(); - TPI_Finalize(); - dt = TPI_Walltime() - t ; - dt_fin_total += dt ; - dt_fin_total_2 += dt * dt ; - } - - if ( 1 < ntrial ) { - const double init_mean = 1.0e6 * dt_init_total / ntrial ; - const double init_sdev = 1.0e6 * sqrt( ( ntrial * dt_init_total_2 - - dt_init_total * dt_init_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - const double fin_mean = 1.0e6 * dt_fin_total / ntrial ; - const double fin_sdev = 1.0e6 * sqrt( ( ntrial * dt_fin_total_2 - - dt_fin_total * dt_fin_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n", - nth , ntrial , init_mean , init_sdev , fin_mean , fin_sdev ); - } - } -} - -/*--------------------------------------------------------------------*/ - -void test_tpi_block( const int ntest , const int nthread[] , const int ntrial ) -{ - int i, j ; - - fprintf( stdout , "\n\"TEST TPI_Block / TPI_Unblock\"\n" ); - fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Block(avg-msec)\" , \"TPI_Block(stddev-msec)\" , \"TPI_Unblock(avg-msec)\" , \"TPI_Unblock(stddev-msec)\"\n"); - - for ( j = 0 ; j < ntest ; ++j ) { - const int nth = nthread[j]; - - double dt_block_total = 0.0 ; - double dt_block_total_2 = 0.0 ; - double dt_unblock_total = 0.0 ; - double dt_unblock_total_2 = 0.0 ; - - int result = TPI_Init( nth ); - - if ( result != nth ) { - fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); - abort(); - } - - for ( i = 0 ; i < ntrial ; ++i ) { - double t , dt ; - - t = TPI_Walltime(); - TPI_Block(); - dt = TPI_Walltime() - t ; - dt_block_total += dt ; - dt_block_total_2 += dt * dt ; - - - t = TPI_Walltime(); - TPI_Unblock(); - dt = TPI_Walltime() - t ; - dt_unblock_total += dt ; - dt_unblock_total_2 += dt * dt ; - } - - TPI_Finalize(); - - if ( 1 < ntrial ) { - const double block_mean = 1.0e6 * dt_block_total / ntrial ; - const double block_sdev = 1.0e6 * sqrt( ( ntrial * dt_block_total_2 - - dt_block_total * dt_block_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - const double unblock_mean = 1.0e6 * dt_unblock_total / ntrial ; - const double unblock_sdev = 1.0e6 * sqrt( ( ntrial * 
dt_unblock_total_2 - - dt_unblock_total * dt_unblock_total) / - ( ntrial * ( ntrial - 1 ) ) ); - - fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n", - nth , ntrial , block_mean , block_sdev , unblock_mean , unblock_sdev ); - } - } -} - -/*--------------------------------------------------------------------*/ - -void test_tpi_reduce( const int ntest , const int nthread[] , const int ntrial ) -{ - int j ; - - fprintf( stdout , "\n\"TEST TPI_Run_threads(reduce) / TPI_Run_threads_reduce\"\n" ); - fprintf( stdout , "\"#Thread\" , \"#Trial\" , \"TPI_Run_threads(avg-msec)\" , \"TPI_Run_threads(stddev-msec)\" , \"TPI_Run_threads_reduce(avg-msec)\" , \"TPI_Run_threads_reduce(stddev-msec)\"\n"); - - for ( j = 0 ; j < ntest ; ++j ) { - const int nth = nthread[j]; - - double dt_lock_total = 0.0 ; - double dt_lock_total_2 = 0.0 ; - double dt_reduce_total = 0.0 ; - double dt_reduce_total_2 = 0.0 ; - int i ; - - int result = TPI_Init( nth ); - - if ( result != nth ) { - fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); - } - - for ( i = 0 ; i < ntrial ; ++i ) { - double t , dt ; - int value = 0 ; - int * const ptr = & value ; - - t = TPI_Walltime(); - TPI_Run_threads( test_reduce_via_lock , & ptr , 1 ); - dt = TPI_Walltime() - t ; - dt_lock_total += dt ; - dt_lock_total_2 += dt * dt ; - - if ( value != nth ) { - fprintf(stderr, - "TPI_Run_threads(reduce,...) : FAILED at trial %d\n", - i ); - abort(); - } - - value = 0 ; - - t = TPI_Walltime(); - TPI_Run_threads_reduce( test_reduce_via_nolock , NULL , - test_reduce_join , test_reduce_init , - sizeof(value) , & value ); - - dt = TPI_Walltime() - t ; - dt_reduce_total += dt ; - dt_reduce_total_2 += dt * dt ; - - if ( value != nth ) { - fprintf(stderr, - "TPI_Run_threads_reduce(...) : FAILED at trial %d\n", - i ); - abort(); - } - } - - TPI_Finalize(); - - if ( 1 < ntrial ) { - const double lock_mean = 1.0e6 * dt_lock_total / ntrial ; - const double lock_sdev = 1.0e6 * sqrt( ( ntrial * dt_lock_total_2 - - dt_lock_total * dt_lock_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; - const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - - dt_reduce_total * dt_reduce_total) / - ( ntrial * ( ntrial - 1 ) ) ); - - fprintf(stdout,"%d , %d , %10g , %10g , %10g , %10g\n", - nth, ntrial, lock_mean, lock_sdev, reduce_mean, reduce_sdev); - } - } -} - -/*--------------------------------------------------------------------*/ - -void test_tpi_work( const int ntest , const int nthread[] , const int nwork , - const int ntrial ) -{ - int * const flags = (int *) malloc( sizeof(int) * nwork ); - int j ; - - fprintf( stdout , "\n\"TEST TPI_Run / TPI_Run_reduce\"\n" ); - fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Run(avg-msec)\" , \"TPI_Run(stddev-msec)\" , \"TPI_Run_reduce(avg-msec)\" , \"TPI_Run_reduce(stddev-msec)\"\n"); - - for ( j = 0 ; j < ntest ; ++j ) { - const int nth = nthread[j]; - - double dt_work_total = 0.0 ; - double dt_work_total_2 = 0.0 ; - double dt_reduce_total = 0.0 ; - double dt_reduce_total_2 = 0.0 ; - int i , k ; - - int result = TPI_Init( nth ); - - if ( result != nth ) { - fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); - } - - for ( i = 0 ; i < ntrial ; ++i ) { - double t , dt ; - int value = 0 ; - - for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } - - t = TPI_Walltime(); - TPI_Run( test_work , & flags , nwork , 0 ); - dt = TPI_Walltime() - t ; - dt_work_total += dt ; - dt_work_total_2 += dt * dt ; - - for ( k = 0 ; k < 
nwork && flags[k] ; ++k ); - - if ( k < nwork ) { - fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i ); - abort(); - } - - for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } - - t = TPI_Walltime(); - TPI_Run_reduce( test_reduce_work , & flags , nwork , - test_reduce_join , test_reduce_init , - sizeof(value) , & value ); - - dt = TPI_Walltime() - t ; - dt_reduce_total += dt ; - dt_reduce_total_2 += dt * dt ; - - for ( k = 0 ; k < nwork && flags[k] ; ++k ); - - if ( value != nwork || k < nwork ) { - fprintf(stderr, "TPI_Run_reduce(...) : FAILED at trial %d\n", i ); - abort(); - } - } - - TPI_Finalize(); - - if ( 1 < ntrial ) { - const double work_mean = 1.0e6 * dt_work_total / ntrial ; - const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 - - dt_work_total * dt_work_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; - const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - - dt_reduce_total * dt_reduce_total) / - ( ntrial * ( ntrial - 1 ) ) ); - - fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n", - nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev); - } - } - - free( flags ); -} - -/*--------------------------------------------------------------------*/ - -void test_tpi_work_async( - const int ntest , const int nthread[] , const int nwork , const int ntrial ) -{ - int * const flags = (int *) malloc( sizeof(int) * nwork ); - int j ; - - fprintf( stdout , "\n\"TEST TPI_Start / TPI_Start_reduce\"\n" ); - fprintf( stdout , "\"#Thread\" , \"#Work\" , \"#Trial\" , \"TPI_Start(avg-msec)\" , \"TPI_Start(stddev-msec)\" , \"TPI_Start_reduce(avg-msec)\" , \"TPI_Start_reduce(stddev-msec)\"\n"); - - for ( j = 0 ; j < ntest ; ++j ) { - const int nth = nthread[j]; - - double dt_work_total = 0.0 ; - double dt_work_total_2 = 0.0 ; - double dt_reduce_total = 0.0 ; - double dt_reduce_total_2 = 0.0 ; - int i , k ; - - int result = TPI_Init( nth ); - - if ( result != nth ) { - fprintf(stderr,"%d != TPI_Init(%d) : FAILED\n", result , nth ); - } - - for ( i = 0 ; i < ntrial ; ++i ) { - double t , dt ; - int value = 0 ; - - for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } - - t = TPI_Walltime(); - TPI_Start( test_work , & flags , nwork , 0 ); - TPI_Wait(); - dt = TPI_Walltime() - t ; - dt_work_total += dt ; - dt_work_total_2 += dt * dt ; - - for ( k = 0 ; k < nwork && flags[k] ; ++k ); - - if ( k < nwork ) { - fprintf(stderr, "TPI_Run(...) : FAILED at trial %d\n", i ); - abort(); - } - - for ( k = 0 ; k < nwork ; ++k ) { flags[k] = 0 ; } - - t = TPI_Walltime(); - - TPI_Start_reduce( test_reduce_work , & flags , nwork , - test_reduce_join , test_reduce_init , - sizeof(value) , & value ); - TPI_Wait(); - - dt = TPI_Walltime() - t ; - dt_reduce_total += dt ; - dt_reduce_total_2 += dt * dt ; - - for ( k = 0 ; k < nwork && flags[k] ; ++k ); - - if ( value != nwork || k < nwork ) { - fprintf(stderr, "TPI_Run_reduce(...) 
: FAILED at trial %d\n", i ); - abort(); - } - } - - TPI_Finalize(); - - if ( 1 < ntrial ) { - const double work_mean = 1.0e6 * dt_work_total / ntrial ; - const double work_sdev = 1.0e6 * sqrt( ( ntrial * dt_work_total_2 - - dt_work_total * dt_work_total ) / - ( ntrial * ( ntrial - 1 ) ) ); - - const double reduce_mean = 1.0e6 * dt_reduce_total / ntrial ; - const double reduce_sdev = 1.0e6 * sqrt( ( ntrial * dt_reduce_total_2 - - dt_reduce_total * dt_reduce_total) / - ( ntrial * ( ntrial - 1 ) ) ); - - fprintf(stdout,"%d , %d , %d , %10g , %10g , %10g , %10g\n", - nth, ntrial, nwork, work_mean, work_sdev, reduce_mean, reduce_sdev); - } - } - - free( flags ); -} - -/*--------------------------------------------------------------------*/ - -static void test_work( TPI_Work * work ) -{ - int * const flags = * (int *const*) work->info ; - flags[ work->rank ] = 1 ; -} - -static void test_reduce_work( TPI_Work * work ) -{ - int * const flags = * (int *const*) work->info ; - flags[ work->rank ] = 1 ; - - *((int *) work->reduce) += 1 ; -} - -static void test_reduce_init( TPI_Work * work ) -{ - *((int *) work->reduce) = 0 ; -} - -static void test_reduce_join( TPI_Work * work , const void * src ) -{ - *((int *) work->reduce) += *( (const int *) src ); -} - -static void test_reduce_via_lock( TPI_Work * work ) -{ - int * const value = * ((int *const*) work->info ); - int result ; - if ( ( result = TPI_Lock(0) ) ) { - fprintf(stderr,"TPI_Lock(0) = %d : FAILED\n", result); - abort(); - } - *value += 1 ; - if ( ( result = TPI_Unlock(0) ) ) { - fprintf(stderr,"TPI_Unlock(0) = %d : FAILED\n", result); - abort(); - } -} - -static void test_reduce_via_nolock( TPI_Work * work ) -{ - int * const value = (int *) work->reduce ; - *value += 1 ; -} - -/*--------------------------------------------------------------------*/ - diff --git a/kokkos/basic/optional/copy_from_trilinos b/kokkos/basic/optional/copy_from_trilinos deleted file mode 100755 index 042e4fb..0000000 --- a/kokkos/basic/optional/copy_from_trilinos +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -TRILINOS_SRC=$1 - -if [ -d "${TRILINOS_SRC}" -a -d "${TRILINOS_SRC}/packages" ] ; -then - -#----------------------------------------------------------------------- -cp -r ${TRILINOS_SRC}/packages/ThreadPool/* ThreadPool -rm -rf ThreadPool/doc - -cat << END_CAT > ThreadPool/ThreadPool_config.h -#ifndef HAVE_PTHREAD -#define HAVE_PTHREAD -#endif -END_CAT - -#----------------------------------------------------------------------- - -else - - echo 'usage: ' $0 '' - -fi - diff --git a/kokkos/basic/optional/cuda/CudaCall.hpp b/kokkos/basic/optional/cuda/CudaCall.hpp deleted file mode 100644 index f4b8c70..0000000 --- a/kokkos/basic/optional/cuda/CudaCall.hpp +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef stk_algsup_CudaCall_hpp -#define stk_algsup_CudaCall_hpp - -#include -#include - -//---------------------------------------------------------------- -inline -void stk_cuda_call(cudaError err , const char* name ) -{ - if ( err != cudaSuccess ) { - fprintf(stderr, "%s error: %s\n",name, cudaGetErrorString(err) ); - exit(-1); - } -} - -#define CUDA_CALL( cuda_fn ) stk_cuda_call( cuda_fn , #cuda_fn ) - - -#endif - diff --git a/kokkos/basic/optional/cuda/CudaMemoryModel.hpp b/kokkos/basic/optional/cuda/CudaMemoryModel.hpp deleted file mode 100644 index 54d189e..0000000 --- a/kokkos/basic/optional/cuda/CudaMemoryModel.hpp +++ /dev/null @@ -1,152 +0,0 @@ -#ifndef _CudaMemoryModel_hpp_ -#define _CudaMemoryModel_hpp_ - -#include -#ifdef MINIFE_HAVE_CUDA - -#include -#include 
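Editor's note on the reduction callbacks above: test_reduce_via_lock and test_reduce_via_nolock contrast the two accumulation strategies timed by test_tpi_reduce. The first serializes every increment of one shared counter behind TPI_Lock/TPI_Unlock; the second lets each thread bump its private work->reduce slot, which test_reduce_init zeroes and test_reduce_join combines. A stand-alone sketch of the same two strategies using standard C++ threads instead of TPI (thread count and names are illustrative only):

    #include <iostream>
    #include <mutex>
    #include <numeric>
    #include <thread>
    #include <vector>

    int main()
    {
      const int nthreads = 4;

      // Strategy 1: every thread bumps one shared counter under a lock
      // (the pattern of test_reduce_via_lock).
      int shared_count = 0;
      std::mutex m;
      {
        std::vector<std::thread> pool;
        for (int t = 0; t < nthreads; ++t)
          pool.emplace_back([&] { std::lock_guard<std::mutex> g(m); ++shared_count; });
        for (std::thread& th : pool) th.join();
      }

      // Strategy 2: every thread gets its own zero-initialized partial ("init")
      // and the partials are combined afterwards ("join"), the pattern of
      // TPI_Run_threads_reduce / test_reduce_via_nolock.
      std::vector<int> partials(nthreads, 0);
      {
        std::vector<std::thread> pool;
        for (int t = 0; t < nthreads; ++t)
          pool.emplace_back([&partials, t] { ++partials[t]; });
        for (std::thread& th : pool) th.join();
      }
      const int reduced = std::accumulate(partials.begin(), partials.end(), 0);

      std::cout << shared_count << " " << reduced << "\n";   // both print 4
      return 0;
    }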
-#include - -#include -#include -#include - -class CudaMemoryModel { - public: - CudaMemoryModel() - : host_to_device_map(), - device_to_host_map() - {} - - /** Destructor - * Upon destruction this class de-allocates all device-buffers that - * it was tracking. - */ - virtual ~CudaMemoryModel(); - - /** Return a device-pointer corresponding to the given host-ptr and size. - * The returned device-pointer points to a buffer which has been allocated - * on the CUDA device with length buf_size*sizeof(T), but not initialized. - * - * If a device-pointer has already been allocated for the given host-pointer - * (by a previous call to this method) then that (previously-allocated) device-pointer - * is returned. - */ - template - T* get_buffer(const T* host_ptr, size_t buf_size); - - /** Destroy (free) the specified device-pointer. - * - * De-allocates the cuda-device buffer. - */ - template - void destroy_buffer(T*& device_ptr); - - /** Copy the contents of the given host-ptr to the given device-ptr. - * If the given device-ptr is not known (was not created by a previous - * call to get_buffer), an exception is thrown. - */ - template - void copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr); - - /** Copy the contents of the given device-ptr to the given host-ptr. - * If the given device-ptr is not known (was not created by a previous - * call to get_buffer), an exception is thrown. - */ - template - void copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr); - - private: - std::map host_to_device_map; - std::map device_to_host_map; -}; - -//------------------------------------------------------------------------------ -template -inline -T* CudaMemoryModel::get_buffer(const T* host_ptr, size_t buf_size) -{ - T* device_ptr = NULL; - - std::map::iterator iter = host_to_device_map.find(host_ptr); - - if (iter == host_to_device_map.end()) { - CUDA_CALL( cudaMalloc( (void**)&device_ptr, sizeof(T)*buf_size) ); - - host_to_device_map.insert( std::make_pair(host_ptr, device_ptr) ); - device_to_host_map.insert( std::make_pair(device_ptr, host_ptr) ); - } - else { - device_ptr = reinterpret_cast(iter->second); - } - - return device_ptr; -} - -//------------------------------------------------------------------------------ -template -inline -void CudaMemoryModel::destroy_buffer(T*& device_ptr) -{ - std::map::iterator iter = device_to_host_map.find(device_ptr); - if (iter != device_to_host_map.end()) { - const void* host_ptr = iter->second; - if (host_ptr != NULL) { - std::map::iterator iter2 = host_to_device_map.find(host_ptr); - if (iter2 != host_to_device_map.end()) { - host_to_device_map.erase(iter2); - } - } - CUDA_CALL( cudaFree(device_ptr) ); - device_ptr = NULL; - device_to_host_map.erase(iter); - } -} - -//------------------------------------------------------------------------------ -template -inline -void CudaMemoryModel::copy_to_buffer(const T* host_ptr, size_t buf_size, T* device_ptr) -{ - std::map::iterator iter = device_to_host_map.find(device_ptr); - if (iter == device_to_host_map.end()) { - //failed to find device_ptr in device_to_host_map - throw std::runtime_error("CudaMemoryModel::copy_to_buffer ERROR, device_ptr not known."); - } - - CUDA_CALL( cudaMemcpy( device_ptr, host_ptr, sizeof(T)*buf_size, cudaMemcpyHostToDevice) ); -} - -//------------------------------------------------------------------------------ -template -inline -void CudaMemoryModel::copy_from_buffer(T* host_ptr, size_t buf_size, const T* device_ptr) -{ - std::map::iterator iter = 
device_to_host_map.find(device_ptr); - if (iter == device_to_host_map.end()) { - //failed to find device_ptr in device_to_host_map - throw std::runtime_error("CudaMemoryModel::copy_from_buffer ERROR, device_ptr not known."); - } - - CUDA_CALL( cudaMemcpy( host_ptr, device_ptr, sizeof(T)*buf_size, cudaMemcpyDeviceToHost) ); -} - -inline -CudaMemoryModel::~CudaMemoryModel() -{ - std::map::iterator - iter = device_to_host_map.begin(), - iter_end = device_to_host_map.end(); - - for(; iter!=iter_end; ++iter) { - //cast away const so we can free the pointer: - void* dev_ptr = const_cast(iter->first); - CUDA_CALL( cudaFree(dev_ptr) ); - } -} - -#endif - -#endif - diff --git a/kokkos/basic/optional/cuda/CudaNode.cpp b/kokkos/basic/optional/cuda/CudaNode.cpp deleted file mode 100644 index 5ddc580..0000000 --- a/kokkos/basic/optional/cuda/CudaNode.cpp +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include - -// some CUDA rules of thumb employed here (stolen from slides by Mike Bailey, Oregon State) -// -The number of Blocks should be at least twice the number of MPs -// -The number of Threads per Block should be a multiple of 64 -// - 192 or 256 are good numbers for Threads/Block -// We will enforce that numThreads is a power of two (to ease the reduction kernel) -// greater than 64 - -CUDANode::CUDANode(int device, int numBlocks, int numThreads, int verbose) -: numBlocks_(numBlocks) -, numThreads_(numThreads) -, h_blk_mem_(NULL) -, d_blk_mem_(NULL) -, blk_mem_size_(0) -{ - using std::cout; - using std::endl; - using std::runtime_error; - // enforce that numThreads_ is a multiple of 64 - if (numThreads_ != 64 && numThreads_ != 128 && numThreads_ != 256 && numThreads_ != 512 - && numThreads_ != 1 && numThreads_ != 2 && numThreads_ != 4 && numThreads_ != 8 && numThreads_ != 16 - && numThreads_ != 32) { -// throw runtime_error("CUDANode::CUDANode(): number of threads per block must be a power of two in [1,512]."); - } - int deviceCount; cudaGetDeviceCount(&deviceCount); - if (device >= deviceCount) { - if (deviceCount == 0) { -// throw runtime_error("CUDANode::CUDANode(): system has no CUDA devices."); - } - if (verbose) { - cout << "CUDANode::CUDANode(): specified device number not valid. Using device 0." << endl; - } - device = 0; - } - cudaDeviceProp deviceProp; - int deviceAlreadyBeingUsed = -1; - cudaGetDevice(&deviceAlreadyBeingUsed); - if (deviceAlreadyBeingUsed >= 0 && deviceAlreadyBeingUsed < deviceCount) { - device = deviceAlreadyBeingUsed; - } - else { - cudaSetDevice(device); - } - cudaGetDeviceProperties(&deviceProp, device); - // as of CUDA 2.1, device prop contains the following fields - // char name[256]; - // size_t totalGlobalMem, sharedMemPerBlock; - // int regsPerBlock, warpSize; - // size_t memPitch; - // int maxThreadsPerBlock, maxThreadsDim[3], maxGridSize[3]; - // size_t totalConstMem; - // int major, minor; - // int clockRate; - // size_t textureAlignment; - // int deviceOverlap; - // int multiProcessorCount; - // int kernelExecTimeoutEnabled; - if (verbose) { - cout << "CUDANode attached to device #" << device << " \"" << deviceProp.name - << "\", of compute capability " << deviceProp.major << "." 
<< deviceProp.minor - << endl; - } - totalMem_ = deviceProp.totalGlobalMem; - - expand_blk_mem(numBlocks_*8); -} - -void CUDANode::expand_blk_mem(size_t size_in_bytes) -{ - if (blk_mem_size_ >= size_in_bytes) return; - - if (d_blk_mem_ != NULL) { - cutilSafeCallNoSync( cudaFree(d_blk_mem_) ); - delete [] h_blk_mem_; - } - - cutilSafeCallNoSync( cudaMalloc(&d_blk_mem_, size_in_bytes) ); - h_blk_mem_ = new char[size_in_bytes]; - blk_mem_size_ = size_in_bytes; -} - -CUDANode::~CUDANode() -{ - if (d_blk_mem_ != NULL) { - cutilSafeCallNoSync( cudaFree(d_blk_mem_) ); - d_blk_mem_ = NULL; - delete [] h_blk_mem_; - h_blk_mem_ = NULL; - } - blk_mem_size_ = 0; -} - diff --git a/kokkos/basic/optional/cuda/CudaNode.cuh b/kokkos/basic/optional/cuda/CudaNode.cuh deleted file mode 100644 index 9b1b4fb..0000000 --- a/kokkos/basic/optional/cuda/CudaNode.cuh +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef CUDANODE_CUH_ -#define CUDANODE_CUH_ - -#include -#include -#include -#include -#include - -// must define this before including any kernels -#define KERNEL_PREFIX __device__ __host__ - -#include - -#include - -#ifdef CUDANODE_INCLUDE_PARALLEL_FOR -template -__global__ void -Tkern1D(int length, WDP wd, int stride) -{ - unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; - while(i < length) { - wd(i); - i += stride; - } -} - -template -void CUDANode::parallel_for(int length, WDP wd) { - if (length == 0) return; - unsigned int stride = numThreads_ * numBlocks_; - Tkern1D <<< numBlocks_, numThreads_ >>>(length,wd,stride); -} -#endif // parallel_for - -#ifdef CUDANODE_INCLUDE_PARALLEL_REDUCE -template -void call_dot(DotOp& wd) -{ - printf("ERROR, unknown scalar-type, skipping cuda dot-product.\n"); -} -template<> -void call_dot(DotOp& wd) -{ - wd.result = cublasDdot(wd.n, wd.x, 1, wd.y, 1); -} -template<> -void call_dot(DotOp& wd) -{ - wd.result = cublasSdot(wd.n, wd.x, 1, wd.y, 1); -} - -template -void CUDANode::parallel_reduce(int length, WDP& wd) -{ - if (length == 1) { - wd.result = wd.generate(0); - return; - } - - call_dot(wd); -} -#endif // parallel_reduce - -#endif diff --git a/kokkos/basic/optional/cuda/CudaNode.hpp b/kokkos/basic/optional/cuda/CudaNode.hpp deleted file mode 100644 index de078ea..0000000 --- a/kokkos/basic/optional/cuda/CudaNode.hpp +++ /dev/null @@ -1,57 +0,0 @@ -#ifndef CUDANODE_HPP_ -#define CUDANODE_HPP_ - -#include - -// forward declaration -class CUDANode; - -class CUDANode : public CudaMemoryModel { - public: - - CUDANode(int device = 0, int numBlocks = -1, int numThreads = 256, int verbose = 1); - - ~CUDANode(); - - //@{ Computational methods - - template - void parallel_for(int length, WDP wdp); - - template - void parallel_reduce(int length, WDP& wd); - - //@} - - static CUDANode& singleton(int device=0, int numBlocks=-1, int numThreads=256) - { - static CUDANode* cuda_node = NULL; - if (cuda_node == NULL) { - cuda_node = new CUDANode(device, numBlocks, numThreads); - } - return *cuda_node; - } - - private: - //template - //void call_reduce(int length, WDP wd, int threads, int blocks, void * d_blkpart); - // numBlocks_ is - // - the number of blocks launched in a call to parallel_for() - // - not used by parallel_reduce() - int numBlocks_; - // numThreads_ is required to be a power-of-two (our requirement) between 1 and 512 (CUDA's requirement). 
It is: - // - the maximum number of threads used by parallel_reduce() - // - the number of threads per block in a call to parallel_for() - int numThreads_; - // total global device memory, in bytes - int totalMem_; - - void expand_blk_mem(size_t size_in_bytes); - - char* h_blk_mem_; - void* d_blk_mem_; - size_t blk_mem_size_; - -}; - -#endif diff --git a/kokkos/basic/optional/cuda/CudaNodeImpl.hpp b/kokkos/basic/optional/cuda/CudaNodeImpl.hpp deleted file mode 100644 index 4b94562..0000000 --- a/kokkos/basic/optional/cuda/CudaNodeImpl.hpp +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef CUDANODE_IMPL_HPP_ -#define CUDANODE_IMPL_HPP_ - -#include -#include -#include -#include -#include -#include - -// TODO: consider using cudaMallocHost to allocate page-locked host memory -// this speeds up transfer between device and host, and could be very -// useful in the case of Import/Export multivector operations - -#endif diff --git a/kokkos/basic/optional/cuda/Matrix.cu b/kokkos/basic/optional/cuda/Matrix.cu deleted file mode 100644 index 1487f1a..0000000 --- a/kokkos/basic/optional/cuda/Matrix.cu +++ /dev/null @@ -1,22 +0,0 @@ -#define CUDANODE_INCLUDE_PARALLEL_FOR - -// include for CudaNode method implementations -#include - -// includes for all operators for which Matrix needs support -#include -#include -#include - -#include -#include - -// explicit instantiations for Matrix class -#define EXPLICIT_MATRIX_SUPPORT(MATRIX,VECTOR) \ -template void CUDANode::parallel_for >(int , MatvecOp< MATRIX, VECTOR >); - -typedef miniFE::SparseMatrix Matrix_type; -typedef miniFE::Vector Vector_type; - -EXPLICIT_MATRIX_SUPPORT(Matrix_type,Vector_type) - diff --git a/kokkos/basic/optional/cuda/Vector.cu b/kokkos/basic/optional/cuda/Vector.cu deleted file mode 100644 index 9a79955..0000000 --- a/kokkos/basic/optional/cuda/Vector.cu +++ /dev/null @@ -1,19 +0,0 @@ -#define CUDANODE_INCLUDE_PARALLEL_REDUCE -#define CUDANODE_INCLUDE_PARALLEL_FOR - -// include for CudaNode method implementations -#include - -// includes for all operators for which Vector needs support -#include -#include -#include -#include - -// explicit instantiations for Vectors -#define EXPLICIT_VECTOR_SUPPORT(GLOBALORDINAL, SCALAR) \ -template void CUDANode::parallel_for >(int , WaxpbyOp< SCALAR >); \ -template void CUDANode::parallel_reduce< DotOp< SCALAR > >(int , DotOp< SCALAR >& ); \ -template void CUDANode::parallel_for >(int , FEComputeElem< GLOBALORDINAL, SCALAR > ); - -EXPLICIT_VECTOR_SUPPORT(MINIFE_GLOBAL_ORDINAL, MINIFE_SCALAR) diff --git a/kokkos/basic/optional/cuda/cutil_inline_runtime.h b/kokkos/basic/optional/cuda/cutil_inline_runtime.h deleted file mode 100644 index 1f49afb..0000000 --- a/kokkos/basic/optional/cuda/cutil_inline_runtime.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_ -#define _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_ - -#include -#include -#include - -#include - -// We define these calls here, so the user doesn't need to include __FILE__ and __LINE__ -// The advantage is the developers gets to use the inline function so they can debug -#define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__) -#define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__) -#define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__) -#define cutilCheckMsg(msg) __cutilCheckMsg (msg, __FILE__, __LINE__) - -inline void __cudaSafeCallNoSync( cudaError err, const char *file, const int line ) -{ - if( cudaSuccess != err) { - fprintf(stderr, "cudaSafeCallNoSync() Runtime API error in file 
<%s>, line %i : %s.\n", - file, line, cudaGetErrorString( err) ); - exit(-1); - } -} - -inline void __cudaSafeCall( cudaError err, const char *file, const int line ) -{ - if( cudaSuccess != err) { - fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n", - file, line, cudaGetErrorString( err) ); - exit(-1); - } -} - -inline void __cudaSafeThreadSync( const char *file, const int line ) -{ - cudaError err = cudaThreadSynchronize(); - if ( cudaSuccess != err) { - fprintf(stderr, "cudaThreadSynchronize() Driver API error in file '%s' in line %i : %s.\n", - file, line, cudaGetErrorString( err) ); - exit(-1); - } -} - -inline void __cutilCheckMsg( const char *errorMessage, const char *file, const int line ) -{ - cudaError_t err = cudaGetLastError(); - if( cudaSuccess != err) { - fprintf(stderr, "cutilCheckMsg() CUTIL CUDA error: %s in file <%s>, line %i : %s.\n", - errorMessage, file, line, cudaGetErrorString( err) ); - exit(-1); - } -#ifdef _DEBUG - err = cudaThreadSynchronize(); - if( cudaSuccess != err) { - fprintf(stderr, "cutilCheckMsg cudaThreadSynchronize error: %s in file <%s>, line %i : %s.\n", - errorMessage, file, line, cudaGetErrorString( err) ); - exit(-1); - } -#endif -} - -#endif // _CUTIL_INLINE_FUNCTIONS_RUNTIME_H_ diff --git a/kokkos/basic/optional/make_targets b/kokkos/basic/optional/make_targets deleted file mode 100644 index 01ed2c8..0000000 --- a/kokkos/basic/optional/make_targets +++ /dev/null @@ -1,54 +0,0 @@ -#----------------------------------------------------------------------- - -TPI.o : ./optional/ThreadPool/src/TPI.c - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -#----------------------------------------------------------------------- - -CudaNode.o : ./optional/cuda/CudaNode.cpp ./optional/cuda/*.hpp ./optional/cuda/*.h - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -c $< - -CudaVector.o : ./optional/cuda/Vector.cu ./optional/cuda/*.cuh - nvcc $(CUDAFLAGS) $(CPPFLAGS) -c -o $@ $< - -CudaMatrix.o : ./optional/cuda/Matrix.cu ./optional/cuda/*.cuh - nvcc $(CUDAFLAGS) $(CPPFLAGS) -c -o $@ $< - -#----------------------------------------------------------------------- -# Recursive make to create the object files in this directory, -# generate the archive, and then remove the object files. 
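Editor's note: cutilSafeCall, cutilSafeThreadSync and cutilCheckMsg above are the same check-and-abort idiom as CUDA_CALL in CudaCall.hpp: wrap every runtime call, and on failure print the call site together with cudaGetErrorString before exiting. A minimal stand-alone host-side sketch of that idiom (hypothetical macro and buffer names; assumes the CUDA runtime headers and a device are available):

    #include <cstddef>
    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    static void check_cuda(cudaError_t err, const char* what, const char* file, int line)
    {
      if (err != cudaSuccess) {
        std::fprintf(stderr, "%s failed at %s:%d : %s\n",
                     what, file, line, cudaGetErrorString(err));
        std::exit(-1);
      }
    }
    #define CHECK_CUDA(call) check_cuda((call), #call, __FILE__, __LINE__)

    int main()
    {
      const size_t n = 1024;                       /* hypothetical buffer length */
      double* d_buf = NULL;
      CHECK_CUDA( cudaMalloc((void**)&d_buf, n * sizeof(double)) );
      CHECK_CUDA( cudaMemset(d_buf, 0, n * sizeof(double)) );
      CHECK_CUDA( cudaDeviceSynchronize() );       /* surfaces asynchronous errors */
      CHECK_CUDA( cudaFree(d_buf) );
      return 0;
    }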
- -libstk.a : - cd ./optional ; \ - $(MAKE) "CC=$(CC)" "CXX=$(CXX)" "CPPFLAGS=$(CPPFLAGS)" "CFLAGS=$(CFLAGS)" "CXXFLAGS=$(CXXFLAGS)" -f make_targets stk_library - -STK_SOURCE = \ - ./shards/src/*.cpp \ - ./stk_util/util/*.cpp \ - ./stk_util/environment/*.cpp \ - ./stk_util/parallel/*.cpp \ - ./stk_mesh/base/*.cpp \ - ./stk_mesh/baseImpl/*.cpp \ - ./stk_mesh/fem/*.cpp \ - stk_helpers.cpp - -STK_INCLUDES = \ - ./shards/src/*.hpp \ - ./shards/src/*.h \ - ./stk_util/util/*.hpp \ - ./stk_util/environment/*.hpp \ - ./stk_util/parallel/*.hpp \ - ./stk_mesh/base/*.hpp \ - ./stk_mesh/fem/*.hpp - -STK_INC = -I${PWD}/ThreadPool -I${PWD}/shards - -stk_library : $(STK_SOURCE) $(STK_INCLUDES) - $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(STK_INC) -c $(STK_SOURCE) - ar -qc ../libstk.a *.o - ranlib ../libstk.a - rm *.o - -#----------------------------------------------------------------------- - - diff --git a/kokkos/basic/perform_element_loop.hpp b/kokkos/basic/perform_element_loop.hpp deleted file mode 100644 index f65ad4f..0000000 --- a/kokkos/basic/perform_element_loop.hpp +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef _perform_element_loop_hpp_ -#define _perform_element_loop_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include -#include -#include -#include - -namespace miniFE { - -template -void -perform_element_loop(const simple_mesh_description& mesh, - const Box& local_elem_box, - MatrixType& A, VectorType& b, - Parameters& /*params*/) -{ - typedef typename MatrixType::ScalarType Scalar; - - int global_elems_x = mesh.global_box[0][1]; - int global_elems_y = mesh.global_box[1][1]; - int global_elems_z = mesh.global_box[2][1]; - - //We will iterate the local-element-box (local portion of the mesh), and - //get element-IDs in preparation for later assembling the FE operators - //into the global sparse linear-system. 
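Editor's note: the comment above describes the first step of the serial element loop, which the code continuing below performs with BoxIterator and get_id: walk the local element box and turn each (x,y,z) triple into a global element ID. For orientation, an equivalent plain triple-loop sketch follows; it assumes the usual x-fastest (row-major) numbering of an nx-by-ny-by-nz element grid, whereas the real ordering is whatever miniFE's get_id defines:

    #include <cstdint>
    #include <vector>

    // Hypothetical stand-in for the BoxIterator/get_id pair: enumerate global
    // element ids of a local sub-box "box" inside an nx x ny x nz element grid,
    // assuming ids are laid out x-fastest, then y, then z.
    std::vector<std::int64_t> enumerate_ids(int nx, int ny, const int box[3][2])
    {
      std::vector<std::int64_t> ids;
      for (int z = box[2][0]; z < box[2][1]; ++z)
        for (int y = box[1][0]; y < box[1][1]; ++y)
          for (int x = box[0][0]; x < box[0][1]; ++x)
            ids.push_back((std::int64_t(z) * ny + y) * nx + x);
      return ids;
    }

    int main()
    {
      const int box[3][2] = { {0, 2}, {0, 2}, {0, 1} };   // 2x2x1 local patch
      std::vector<std::int64_t> ids = enumerate_ids(4, 4, box);
      return ids.size() == 4 ? 0 : 1;                     // 4 elements enumerated
    }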
- - GlobalOrdinal num_elems = get_num_ids(local_elem_box); - std::vector elemIDs(num_elems); - - BoxIterator iter = BoxIterator::begin(local_elem_box); - BoxIterator end = BoxIterator::end(local_elem_box); - - for(size_t i=0; iter != end; ++iter, ++i) { - elemIDs[i] = get_id(global_elems_x, global_elems_y, global_elems_z, - iter.x, iter.y, iter.z); -//#ifdef MINIFE_DEBUG -//std::cout << "elem ID " << elemIDs[i] << " ("< elem_data; - - compute_gradient_values(elem_data.grad_vals); - - timer_type t_gn = 0, t_ce = 0, t_si = 0; - timer_type t0 = 0; - for(size_t i=0; i -#include -#include -#include -#include -#include -#include - -#include - -namespace miniFE { - -//--------------------------------------------------------------------- -//This file contains three 'filter' classes, and a 'perform_element_loop' -//function that uses those filter classes to run a TBB pipeline. -// -//The filter classes are as follows: -//1. GetElemNodesCoords -// For each element in the mesh, create an elem-data object with coords -// and node-ids. -//2. Compute_FE_Operators -// Given an elem-data object (with coords and node-ids), compute the -// diffusion-matrix and source-vector. -//3. LockingSumIntoLinearSystem -// Given an elem-data object (with diffusion-matrix and source-vector), -// assemble into global-sparse linear-system. Uses a lock on each -// matrix row to ensure that multiple threads don't update the same row -// at the same time. -//... or: -//3. SumIntoLinearSystem -// Given an elem-data object (with diffusion-matrix and source-vector), -// assemble into global-sparse linear-system. -// There are several of these filters, usually 1 per thread, and each -// will be responsible for a certain slice of equations. It will check -// the elem-data for equations that are in its slice, assemble those, and -// pass the elem-data on so that the next SumIntoLinearSystem filter can -// deal with equations in a different 'slice'. -// -//--------------------------------------------------------------------- - -//--------------------------------------------------------------------- - -/** Filter 1.: GetElemNodesCoords - */ -template -class GetElemNodesCoords : public tbb::filter { -public: - GetElemNodesCoords(const std::vector& elemIDs, - const simple_mesh_description& mesh, - size_t num_elems_at_a_time) - : tbb::filter(/*is_serial=*/true), - elemIDs_(elemIDs), - i_(0), - mesh_(mesh), - num_elems_(num_elems_at_a_time) - { - if (num_elems_ < 1) num_elems_ = 1; - } - - ~GetElemNodesCoords(){} - -private: - /** This operator launches an elem-data object for a 'group' (size num_elems_) - * of elements. When all elements have been launched, return NULL to signal - * that we're done issuing data. 
- */ - void* operator()(void* item) { - if (i_ >= elemIDs_.size()) return NULL; - - size_t num = num_elems_; - if (i_+num > elemIDs_.size()) num = elemIDs_.size() - i_; - - std::vector >* elemdata_vec = new std::vector >(num); - - size_t i=0; - while (i_ < elemIDs_.size() && i < num) { - get_elem_nodes_and_coords(mesh_, elemIDs_[i_], (*elemdata_vec)[i]); - ++i_; - ++i; - } - - return elemdata_vec; - } - - const std::vector& elemIDs_; - size_t i_; - const simple_mesh_description& mesh_; - size_t num_elems_; -}; - -//--------------------------------------------------------------------- - -/** Filter 2.: Compute_FE_Operators - */ -template -class Compute_FE_Operators : public tbb::filter { -public: - Compute_FE_Operators() : tbb::filter(/*is_serial=*/false) {} - ~Compute_FE_Operators() {} - -private: - /** This operator takes a vector of elem-data objects which are assumed - * to have nodal-coordinates already populated, and computes the - * element-diffusion-matrix and element-source-vector for each. - */ - void* operator()(void* item) { - if (item == NULL) return NULL; - std::vector >* elemdata = static_cast >*>(item); - - for(size_t i=0; isize(); ++i) { - compute_element_matrix_and_vector((*elemdata)[i]); - } - return elemdata; - } -}; - -//--------------------------------------------------------------------- - -/** Filter 3.: SumIntoLinearSystem - */ -template -class SumIntoLinearSystem : public tbb::filter { - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - -public: - SumIntoLinearSystem(GlobalOrdinal myFirstRow, - GlobalOrdinal myLastRow, - MatrixType& mat, VectorType& vec) - : tbb::filter(/*is_serial=*/true), - A_(mat), b_(vec), - myFirstRow_(myFirstRow), - myLastRow_(myLastRow) - { - } - - ~SumIntoLinearSystem() {} - -private: - /** This operator takes a vector of elem-data objects which have an - * element-diffusion-matrix and source-vector, looks through it for - * any rows in this filter's slice of the global matrix, assembles - * those rows into the linear-system, then passes the elem-data object - * on for use by the next assembly filter. - * If this assembly filter is responsible for the last slice of the - * row-space, then this is the last filter and so we delete the - * elem-data object. 
- */ - void* operator()(void* item) { - if (item == NULL) return NULL; - std::vector >* elemdata_vec = static_cast >*>(item); - - for(size_t e=0; esize(); ++e) { - ElemData& elemdata = (*elemdata_vec)[e]; - size_t nnodes = elemdata.nodes_per_elem; - for(size_t i=0; i myLastRow_) continue; - - sum_into_row(row, nnodes, elemdata.elem_node_ids, - &(elemdata.elem_diffusion_matrix[i*nnodes]), A_); - sum_into_vector(1, &row, &(elemdata.elem_source_vector[i]), b_); - } - } - - if (myLastRow_ >= A_.rows.size()) { - delete elemdata_vec; - return NULL; - } - - return elemdata_vec; - } - - MatrixType& A_; - VectorType& b_; - GlobalOrdinal myFirstRow_; - GlobalOrdinal myLastRow_; -}; - -//--------------------------------------------------------------------- - -static tbb::atomic matrix_suminto; - -/** Filter 3.: SumIntoLinearSystem with locking - */ -template -class LockingSumIntoLinearSystem : public tbb::filter { - typedef typename MatrixType::GlobalOrdinalType GlobalOrdinal; - typedef typename MatrixType::ScalarType Scalar; - -public: - LockingSumIntoLinearSystem(MatrixType& mat, VectorType& vec) - : tbb::filter(/*is_serial=*/false), - A_(mat), b_(vec) - { - } - - ~LockingSumIntoLinearSystem() {} - -private: - /** This operator takes a vector of elem-data objects which have an - * element-diffusion-matrix and source-vector, and assembles into - * the linear-system, using locking to make sure no other - * thread is assembling the same global row at the same time. - */ - void* operator()(void* item) { - if (item == NULL) return NULL; - std::vector >* elemdata_vec = static_cast >*>(item); - - for(size_t e=0; esize(); ++e) { - ElemData& elemdata = (*elemdata_vec)[e]; - size_t nnodes = elemdata.nodes_per_elem; - size_t offset = 0; - for(size_t i=0; i A_; - LockingVector b_; -}; - -//--------------------------------------------------------------------- - -template -void -perform_element_loop(const simple_mesh_description& mesh, - const Box& local_elem_box, - MatrixType& A, VectorType& b, - Parameters& params) -{ - typedef typename MatrixType::ScalarType Scalar; - - if (A.rows.size() == 0) return; - - int num_threads = params.numthreads; - - //We will iterate the local-element-box (local portion of the mesh), and - //assemble the FE operators into the global sparse linear-system. - - tbb::pipeline pipe; - - int global_elems_x = mesh.global_box[0][1]; - int global_elems_y = mesh.global_box[1][1]; - int global_elems_z = mesh.global_box[2][1]; - - GlobalOrdinal num_elems = get_num_ids(local_elem_box); - std::vector elemIDs(num_elems); - - BoxIterator iter = BoxIterator::begin(local_elem_box); - BoxIterator end = BoxIterator::end(local_elem_box); - - for(size_t i=0; iter != end; ++iter, ++i) { - elemIDs[i] = get_id(global_elems_x, global_elems_y, global_elems_z, - iter.x, iter.y, -iter.z); - } - - //Create the first stage of the pipeline, the filter that will - //launch elem-data from the mesh, through the pipeline. - GetElemNodesCoords get_nodes_coords(elemIDs, mesh, params.elem_group_size); - - //Create the second stage of the pipeline, the parallel filter that will - //compute element-matrices and element-vectors. 
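Editor's note: the TBB variant of perform_element_loop assembles these filters with the classic tbb::pipeline interface from tbb/pipeline.h: a serial input filter, one or more parallel transform filters, then the assembly filter(s), driven by pipe.run(). For readers new to that interface, here is a minimal self-contained sketch of the same producer / parallel-transform / consumer shape (hypothetical filter names; newer oneTBB releases drop this class in favour of tbb::parallel_pipeline):

    #include <iostream>
    #include <tbb/pipeline.h>

    // Serial input filter: emits the integers 0..n-1, one heap-allocated token
    // per pass, and returns NULL when done (the pipeline's stop signal).
    class Producer : public tbb::filter {
    public:
      explicit Producer(int n) : tbb::filter(/*is_serial=*/true), i_(0), n_(n) {}
    private:
      void* operator()(void*) { return (i_ < n_) ? new int(i_++) : NULL; }
      int i_, n_;
    };

    // Parallel transform filter: squares each token in place.
    class Square : public tbb::filter {
    public:
      Square() : tbb::filter(/*is_serial=*/false) {}
    private:
      void* operator()(void* item) {
        int* p = static_cast<int*>(item);
        *p = (*p) * (*p);
        return p;
      }
    };

    // Serial output filter: consumes and frees each token.
    class Sink : public tbb::filter {
    public:
      Sink() : tbb::filter(/*is_serial=*/true), sum_(0) {}
      long sum() const { return sum_; }
    private:
      void* operator()(void* item) {
        int* p = static_cast<int*>(item);
        sum_ += *p;
        delete p;
        return NULL;
      }
      long sum_;
    };

    int main()
    {
      Producer in(10);
      Square sq;
      Sink out;
      tbb::pipeline pipe;
      pipe.add_filter(in);
      pipe.add_filter(sq);
      pipe.add_filter(out);
      pipe.run(/*max_live_tokens=*/4);
      pipe.clear();
      std::cout << "sum of squares 0..9 = " << out.sum() << "\n";   // 285
      return 0;
    }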
- Compute_FE_Operators fe_ops; - - //Add the filters to the pipeline: - pipe.add_filter(get_nodes_coords); - pipe.add_filter(fe_ops); - - LockingSumIntoLinearSystem* sum_into_linsys = NULL; - size_t num_assembly_filters = 0; - std::vector*> linsys; - - bool use_locking = params.use_locking==1; - if (use_locking) { - sum_into_linsys = new LockingSumIntoLinearSystem(A, b); - pipe.add_filter(*sum_into_linsys); - } - else { - //If not using locking, create several assembly filters, each of which - //will be responsible for assembling rows into a certain slice of the - //global matrix. - - num_assembly_filters = num_threads/3; - if (num_assembly_filters == 0) num_assembly_filters = 1; - num_assembly_filters = 2; - - size_t num_rows = A.rows.size(); - size_t rows_per_thread = num_rows/num_assembly_filters; - if (num_rows % num_assembly_filters > 0) ++rows_per_thread; - size_t first_row = A.rows[0]; - for(int i=0; i * sum_into = new SumIntoLinearSystem(first_row, last_row, A, b); - linsys.push_back(sum_into); - pipe.add_filter(*sum_into); - - first_row += rows_per_thread; - } - } - - //Running the pipeline carries out the element-loop and assembly. - pipe.run(num_threads); - - pipe.clear(); - - if (use_locking) { - std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"< -#include -#include -#include -#include -#include -#include -#include - -namespace miniFE { - -//--------------------------------------------------------------------- - -template -struct FEAssembleSumInto { - const simple_mesh_description* mesh; - GlobalOrdinal* elemIDs; - LockingMatrix* A; - LockingVector* b; - -inline void operator()(int i) -{ - ElemData elem_data; - GlobalOrdinal elemID = elemIDs[i]; - get_elem_nodes_and_coords(*mesh, elemID, elem_data.elem_node_ids, - elem_data.elem_node_coords); - compute_element_matrix_and_vector(elem_data); - sum_into_global_linear_system(elem_data, *A, *b); -} -}; - -template -void -perform_element_loop(const simple_mesh_description& mesh, - const Box& local_elem_box, - MatrixType& A, VectorType& b, - Parameters& params) -{ - typedef typename MatrixType::ScalarType Scalar; - - if (A.rows.size() == 0) return; - - int num_threads = params.numthreads; - - timer_type t0 = mytimer(); - - //We will iterate the local-element-box (local portion of the mesh), and - //assemble the FE operators into the global sparse linear-system. 
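Editor's note: FEAssembleSumInto above shows the compute-node programming model used by this variant. The per-element work is packaged as a plain struct whose operator()(int i) does everything for element i, and the struct is handed to compute_node.parallel_for(...), which may run it serially, on TPI/TBB threads, or on a GPU. A minimal sketch of that pattern with a serial stand-in node (names are illustrative, not miniFE's):

    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for a miniFE "compute node": anything that can apply
    // a functor's operator()(int i) over a range. The serial version is a loop;
    // threaded or CUDA nodes dispatch the same functor differently.
    struct MiniSerialNode {
      template <typename WDP>
      void parallel_for(int length, WDP wdp) const {
        for (int i = 0; i < length; ++i) wdp(i);
      }
    };

    // Work functor in the same shape as FEAssembleSumInto: plain data members
    // plus operator()(int i) doing the per-item work.
    struct ScaleOp {
      double* x;
      double alpha;
      void operator()(int i) const { x[i] *= alpha; }
    };

    int main()
    {
      std::vector<double> x(8, 1.0);
      ScaleOp op; op.x = &x[0]; op.alpha = 3.0;
      MiniSerialNode node;
      node.parallel_for((int)x.size(), op);
      std::cout << "x[0] = " << x[0] << "\n";   // 3
      return 0;
    }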
- - int global_elems_x = mesh.global_box[0][1]; - int global_elems_y = mesh.global_box[1][1]; - int global_elems_z = mesh.global_box[2][1]; - - GlobalOrdinal num_elems = get_num_ids(local_elem_box); - std::vector elemIDs(num_elems); - - BoxIterator iter = BoxIterator::begin(local_elem_box); - BoxIterator end = BoxIterator::end(local_elem_box); - - for(size_t i=0; iter != end; ++iter, ++i) { - elemIDs[i] = get_id(global_elems_x, global_elems_y, global_elems_z, - iter.x, iter.y, iter.z); - } - - LockingMatrix lockingA(A); - LockingVector lockingb(b); - - FEAssembleSumInto fe_op; - fe_op.mesh = &mesh; - fe_op.elemIDs = &elemIDs[0]; - fe_op.A = &lockingA; - fe_op.b = &lockingb; - - typedef typename VectorType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& compute_node = b.compute_node; - - compute_node.parallel_for(elemIDs.size(), fe_op); - - std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"< -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace miniFE { - -//--------------------------------------------------------------------- - -template -void -perform_element_loop(const simple_mesh_description& mesh, - const Box& local_elem_box, - MatrixType& A, VectorType& b, - Parameters& params) -{ - typedef typename MatrixType::ScalarType Scalar; - - if (A.rows.size() == 0) return; - - int num_threads = params.numthreads; - - timer_type t0 = mytimer(); - - //We will iterate the local-element-box (local portion of the mesh), and - //assemble the FE operators into the global sparse linear-system. - - int global_elems_x = mesh.global_box[0][1]; - int global_elems_y = mesh.global_box[1][1]; - int global_elems_z = mesh.global_box[2][1]; - - GlobalOrdinal num_elems = get_num_ids(local_elem_box); - std::vector elemIDs(num_elems); - - BoxIterator iter = BoxIterator::begin(local_elem_box); - BoxIterator end = BoxIterator::end(local_elem_box); - - for(size_t i=0; iter != end; ++iter, ++i) { - elemIDs[i] = get_id(global_elems_x, global_elems_y, global_elems_z, - iter.x, iter.y, iter.z); - } - - std::vector node_ordinals(num_elems*Hex8::numNodesPerElem); - std::vector node_coords(num_elems*Hex8::numNodesPerElem*Hex8::spatialDim); - std::vector elem_matrices(num_elems*Hex8::numNodesPerElem*Hex8::numNodesPerElem); - std::vector elem_vectors(num_elems*Hex8::numNodesPerElem); - - LockingMatrix lockingA(A); - LockingVector lockingb(b); - - GetNodesCoords get_nodes_coords; - get_nodes_coords.elemIDs = &elemIDs[0]; - get_nodes_coords.mesh = &mesh; - get_nodes_coords.node_ordinals = &node_ordinals[0]; - get_nodes_coords.elem_node_coords = &node_coords[0]; - - typedef typename VectorType::ComputeNodeType ComputeNodeType; - - ComputeNodeType& compute_node = b.compute_node; - - compute_node.parallel_for(elemIDs.size(), get_nodes_coords); - - timer_type t_gn = mytimer() - t0; - t0 = mytimer(); - -#ifdef MINIFE_HAVE_CUDA - CUDANode& elem_compute_node = CUDANode::singleton(); -#else - ComputeNodeType& elem_compute_node = compute_node; -#endif - timer_type t_ccn = mytimer() - t0; - t0 = mytimer(); - - Scalar* d_node_coords = elem_compute_node.get_buffer(&node_coords[0], node_coords.size()); - Scalar* d_elem_matrices = elem_compute_node.get_buffer(&elem_matrices[0], elem_matrices.size()); - Scalar* d_elem_vectors = elem_compute_node.get_buffer(&elem_vectors[0], elem_vectors.size()); - - elem_compute_node.copy_to_buffer(&node_coords[0], node_coords.size(), d_node_coords); - - FEComputeElem fe_compute_elem; - 
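Editor's note: this variant stages the whole local assembly through flat per-element arrays (Hex8 has 8 nodes per element): node ordinals, node coordinates, element diffusion matrices and element source vectors, so that a single parallel_for can fill them, possibly on the GPU, before SumInLinSys scatters them into the global system. A small indexing sketch for those flat buffers (illustrative names; the per-element matrix is assumed row-major, consistent with the i*nnodes offsets used by the assembly filters above):

    #include <cstddef>
    #include <vector>

    int main()
    {
      // Illustrative sizes: Hex8 elements have 8 nodes; 3 coordinates per node.
      const std::size_t nn = 8, dim = 3, num_elems = 10;

      std::vector<double> node_coords(num_elems * nn * dim);    // gathered coordinates
      std::vector<double> elem_matrices(num_elems * nn * nn);   // one nn x nn block per element
      std::vector<double> elem_vectors(num_elems * nn);         // one length-nn vector per element

      // Element e owns the contiguous slices below; the kernel filling them and
      // the filter summing them into the global system agree on these offsets.
      const std::size_t e = 3;
      double* coords_e = &node_coords[e * nn * dim];
      double* Ke       = &elem_matrices[e * nn * nn];   // Ke[i*nn + j] = entry (i,j)
      double* fe       = &elem_vectors[e * nn];

      coords_e[0] = 0.0;       // x-coordinate of the element's first node
      Ke[0 * nn + 1] = -1.0;   // coupling of local node 0 with local node 1
      fe[0] = 0.5;             // source contribution at local node 0
      return 0;
    }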
fe_compute_elem.elem_node_coords = &d_node_coords[0]; - fe_compute_elem.elem_diffusion_matrix = &d_elem_matrices[0]; - fe_compute_elem.elem_source_vector = &d_elem_vectors[0]; - - elem_compute_node.parallel_for(elemIDs.size(), fe_compute_elem); - - elem_compute_node.copy_from_buffer(&elem_matrices[0], elem_matrices.size(), d_elem_matrices); - elem_compute_node.copy_from_buffer(&elem_vectors[0], elem_vectors.size(), d_elem_vectors); - - timer_type t_ce = mytimer() - t0; - - t0 = mytimer(); - SumInLinSys sum_in; - sum_in.node_ordinals = &node_ordinals[0]; - sum_in.elem_diffusion_matrix = &elem_matrices[0]; - sum_in.elem_source_vector = &elem_vectors[0]; - sum_in.A = &lockingA; - sum_in.b = &lockingb; - - compute_node.parallel_for(elemIDs.size(), sum_in); - - timer_type t_si = mytimer() - t0; - std::cout << "time to get nodes/coords: " << t_gn << std::endl; - std::cout << "time to create compute-node: " << t_ccn << ", time to compute elements: " << t_ce << std::endl; - std::cout << "time to sum into linsys: " << t_si << std::endl; - std::cout << "\n{number of matrix conflicts: " << miniFE_num_matrix_conflicts << "}"< " -exit 1 -fi - -np=$1 -nx=$2 -ny=$3 -nz=$4 - -echo " " -echo "running miniFE test for np=${np}, nx=${nx} ny=${ny} nz=${nz}..." - -if [ ! -x miniFE.x ]; then -echo "miniFE.x doesn't exist or isn't executable. Aborting." -exit -1 -fi - -mpirun -np ${np} miniFE.x nx=${nx} ny=${ny} nz=${nz} >& miniFE_run.out -rm miniFE_run.out - -if [ ! -f A.mtx.${np}.0 ]; then -echo "matrix file A.mtx.${np}.0 doesn't exist. build miniFE with -DMINIFE_DEBUG." -fi - -p=0 -while [ $p -lt ${np} ]; do -diff A.mtx.${np}.$p gold_files/1x1x2_A.mtx.${np}.$p >& diff.A.$p.txt -diff b.vec.${np}.$p gold_files/1x1x2_b.vec.${np}.$p >& diff.b.$p.txt -diff x.vec.${np}.$p gold_files/1x1x2_x.vec.${np}.$p >& diff.x.$p.txt - -test_result="passed" -if [ -s diff.A.$p.txt ]; then -echo "TEST FAILED: see diff.A.${p}.txt" -test_result="failed" -fi - -if [ -s diff.b.$p.txt ]; then -echo "TEST FAILED: see diff.b.${p}.txt" -test_result="failed" -fi - -if [ -s diff.x.$p.txt ]; then -echo "TEST FAILED: see diff.x.${p}.txt" -test_result="failed" -fi - -if [ $test_result != "passed" ]; then -echo "test failed" -exit 1 -fi - -let p=p+1 -rm diff.*.txt -done - -echo "tests passed" - diff --git a/kokkos/basic/run_tests b/kokkos/basic/run_tests deleted file mode 100755 index 5e03399..0000000 --- a/kokkos/basic/run_tests +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -echo " " -echo "running miniFE tests..." - -if [ ! -x miniFE.x ]; then -echo "miniFE.x doesn't exist or isn't executable. Aborting." -exit -1 -fi - -./run_one_test 1 1 1 2 -if [ $? != 0 ]; then -echo "test failed" -exit $? -fi - -./run_one_test 2 1 1 2 -if [ $? != 0 ]; then -echo "test failed" -exit $? -fi - diff --git a/kokkos/basic/sharedmem.cuh b/kokkos/basic/sharedmem.cuh deleted file mode 100644 index b13c4f2..0000000 --- a/kokkos/basic/sharedmem.cuh +++ /dev/null @@ -1,153 +0,0 @@ -/* -* Copyright 1993-2006 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO USER: -* -* This source code is subject to NVIDIA ownership rights under U.S. and -* international Copyright laws. -* -* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
-* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -* OR PERFORMANCE OF THIS SOURCE CODE. -* -* U.S. Government End Users. This source code is a "commercial item" as -* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -* "commercial computer software" and "commercial computer software -* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -* and is provided to the U.S. Government only as a commercial end item. -* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -* source code with only those rights set forth herein. -*/ - -#ifndef _SHAREDMEM_H_ -#define _SHAREDMEM_H_ - -//**************************************************************************** -// Because dynamically sized shared memory arrays are declared "extern", -// we can't templatize them directly. To get around this, we declare a -// simple wrapper struct that will declare the extern array with a different -// name depending on the type. This avoids compiler errors about duplicate -// definitions. -// -// To use dynamically allocated shared memory in a templatized __global__ or -// __device__ function, just replace code like this: -// -// -// template -// __global__ void -// foo( T* g_idata, T* g_odata) -// { -// // Shared mem size is determined by the host app at run time -// extern __shared__ T sdata[]; -// ... -// doStuff(sdata); -// ... -// } -// -// With this -// template -// __global__ void -// foo( T* g_idata, T* g_odata) -// { -// // Shared mem size is determined by the host app at run time -// SharedMemory smem; -// T* sdata = smem.getPointer(); -// ... -// doStuff(sdata); -// ... -// } -//**************************************************************************** - -// This is the un-specialized struct. Note that we prevent instantiation of this -// struct by putting an undefined symbol in the function body so it won't compile. -template -struct SharedMemory -{ - // Ensure that we won't compile any un-specialized types - __device__ T* getPointer() { - extern __device__ void error(void); - error(); - return NULL; - } -}; - -// Following are the specializations for the following types. -// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double -// One could also specialize it for user-defined types. 
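Editor's note: the wrapper's own comment shows the intended substitution; below is a complete, hypothetical CUDA C++ example of it in use, a per-block sum reduction that obtains its dynamically sized shared-memory buffer through SharedMemory<T> (the double specialization appears just below). It assumes sharedmem.cuh is on the include path, a power-of-two block size, and a launch with blockDim.x*sizeof(T) bytes of dynamic shared memory.

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include "sharedmem.cuh"

    // Block-wise sum reduction using SharedMemory<T> instead of a raw
    // "extern __shared__" declaration, so one kernel works for any specialized T.
    template <typename T>
    __global__ void block_sum(const T* in, T* block_out, unsigned int n)
    {
      SharedMemory<T> smem;
      T* sdata = smem.getPointer();

      const unsigned int tid = threadIdx.x;
      const unsigned int i   = blockIdx.x * blockDim.x + threadIdx.x;
      sdata[tid] = (i < n) ? in[i] : T(0);
      __syncthreads();

      for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {   // tree reduction
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
      }
      if (tid == 0) block_out[blockIdx.x] = sdata[0];
    }

    int main()
    {
      const unsigned int n = 256, threads = 128, blocks = (n + threads - 1) / threads;
      std::vector<double> h_in(n, 1.0), h_out(blocks, 0.0);

      double *d_in = NULL, *d_out = NULL;
      cudaMalloc((void**)&d_in,  n * sizeof(double));
      cudaMalloc((void**)&d_out, blocks * sizeof(double));
      cudaMemcpy(d_in, &h_in[0], n * sizeof(double), cudaMemcpyHostToDevice);

      block_sum<double><<<blocks, threads, threads * sizeof(double)>>>(d_in, d_out, n);
      cudaMemcpy(&h_out[0], d_out, blocks * sizeof(double), cudaMemcpyDeviceToHost);

      std::printf("block sums: %g %g\n", h_out[0], h_out[1]);   // 128 and 128
      cudaFree(d_in); cudaFree(d_out);
      return 0;
    }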
- -template <> -struct SharedMemory -{ - __device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; } -}; - -template <> -struct SharedMemory -{ - __device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; } -}; - -template <> -struct SharedMemory -{ - __device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; } -}; - -template <> -struct SharedMemory -{ - __device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; } -}; - -template <> -struct SharedMemory -{ - __device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; } -}; - -template <> -struct SharedMemory -{ - __device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; } -}; - -template <> -struct SharedMemory -{ - __device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; } -}; - - -#endif //_SHAREDMEM_H_ diff --git a/kokkos/basic/simple_mesh_description.hpp b/kokkos/basic/simple_mesh_description.hpp deleted file mode 100644 index 717dc6c..0000000 --- a/kokkos/basic/simple_mesh_description.hpp +++ /dev/null @@ -1,239 +0,0 @@ - -#ifndef _simple_mesh_description_hpp_ -#define _simple_mesh_description_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include - -namespace miniFE { - -template -class simple_mesh_description { -public: - simple_mesh_description(const Box& global_box_in, const Box& local_box_in) - { - Box local_node_box; - for(int i=0; i<3; ++i) { - global_box[i][0] = global_box_in[i][0]; - global_box[i][1] = global_box_in[i][1]; - local_box[i][0] = local_box_in[i][0]; - local_box[i][1] = local_box_in[i][1]; - local_node_box[i][0] = local_box_in[i][0]; - local_node_box[i][1] = local_box_in[i][1]; - //num-owned-nodes == num-elems+1 in this dimension if the elem box is not empty - //and we are at the high end of the global range in that dimension: - if (local_box_in[i][1] > local_box_in[i][0] && local_box_in[i][1] == global_box[i][1]) local_node_box[i][1] += 1; - } - - int max_node_x = global_box[0][1]+1; - int max_node_y = global_box[1][1]+1; - int max_node_z = global_box[2][1]+1; - create_map_id_to_row(max_node_x, max_node_y, max_node_z, local_node_box, - map_ids_to_rows); - - //As described in analytic_soln.hpp, - //we will impose a 0 boundary-condition on faces x=0, y=0, z=0, y=1, z=1 - //we will impose a 1 boundary-condition on face x=1 - -#ifdef MINIFE_DEBUG -std::cout< 0) --miny; - if (local_node_box[Z][0] > 0) --minz; - if (local_node_box[Y][1] < max_node_y) ++maxy; - if (local_node_box[Z][1] < max_node_z) ++maxz; - - for(int iz=minz; iz(max_node_x, max_node_y, max_node_z, - 0, iy, iz); -#ifdef MINIFE_DEBUG -std::cout<<"x=0 BC, node "< 0) --minx; - if (local_node_box[Z][0] > 0) --minz; - if (local_node_box[X][1] < max_node_x) ++maxx; - if (local_node_box[Z][1] < max_node_z) ++maxz; - - for(int iz=minz; iz(max_node_x, max_node_y, max_node_z, - ix, 0, iz); -#ifdef MINIFE_DEBUG -std::cout<<"y=0 BC, node "< 0) --minx; - if (local_node_box[Y][0] > 0) --miny; - if (local_node_box[X][1] < max_node_x) ++maxx; - if (local_node_box[Y][1] < max_node_y) ++maxy; - - for(int iy=miny; iy(max_node_x, max_node_y, max_node_z, - ix, iy, 0); -#ifdef MINIFE_DEBUG -std::cout<<"z=0 BC, node "< 0) --minz; - if (local_node_box[Y][0] > 0) --miny; - if (local_node_box[Z][1] < max_node_z) ++maxz; - if (local_node_box[Y][1] < max_node_y) ++maxy; - - for(int iy=miny; iy(max_node_x, max_node_y, max_node_z, - x1, iy, iz); - int row = map_id_to_row(nodeID); -#ifdef MINIFE_DEBUG -std::cout<<"x=1 BC, node "< 0) --minz; - if (local_node_box[X][0] > 0) --minx; - if (local_node_box[Z][1] < max_node_z) ++maxz; - if (local_node_box[X][1] < max_node_x) ++maxx; - - for(int ix=minx; ix(max_node_x, max_node_y, max_node_z, - ix, y1, iz); -#ifdef MINIFE_DEBUG -std::cout<<"y=1 BC, node "< 0) --miny; - if (local_node_box[X][0] > 0) --minx; - if (local_node_box[Y][1] < max_node_y) ++maxy; - if (local_node_box[X][1] < max_node_x) ++maxx; - - for(int ix=minx; ix(max_node_x, max_node_y, max_node_z, - ix, iy, z1); -#ifdef MINIFE_DEBUG -std::cout<<"z=1 BC, node "< bc_rows_0; - std::set bc_rows_1; - std::map map_ids_to_rows; - Box global_box; - Box local_box; -};//class simple_mesh_description - -}//namespace miniFE - -#endif diff --git a/kokkos/basic/time_kernels.hpp b/kokkos/basic/time_kernels.hpp deleted file mode 100644 index b14f743..0000000 --- a/kokkos/basic/time_kernels.hpp +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef _time_kernels_hpp_ -#define _time_kernels_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// 
Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include - -#include -#include - -#ifdef MINIFE_HAVE_CUDA -#include -#endif - -namespace miniFE { - -template -void -time_kernels(OperatorType& A, - const VectorType& b, - VectorType& x, - Matvec matvec, - typename OperatorType::LocalOrdinalType max_iter, - typename OperatorType::ScalarType& xdotp, - timer_type* my_kern_times) -{ - typedef typename OperatorType::ScalarType ScalarType; - typedef typename OperatorType::LocalOrdinalType OrdinalType; - typedef typename TypeTraits::magnitude_type magnitude_type; - - timer_type t0 = 0, tWAXPY = 0, tDOT = 0, tMATVEC = 0; - - int myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (!A.has_local_indices) { - std::cerr << "miniFE::time_kernels ERROR, A.has_local_indices is false, needs to be true. This probably means " - << "miniFE::make_local_matrix(A) was not called prior to calling miniFE::time_kernels." - << std::endl; - return; - } - - OrdinalType nrows = A.rows.size(); - OrdinalType ncols = A.num_cols; - - VectorType p(0, ncols, b.compute_node); - - ScalarType one = 1.0; - ScalarType zero = 0.0; - - typedef typename VectorType::ComputeNodeType ComputeNodeType; - ComputeNodeType& compute_node = x.compute_node; - - //The following lines that create and initialize buffers are no-ops in many - //cases, but perform actual allocations and copies if a off-cpu device such as - //a GPU is being used by compute_node. 
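Editor's note: the comment above is the key to reading the buffer code that follows. On a host-only compute node the get_buffer/copy_to_buffer/copy_from_buffer calls can collapse to pointer pass-through and no-ops, while a CUDA node performs real device allocations and transfers, so time_kernels is written once and runs unchanged in both cases. A hypothetical serial stand-in making that concrete (not miniFE's actual SerialComputeNode):

    #include <cstddef>

    // On a CPU-only node, "device" buffers are just the host buffers, so the
    // copy calls do nothing; the calling code stays identical to the GPU path.
    struct SerialBufferModel {
      template <typename T>
      T* get_buffer(const T* host_ptr, std::size_t /*len*/) {
        return const_cast<T*>(host_ptr);    // device buffer == host buffer
      }
      template <typename T>
      void copy_to_buffer(const T* /*host*/, std::size_t /*len*/, T* /*dev*/) {}
      template <typename T>
      void copy_from_buffer(T* /*host*/, std::size_t /*len*/, const T* /*dev*/) {}
    };

    int main()
    {
      double x[4] = {1.0, 2.0, 3.0, 4.0};
      SerialBufferModel node;
      double* d_x = node.get_buffer(x, 4);
      node.copy_to_buffer(x, 4, d_x);   // no-op here; a real H2D copy on a GPU node
      return d_x == x ? 0 : 1;
    }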
- - //Do any required allocations for buffers that will be needed during CG: - ScalarType* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - ScalarType* d_p = compute_node.get_buffer(&p.coefs[0], p.coefs.size()); - ScalarType* d_b = compute_node.get_buffer(&b.coefs[0], b.coefs.size()); - OrdinalType* d_Arowoff = compute_node.get_buffer(&A.row_offsets[0], A.row_offsets.size()); - OrdinalType* d_Acols = compute_node.get_buffer(&A.packed_cols[0], A.packed_cols.size()); - ScalarType* d_Acoefs = compute_node.get_buffer(&A.packed_coefs[0], A.packed_coefs.size()); - - //Copy data to buffers that need to be initialized from input data: - compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x); - compute_node.copy_to_buffer(&b.coefs[0], b.coefs.size(), d_b); - compute_node.copy_to_buffer(&A.row_offsets[0], A.row_offsets.size(), d_Arowoff); - compute_node.copy_to_buffer(&A.packed_cols[0], A.packed_cols.size(), d_Acols); - compute_node.copy_to_buffer(&A.packed_coefs[0], A.packed_coefs.size(), d_Acoefs); - - TICK(); - for(OrdinalType i=0; i - -#ifdef HAVE_MPI -#include -#endif - -#include -#include - -int main(int argc, char** argv) { - -#ifdef HAVE_MPI - MPI_Init(&argc, &argv); -#endif - - //utest_case.hpp declares the 'get_utest_cases' function. - - std::vector& utest_cases = get_utest_cases(); - bool tests_passed = true; - - for(size_t i=0; irun(); - if (passed) std::cout << " pass: " << utest_cases[i]->name() << std::endl; - else { - std::cout << "!!!FAIL: " << utest_cases[i]->name() << std::endl; - tests_passed = false; - } - } - - if (!tests_passed) { - std::cout << "at least 1 test failed."< - -class utest_case; - -std::vector& get_utest_cases() -{ - static std::vector utest_cases; - return utest_cases; -} - -//When a class that inherits the utest_case class is constructed, -//it gets added to the vector of utest_cases returned by -//the above 'get_utest_cases' function. -class utest_case { -public: - utest_case(){ get_utest_cases().push_back(this); } - ~utest_case(){} - virtual const char* name() = 0; - virtual bool run() = 0; -}; - -//The following macro declares and instantiates a class that -//inherits the above utest_case interfaces. -// -//use the macro like this: -// UTEST_CASE(mytest) -// { -// ... test code here ... -// } -// -//See example usages in utest_cases.hpp -// -#define UTEST_CASE(TESTNAME) \ - class TESTNAME##_utest : public utest_case { \ - public: \ - TESTNAME##_utest(){} \ - const char* name() {return #TESTNAME;} \ - bool run(); \ - }; \ - \ - TESTNAME##_utest instance_##TESTNAME##_utest; \ - \ - bool TESTNAME##_utest::run() - -#define TEST_EQUAL(A,B) \ - if ((A) != (B)) return false; - -#define TEST_EQUAL_TOL(A,B,tol) \ - if (std::abs((A) - (B)) > tol) return false; - -#endif - diff --git a/kokkos/basic/utest_cases.hpp b/kokkos/basic/utest_cases.hpp deleted file mode 100644 index d15ef9d..0000000 --- a/kokkos/basic/utest_cases.hpp +++ /dev/null @@ -1,1232 +0,0 @@ -#ifndef _utest_cases_hpp_ -#define _utest_cases_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. 
-// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef MINIFE_HAVE_TPI -#include -#include -#endif - -#ifdef MINIFE_HAVE_TBB -#include -#include -#endif - -#ifdef MINIFE_HAVE_CUDA -#include -#endif - -#include - -typedef MINIFE_SCALAR Scalar; -typedef MINIFE_LOCAL_ORDINAL LocalOrdinal; -typedef MINIFE_GLOBAL_ORDINAL GlobalOrdinal; - -template -inline -int check_get_id(int nx, int ny, int nz, int x, int y, int z, T expected, const char* testname) -{ - T val = miniFE::get_id(nx,ny,nz,x,y,z); - if (val != expected) { - std::cout << testname << " failed. val=" << val<<", expected " << expected << std::endl; - return -1; - } - return 0; -} - -UTEST_CASE(box_partition) -{ - int global_box[3][2] = { { 0, 2000 }, { 0, 2000}, { 0, 2000} }; - int numprocs = 4, myproc = 0; - - int (*local_boxes0)[3][2] = (int(*)[3][2])std::malloc(sizeof(int)*numprocs*3*2); - int (*local_boxes1)[3][2] = (int(*)[3][2])std::malloc(sizeof(int)*numprocs*3*2); - int (*local_boxes2)[3][2] = (int(*)[3][2])std::malloc(sizeof(int)*numprocs*3*2); - int (*local_boxes3)[3][2] = (int(*)[3][2])std::malloc(sizeof(int)*numprocs*3*2); - - box_partition(0, numprocs, 2, global_box, local_boxes0); - box_partition(0, numprocs, 2, global_box, local_boxes1); - box_partition(0, numprocs, 2, global_box, local_boxes2); - box_partition(0, numprocs, 2, global_box, local_boxes3); - - for(int i=1; i(local_boxes0[i]) != - miniFE::get_num_ids(local_boxes0[0])) { - return false; - } - if (miniFE::get_num_ids(local_boxes1[i]) != - miniFE::get_num_ids(local_boxes1[0])) { - return false; - } - if (miniFE::get_num_ids(local_boxes2[i]) != - miniFE::get_num_ids(local_boxes2[0])) { - return false; - } - if (miniFE::get_num_ids(local_boxes3[i]) != - miniFE::get_num_ids(local_boxes3[0])) { - return false; - } - - if (miniFE::get_num_ids(local_boxes0[i]) < 0 || - miniFE::get_num_ids(local_boxes0[i]) > 2000000000) { - return false; - } - } - - std::free(local_boxes0); - std::free(local_boxes1); - std::free(local_boxes2); - std::free(local_boxes3); - - return true; -} - -UTEST_CASE(generate_matrix_structure1) -{ - int global_box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } }; - int box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } }; - - miniFE::simple_mesh_description mesh(global_box, box); - - SerialComputeNode compute_node; - miniFE::CSRMatrix A(compute_node); - - miniFE::generate_matrix_structure(mesh, A); - - int nodes_x = global_box[0][1]+1; - int nodes_y = global_box[1][1]+1; - int nodes_z = global_box[2][1]+1; - int nrows = nodes_x*nodes_y*nodes_z; - - if 
(A.rows.size() != nrows) { - return false; - } - - if (A.row_offsets[nrows] != 64) { - return false; - } - - return true; -} - -UTEST_CASE(generate_matrix_structure2) -{ - int global_box[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } }; - int box[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } }; - - miniFE::simple_mesh_description mesh(global_box, box); - - SerialComputeNode compute_node; - miniFE::CSRMatrix A(compute_node); - - int nodes_x = global_box[0][1]+1; - int nodes_y = global_box[1][1]+1; - int nodes_z = global_box[2][1]+1; - int nrows = nodes_x*nodes_y*nodes_z; - - if (nrows != 27) { - return false; - } - - miniFE::generate_matrix_structure(mesh, A); - - if (A.row_offsets.size() != nrows+1) { - return false; - } - - if (A.row_offsets[nrows] != 343) { - return false; - } - - if (A.row_offsets[14]-A.row_offsets[13] != 27) { - return false; - } - - return true; -} - -UTEST_CASE(get_hex8_node_coords_3d) -{ - std::vector coords(24); - coords[0] = 0; - coords[1] = 0; - coords[2] = 0; - coords[3] = 1; - coords[4] = 0; - coords[5] = 0; - coords[6] = 1; - coords[7] = 0; - coords[8] = -1; - coords[9] = 0; - coords[10] = 0; - coords[11] = -1; - coords[12] = 0; - coords[13] = 1; - coords[14] = 0; - coords[15] = 1; - coords[16] = 1; - coords[17] = 0; - coords[18] = 1; - coords[19] = 1; - coords[20] = -1; - coords[21] = 0; - coords[22] = 1; - coords[23] = -1; - - std::vector testcoords(24); - - miniFE::get_hex8_node_coords_3d(0, 0, 0, 1.0, &testcoords[0]); - - if (coords != testcoords) { - return false; - } - - return true; -} - -inline -void get_test_elem_mat(std::vector& elem_mat) -{ -//after much careful debugging, I'm convinced that the following is a -//correct element-diffusion matrix for the element with local-node-0 at -//coordinates 0,0,0. So pasting this into a unit-test will guard against -//unintended changes as I continue working on the code for various reasons. 
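// The 36 values pasted below are the upper triangle (diagonal included) of the symmetric
// 8x8 Hex8 element-diffusion matrix: 8*9/2 = 36 entries, packed row by row. A small sketch
// of that packing convention follows; it matches the expansion loop used in the
// assemble_FE_data test further down. The helper name and 'n' are illustrative only.
#include <vector>

template<typename Scalar>
std::vector<Scalar> expand_symmetric_upper(const std::vector<Scalar>& packed, int n)
{
  std::vector<Scalar> full(n*n);
  int offset = 0;
  for(int i=0; i<n; ++i) {
    for(int j=i; j<n; ++j) {        // j >= i: upper triangle, row-major
      Scalar coef = packed[offset++];
      full[i*n+j] = coef;           // mirror into both halves
      full[j*n+i] = coef;
    }
  }
  return full;                      // packed.size() must equal n*(n+1)/2
}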
- - elem_mat.resize(36); -elem_mat[0] = 0.6666666664477059; -elem_mat[1] = 1.094804871759614e-10; -elem_mat[2] = -0.1666666666666667; -elem_mat[3] = 1.094805019211109e-10; -elem_mat[4] = 1.094804871759614e-10; -elem_mat[5] = -0.1666666666666667; -elem_mat[6] = -0.1666666667761472; -elem_mat[7] = -0.1666666666666667; -elem_mat[8] = 0.666666666447706; -elem_mat[9] = 1.094804941148553e-10; -elem_mat[10] = -0.1666666666666667; -elem_mat[11] = -0.1666666666666667; -elem_mat[12] = 1.094804732981736e-10; -elem_mat[13] = -0.1666666666666667; -elem_mat[14] = -0.1666666667761472; -elem_mat[15] = 0.666666666447706; -elem_mat[16] = 1.094804841401953e-10; -elem_mat[17] = -0.1666666667761472; -elem_mat[18] = -0.1666666666666667; -elem_mat[19] = 1.094804871759614e-10; -elem_mat[20] = -0.1666666666666667; -elem_mat[21] = 0.6666666664477059; -elem_mat[22] = -0.1666666666666668; -elem_mat[23] = -0.1666666667761472; -elem_mat[24] = -0.1666666666666667; -elem_mat[25] = 1.094804702624075e-10; -elem_mat[26] = 0.666666666447706; -elem_mat[27] = 1.094804802370675e-10; -elem_mat[28] = -0.1666666666666667; -elem_mat[29] = 1.094804698287266e-10; -elem_mat[30] = 0.666666666447706; -elem_mat[31] = 1.094805079926431e-10; -elem_mat[32] = -0.1666666666666667; -elem_mat[33] = 0.666666666447706; -elem_mat[34] = 1.094804663592797e-10; -elem_mat[35] = 0.666666666447706; -} - -UTEST_CASE(diffusionMatrix) -{ - std::vector elem_mat_correct(64); - get_test_elem_mat(elem_mat_correct); - - const size_t len = miniFE::Hex8::numNodesPerElem*miniFE::Hex8::numNodesPerElem; - Scalar elem_mat[len]; - Scalar testcoords[miniFE::Hex8::numNodesPerElem*miniFE::Hex8::spatialDim]; - - miniFE::get_hex8_node_coords_3d(0, 0, 0, 1.0, &testcoords[0]); - - miniFE::Hex8::diffusionMatrix_symm(testcoords, elem_mat); - - for(size_t i=0; i 1.e-6) { - return false; - } - } - - Scalar elem_vec_correct[miniFE::Hex8::numNodesPerElem]; - elem_vec_correct[0] = 0.125; - elem_vec_correct[1] = 0.125; - elem_vec_correct[2] = 0.125; - elem_vec_correct[3] = 0.125; - elem_vec_correct[4] = 0.125; - elem_vec_correct[5] = 0.125; - elem_vec_correct[6] = 0.125; - elem_vec_correct[7] = 0.125; - - Scalar elem_vec[miniFE::Hex8::numNodesPerElem]; - miniFE::Hex8::sourceVector(testcoords, elem_vec); - - const size_t nn = miniFE::Hex8::numNodesPerElem; - for(size_t i=0; i 1.e-13) { - return false; - } - } - - return true; -} - -UTEST_CASE(sum_into_row) -{ - SerialComputeNode compute_node; - miniFE::CSRMatrix A(compute_node); - A.rows.resize(1,0); - A.row_offsets.resize(2,0); - A.row_offsets[1] = 4; - A.packed_cols.resize(4); - A.packed_cols[0] = 0; - A.packed_cols[1] = 1; - A.packed_cols[2] = 2; - A.packed_cols[3] = 3; - A.packed_coefs.resize(4,0); - - std::vector indices(4); - indices[0] = 2; - indices[1] = 0; - indices[2] = 1; - indices[3] = 3; - std::vector coefs(4); - coefs[0] = 2.0; - coefs[1] = 0.0; - coefs[2] = 1.0; - coefs[3] = 3.0; - - miniFE::sum_into_row(0, 4, &indices[0], &coefs[0], A); - - coefs[0] = 0.0; - coefs[1] = 1.0; - coefs[2] = 2.0; - coefs[3] = 3.0; - - if (coefs != A.packed_coefs) { - return false; - } - - return true; -} - -UTEST_CASE(sum_in_elem_matrix) -{ - SerialComputeNode compute_node; - miniFE::CSRMatrix A(compute_node); - A.rows.resize(4,0); - A.rows[0] = 0; - A.rows[1] = 1; - A.rows[2] = 2; - A.rows[3] = 3; - A.row_offsets.resize(5,0); - A.row_offsets[1] = 4; - A.row_offsets[2] = 8; - A.row_offsets[3] = 12; - A.row_offsets[4] = 16; - A.packed_cols.resize(16); - A.packed_cols[0] = 0; - A.packed_cols[1] = 1; - A.packed_cols[2] = 2; - A.packed_cols[3] 
= 3; - A.packed_cols[4] = 0; - A.packed_cols[5] = 1; - A.packed_cols[6] = 2; - A.packed_cols[7] = 3; - A.packed_cols[8] = 0; - A.packed_cols[9] = 1; - A.packed_cols[10] = 2; - A.packed_cols[11] = 3; - A.packed_cols[12] = 0; - A.packed_cols[13] = 1; - A.packed_cols[14] = 2; - A.packed_cols[15] = 3; - - A.packed_coefs.resize(16,0); - - std::vector indices(4); - indices[0] = 2; - indices[1] = 0; - indices[2] = 1; - indices[3] = 3; - std::vector coefs(16); - coefs[0] = 2.0; - coefs[1] = 0.0; - coefs[2] = 1.0; - coefs[3] = 3.0; - coefs[4] = 2.0; - coefs[5] = 0.0; - coefs[6] = 1.0; - coefs[7] = 3.0; - coefs[8] = 2.0; - coefs[9] = 0.0; - coefs[10] = 1.0; - coefs[11] = 3.0; - coefs[12] = 2.0; - coefs[13] = 0.0; - coefs[14] = 1.0; - coefs[15] = 3.0; - - miniFE::sum_in_elem_matrix(4, &indices[0], &coefs[0], A); - - coefs[0] = 0.0; - coefs[1] = 1.0; - coefs[2] = 2.0; - coefs[3] = 3.0; - coefs[4] = 0.0; - coefs[5] = 1.0; - coefs[6] = 2.0; - coefs[7] = 3.0; - coefs[8] = 0.0; - coefs[9] = 1.0; - coefs[10] = 2.0; - coefs[11] = 3.0; - coefs[12] = 0.0; - coefs[13] = 1.0; - coefs[14] = 2.0; - coefs[15] = 3.0; - - if (coefs != A.packed_coefs) { - return false; - } - - return true; -} - -UTEST_CASE(assemble_FE_data) -{ - int global_box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } }; - int box[3][2] = {{ 0, 1 }, { 0, 1 }, { 0, 1 } }; - - miniFE::simple_mesh_description mesh(global_box, box); - - SerialComputeNode compute_node; - miniFE::CSRMatrix A(compute_node); - - miniFE::generate_matrix_structure(mesh, A); - - miniFE::Vector b(0, 8, compute_node); - - const int num_nodes = 8; - - std::vector symm_elem_mat_correct; - get_test_elem_mat(symm_elem_mat_correct); - std::vector full_elem_mat_correct(num_nodes*num_nodes); - - int offset = 0; - for(int i=0; i=i) { - Scalar coef = symm_elem_mat_correct[offset++]; - full_elem_mat_correct[i*num_nodes+j] = coef; - full_elem_mat_correct[j*num_nodes+i] = coef; - } - } - } - - std::vector elem_node_ids(num_nodes); - elem_node_ids[0] = 0; - elem_node_ids[1] = 1; - elem_node_ids[2] = 5; - elem_node_ids[3] = 4; - elem_node_ids[4] = 2; - elem_node_ids[5] = 3; - elem_node_ids[6] = 7; - elem_node_ids[7] = 6; - - //now for each row of of the 8x8 elem_mat_correct, reorder that - //row according to the order of elem_node_ids, rows and columns. 
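// Sketch of the reordering described above (an illustrative helper, not the test's actual
// loop): local node i of the element corresponds to assembled row/column elem_node_ids[i],
// so rows and columns are permuted together before comparing against the assembled matrix.
#include <vector>

template<typename Scalar>
std::vector<Scalar> reorder_by_node_ids(const std::vector<Scalar>& elem_mat_full,
                                        const std::vector<int>& node_ids, int n)
{
  std::vector<Scalar> reordered(n*n);
  for(int i=0; i<n; ++i)
    for(int j=0; j<n; ++j)
      reordered[node_ids[i]*n + node_ids[j]] = elem_mat_full[i*n + j];
  return reordered;
}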
- std::vector elem_mat_reordered(num_nodes*num_nodes); - offset = 0; - int row = 0; - for(int i=0; i& assembled_mat = A.packed_coefs; - - for(size_t i=0; i 1.e-13) { - return false; - } - } - - return true; -} - -UTEST_CASE(pll_matvec2) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 2) { - if (myproc == 0) std::cout <<"pll_matvec2_utest only runs when numprocs=2."< A(compute_node); - miniFE::Vector x(myproc, 4,compute_node) ,y(myproc, 4,compute_node); - - A.rows.resize(2, 0); - if (myproc == 0) { - A.rows[0] = 0; A.rows[1] = 1; - } - else { - A.rows[0] = 2; A.rows[1] = 3; - } - - A.row_offsets.resize(3, 0); - if (myproc == 0) { - A.row_offsets[1] = 2; A.row_offsets[2] = 6; - } - else { - A.row_offsets[1] = 2; A.row_offsets[2] = 4; - } - - if (myproc == 0) { - A.packed_cols.resize(6, 0); - A.packed_cols[1] = 1; - A.packed_cols[2] = 0; - A.packed_cols[3] = 1; - A.packed_cols[4] = 2; - A.packed_cols[5] = 3; - } - else { - A.packed_cols.resize(4, 0); - A.packed_cols[0] = 1; - A.packed_cols[1] = 2; - A.packed_cols[2] = 1; - A.packed_cols[3] = 3; - } - if (myproc == 0) { - A.packed_coefs.resize(6, 1); - A.packed_coefs[2] = 2; - A.packed_coefs[4] = -1; - } - else { - A.packed_coefs.resize(4, 1); - A.packed_coefs[0] = -2; - A.packed_coefs[2] = 2; - } - - if (myproc == 0) { - x.coefs[0] = 1; x.coefs[1] = 2; - } - else { - x.coefs[0] = 3; x.coefs[1] = 4; - } - - miniFE::make_local_matrix(A); - miniFE::exchange_externals(A, x); - miniFE::matvec(A, x, y); - - if (myproc == 0) { - if (y.coefs[0] != 3.0 || y.coefs[1] != 5.0) { - std::cout << "proc 0: pll_matvec2_utest failed" << std::endl; - return false; - } - } - else { - if (y.coefs[0] != -1.0 || y.coefs[1] != 8.0) { - std::cout << "proc 1: pll_matvec2_utest failed" << std::endl; - return false; - } - } - - return true; -} - -UTEST_CASE(pll_matvec3) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 3) { - if (myproc == 0) std::cout <<"pll_matvec3_utest only runs when numprocs=3."< A(compute_node); - miniFE::Vector x(myproc, 6, compute_node) ,y(myproc, 6, compute_node); - - A.rows.resize(2, 0); - A.rows[0] = myproc*2; A.rows[1] = myproc*2+1; - - A.row_offsets.resize(3, 0); - if (myproc == 0) { - A.row_offsets[1] = 2; A.row_offsets[2] = 4; - } - else if (myproc == 1) { - A.row_offsets[1] = 3; A.row_offsets[2] = 4; - } - else { - A.row_offsets[1] = 2; A.row_offsets[2] = 4; - } - - A.packed_cols.resize(4, 0); - if (myproc == 0) { - A.packed_cols[1] = 3; - A.packed_cols[2] = 1; - A.packed_cols[3] = 5; - } - else if (myproc == 1) { - A.packed_cols[1] = 2; - A.packed_cols[2] = 4; - A.packed_cols[3] = 3; - } - else { - A.packed_cols[0] = 1; - A.packed_cols[1] = 4; - A.packed_cols[2] = 3; - A.packed_cols[3] = 5; - } - - A.packed_coefs.resize(4, 1); - if (myproc == 0) { - A.packed_coefs[1] = -1; - A.packed_coefs[3] = -1; - } - else if (myproc == 1) { - A.packed_coefs[0] = 2; - A.packed_coefs[2] = -1; - } - else { - A.packed_coefs[0] = 2; - A.packed_coefs[2] = 2; - } - - if (myproc == 0) { - x.coefs[0] = 1; x.coefs[1] = 2; - } - else if (myproc == 1) { - x.coefs[0] = 3; x.coefs[1] = 4; - } - else { - x.coefs[0] = 5; x.coefs[1] = 6; - } - - miniFE::make_local_matrix(A); - miniFE::exchange_externals(A, x); - miniFE::matvec(A, x, y); - - if (myproc == 0) { - if (y.coefs[0] != -3.0 || y.coefs[1] != -4.0) { - std::cout << "proc 0: 
pll_matvec3 failed" << std::endl; - return false; - } - } - else if (myproc == 1) { - if (y.coefs[0] != 0.0 || y.coefs[1] != 4.0) { - std::cout << "proc 1: pll_matvec3 failed" << std::endl; - return false; - } - } - else { - if (y.coefs[0] != 9.0 || y.coefs[1] != 14.0) { - std::cout << "proc 2: pll_matvec3 failed" << std::endl; - return false; - } - } - - return true; -} - -UTEST_CASE(ComputeNode_waxpy1) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 1) { - if (myproc == 0) std::cout <<"ComputeNode_waxpy1 only runs when numprocs=1."< x(0, len, compute_node), y(0, len, compute_node), w(0, len, compute_node); - - std::vector inds(len, 0); - for(size_t i=0; i coefs(len, 1); - - miniFE::sum_into_vector(len, &inds[0], &coefs[0], x); - miniFE::sum_into_vector(len, &inds[0], &coefs[0], y); - miniFE::sum_into_vector(len, &inds[0], &coefs[0], w); - - Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - - compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x); - compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y); - - miniFE::waxpby(1.0, x, 1.0, y, w); - - Scalar* d_w = compute_node.get_buffer(&w.coefs[0], w.coefs.size()); - compute_node.copy_from_buffer(&w.coefs[0], w.coefs.size(), d_w); - - Scalar expected = 2; - Scalar tol = 1.e-7; - - for(size_t i=0; i tol) { - return false; - } - } - return true; -} - -UTEST_CASE(ComputeNode_dot1) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 1) { - if (myproc == 0) std::cout <<"ComputeNode_dot1 only runs when numprocs=1."< x(0, N, compute_node), y(0, N, compute_node); - - std::vector inds(N, 0); - for(size_t i=0; i coefs(N, 1); - - miniFE::sum_into_vector(N, &inds[0], &coefs[0], x); - miniFE::sum_into_vector(N, &inds[0], &coefs[0], y); - - Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - - compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x); - compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y); - - Scalar dot_prod = miniFE::dot(x,y); - - if (dot_prod != N) { - return false; - } - - return true; -} - -UTEST_CASE(ComputeNode_TBB_dot1) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 1) { - if (myproc == 0) std::cout <<"ComputeNode_TBB_dot1_utest only runs when numprocs=1."< x(0, N, compute_node), y(0, N, compute_node); - - std::vector inds(N, 0); - for(size_t i=0; i coefs(N, 1); - - miniFE::sum_into_vector(inds.size(), &inds[0], &coefs[0], x); - miniFE::sum_into_vector(inds.size(), &inds[0], &coefs[0], y); - - Scalar dot_prod = miniFE::dot(x,y); - - if (dot_prod != N) { - return false; - } - -#else - std::cout << "ComputeNode_TBB_dot1_utest only runs when MINIFE_HAVE_TBB is defined."< x(0, 10, compute_node), y(0, 10, compute_node); - - size_t len = 10; - std::vector inds(len, 0); - for(size_t i=0; i coefs(len, 1); - - miniFE::sum_into_vector(len, &inds[0], &coefs[0], x); - miniFE::sum_into_vector(len, &inds[0], &coefs[0], y); - - Scalar* d_x = compute_node.get_buffer(&x.coefs[0], x.coefs.size()); - Scalar* d_y = compute_node.get_buffer(&y.coefs[0], y.coefs.size()); - - 
compute_node.copy_to_buffer(&x.coefs[0], x.coefs.size(), d_x); - compute_node.copy_to_buffer(&y.coefs[0], y.coefs.size(), d_y); - - Scalar dot_prod = miniFE::dot(x, y); - - if (std::abs(dot_prod-10.0) > 1.e-12) { - return false; - } - return true; -} - -UTEST_CASE(ser_matvec1) -{ - int numprocs = 1, myproc = 0; -#ifdef HAVE_MPI - MPI_Comm_size(MPI_COMM_WORLD, &numprocs); - MPI_Comm_rank(MPI_COMM_WORLD, &myproc); -#endif - - if (numprocs != 1) { - if (myproc == 0) std::cout <<"ser_matvec1_utest only runs when numprocs=1."< A(compute_node); - miniFE::Vector x(0, 4,compute_node) ,y(0, 4,compute_node); - - A.rows.resize(4, 0); - A.rows[0] = 0; A.rows[1] = 1; - A.rows[2] = 2; A.rows[3] = 3; - - A.row_offsets.resize(5, 0); - A.row_offsets[1] = 2; A.row_offsets[2] = 6; - A.row_offsets[3] = 8; A.row_offsets[4] = 10; - - A.packed_cols.resize(10, 0); - A.packed_cols[1] = 1; - A.packed_cols[2] = 0; - A.packed_cols[3] = 1; - A.packed_cols[4] = 2; - A.packed_cols[5] = 3; - A.packed_cols[6] = 1; - A.packed_cols[7] = 2; - A.packed_cols[8] = 1; - A.packed_cols[9] = 3; - - A.packed_coefs.resize(10, 1); - A.packed_coefs[2] = 2; - A.packed_coefs[4] = -1; - A.packed_coefs[6] = -2; - A.packed_coefs[8] = 2; - - x.coefs[0] = 1; x.coefs[1] = 2; x.coefs[2] = 3; x.coefs[3] = 4; - - for(size_t i=0; i 1.e-12) { - std::cout << "failed 0. y.coefs[0]=" < 1.e-12) { - std::cout << "failed 1. y.coefs[1]=" < 1.e-12) { - std::cout << "failed 2. y.coefs[2]=" < 1.e-12) { - std::cout << "failed 3. y.coefs[3]=" < x(0, len,compute_node) ,y(0, len,compute_node), w(0, len,compute_node); - - Scalar one = 1, zero = 0; - - for(size_t i=0; i1.e-2 ? 1.e-6 * (waxpy_flops/tWAXPY) : 0; - - std::cout << "waxpby_perf_utest: WAXPBY time: " << tWAXPY << ", len: " << len << ", num_iters: " << num_iters - << ", MFLOPS: " << waxpy_mflops << std::endl; - return true; -} - -UTEST_CASE(matmat3x3_1) -{ - Scalar A[] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3}; - Scalar C[9]; - - miniFE::matmat3x3(A, B, C); - - TEST_EQUAL(C[0], 6.0); - TEST_EQUAL(C[1], 15.0); - TEST_EQUAL(C[2], 24.0); - TEST_EQUAL(C[3], 12.0); - TEST_EQUAL(C[4], 30.0); - TEST_EQUAL(C[5], 48.0); - TEST_EQUAL(C[6], 18.0); - TEST_EQUAL(C[7], 45.0); - TEST_EQUAL(C[8], 72.0); - - return true; -} - -UTEST_CASE(matmat3x3_X_3xn_1) -{ - Scalar A[] = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; - Scalar C[18]; - - miniFE::matmat3x3_X_3xn(A, 6, B, C); - - TEST_EQUAL(C[0], 6.0); - TEST_EQUAL(C[1], 15.0); - TEST_EQUAL(C[2], 24.0); - TEST_EQUAL(C[3], 12.0); - TEST_EQUAL(C[4], 30.0); - TEST_EQUAL(C[5], 48.0); - TEST_EQUAL(C[6], 18.0); - TEST_EQUAL(C[7], 45.0); - TEST_EQUAL(C[8], 72.0); - TEST_EQUAL(C[9], 24.0); - TEST_EQUAL(C[10], 60.0); - TEST_EQUAL(C[11], 96.0); - TEST_EQUAL(C[12], 30.0); - TEST_EQUAL(C[13], 75.0); - TEST_EQUAL(C[14], 120.0); - TEST_EQUAL(C[15], 36.0); - TEST_EQUAL(C[16], 90.0); - TEST_EQUAL(C[17], 144.0); - - return true; -} - -UTEST_CASE(matTransMat3x3_X_3xn_1) -{ - Scalar A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Scalar B[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6}; - Scalar C[18]; - - miniFE::matTransMat3x3_X_3xn(A, 6, B, C); - - TEST_EQUAL(C[0], 6.0); - TEST_EQUAL(C[1], 15.0); - TEST_EQUAL(C[2], 24.0); - TEST_EQUAL(C[3], 12.0); - TEST_EQUAL(C[4], 30.0); - TEST_EQUAL(C[5], 48.0); - TEST_EQUAL(C[6], 18.0); - TEST_EQUAL(C[7], 45.0); - TEST_EQUAL(C[8], 72.0); - TEST_EQUAL(C[9], 24.0); - TEST_EQUAL(C[10], 60.0); - TEST_EQUAL(C[11], 96.0); - TEST_EQUAL(C[12], 30.0); - TEST_EQUAL(C[13], 
75.0); - TEST_EQUAL(C[14], 120.0); - TEST_EQUAL(C[15], 36.0); - TEST_EQUAL(C[16], 90.0); - TEST_EQUAL(C[17], 144.0); - - return true; -} - -UTEST_CASE(BoxIterator1) -{ - int box1[3][2] = {{ 0, 2 }, { 0, 2 }, { 0, 2 } }; - miniFE::BoxIterator iter = miniFE::BoxIterator::begin(box1); - miniFE::BoxIterator end = miniFE::BoxIterator::end(box1); - - for(int iz=box1[2][0]; iz(nx,ny,nz,iter.x,iter.y,-iter.z); - int x, y, z; - miniFE::get_coords(elemID, nx,ny,nz, x,y,z); - TEST_EQUAL(x,iter.x); - TEST_EQUAL(y,iter.y); - TEST_EQUAL(z,-iter.z); - } - - return true; -} - -#endif - diff --git a/kokkos/basic/verify_solution.hpp b/kokkos/basic/verify_solution.hpp deleted file mode 100644 index fb3bd3b..0000000 --- a/kokkos/basic/verify_solution.hpp +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef _verify_solution_hpp_ -#define _verify_solution_hpp_ - -//@HEADER -// ************************************************************************ -// -// miniFE: simple finite-element assembly and linear-solve -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -#include -#include -#include -#include - -#include -#include -#include -#include - -#ifdef HAVE_MPI -#include -#endif - -namespace miniFE { - -template -struct err_info { - Scalar err; - Scalar computed; - Scalar analytic; - Scalar coords[3]; -}; - -template -void -verify_solution(const simple_mesh_description& mesh, - const VectorType& x) -{ - typedef typename VectorType::GlobalOrdinalType GlobalOrdinal; - typedef typename VectorType::ScalarType Scalar; - - int global_nodes_x = mesh.global_box[0][1]+1; - int global_nodes_y = mesh.global_box[1][1]+1; - int global_nodes_z = mesh.global_box[2][1]+1; - Box box; - copy_box(mesh.local_box, box); - - //num-owned-nodes in each dimension is num-elems+1 - //only if num-elems > 0 in that dimension *and* - //we are at the high end of the global range in that dimension: - if (box[0][1] > box[0][0] && box[0][1] == mesh.global_box[0][1]) ++box[0][1]; - if (box[1][1] > box[1][0] && box[1][1] == mesh.global_box[1][1]) ++box[1][1]; - if (box[2][1] > box[2][0] && box[2][1] == mesh.global_box[2][1]) ++box[2][1]; - - GlobalOrdinal nrows = get_num_ids(box); - - std::vector rows(nrows); - std::vector row_coords(nrows*3); - - unsigned roffset = 0; - - for(int iz=box[2][0]; iz(global_nodes_x, global_nodes_y, global_nodes_z, - ix, iy, iz); - Scalar x, y, z; - get_coords(row_id, global_nodes_x, global_nodes_y, global_nodes_z, x, y, z); - - rows[roffset] = mesh.map_id_to_row(row_id); - row_coords[roffset*3] = x; - row_coords[roffset*3+1] = y; - row_coords[roffset*3+2] = z; - ++roffset; - } - } - } - - if (x.local_size != rows.size() || x.local_size != nrows) { - throw std::runtime_error("verify_solution ERROR, size mismatch"); - } - - const int num_terms = 300; - - err_info max_error; - max_error.err = 0.0; - - for(size_t i=0; i max_error.err) { - max_error.err = err; - max_error.computed = computed_soln; - max_error.analytic = analytic_soln; - max_error.coords[0] = x; - max_error.coords[1] = y; - max_error.coords[2] = z; - } - } - - Scalar local_max_err = max_error.err; - Scalar global_max_err = 0; -#ifdef HAVE_MPI - MPI_Allreduce(&local_max_err, &global_max_err, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); -#else - global_max_err = local_max_err; -#endif - - if (local_max_err == global_max_err) { - if (max_error.err > 1.e-6) { - std::cout << "max absolute error is "< -#include - -#include -#include -#include "vectorTests.hpp" -#include "YAML_Element.hpp" -#include "YAML_Doc.hpp" -#ifdef HAVE_MPI -#include // If this routine is compiled with -DHAVE_MPI - // then include mpi.h -#endif -void addResults(YAML_Element * currentElement, const std::vector & times, double fnops); - - -#undef DEBUG -int main(int argc, char *argv[]) { -#ifdef HAVE_MPI - // Initialize MPI - MPI_Init(&argc, &argv); - int size, rank; // Number of MPI processes, My process ID - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - // I'm alive !!! 
- if (size < 100) std::cout << "Process "<> junk; -#endif - - if(argc != 2) { - std::cerr << "Usage:" << std::endl - << argv[0] << " n" << std::endl - << " where n is the problem size" << std::endl; - std::exit(1); - } - - size_t n = atoi(argv[1]); - int numTrials = 1000000/n; if (numTrials<1) numTrials = 1; - double fnops = 2.0 * ((double) size) *((double) n)*((double) numTrials); - std::vector times(4,0.0); - doc.add("Problem_size",n); - doc.add("Number_of_timing_trials",numTrials); - - std::vector dx(n, 1.0), dy(n, 1.0); - - int ierr = vectorTests(numTrials, dx, dy, times); - - if (ierr) std::cerr << "Error in call to vectorTests: " << ierr << ".\n" << std::endl; - - if (rank==0) { // Only PE 0 needs to compute and report timing results - - doc.add("Total_time_for_vector_tests_in_double",times[0]); - - - doc.add("Double_precision_results",""); - doc.get("Double_precision_results")->add("performance_summary",""); - YAML_Element * currentElement = doc.get("Double_precision_results"); - addResults(currentElement, times, fnops); - } - -#ifdef HAVE_MPI - MPI_Barrier(MPI_COMM_WORLD); -#endif - - std::vector fx(n, 1.0f), fy(n, 1.0f); - ierr = vectorTests(numTrials, fx, fy, times); - if (ierr) std::cerr << "Error in call to vectorTests: " << ierr << ".\n" << std::endl; - - if (rank==0) { // Only PE 0 needs to compute and report timing results - - doc.add("Total_time_for_vector_tests_in_float",times[0]); - - - doc.add("Float_precision_results",""); - doc.get("Float_precision_results")->add("performance_summary",""); - YAML_Element * currentElement = doc.get("Float_precision_results"); - addResults(currentElement, times, fnops); - } - - if (rank==0) { // Only PE 0 needs to compute and report timing results - - std::string yaml = doc.generateYAML(); - std::cout << yaml; - } - // Finish up -#ifdef HAVE_MPI - MPI_Finalize(); -#endif - return 0; -} - -void addResults(YAML_Element * currentElement, const std::vector & times, double fnops) { - - currentElement->get("performance_summary")->add("total",""); - currentElement->get("performance_summary")->get("total")->add("time",times[0]); - currentElement->get("performance_summary")->get("total")->add("flops",3.0*fnops); - currentElement->get("performance_summary")->get("total")->add("mflops",3.0*fnops/times[0]/1.0E6); - - currentElement->get("performance_summary")->add("std_vector_bracket_notation",""); - currentElement->get("performance_summary")->get("std_vector_bracket_notation")->add("time",times[1]); - currentElement->get("performance_summary")->get("std_vector_bracket_notation")->add("flops",fnops); - currentElement->get("performance_summary")->get("std_vector_bracket_notation")->add("mflops",fnops/times[1]/1.0E6); - - currentElement->get("performance_summary")->add("raw_pointer_bracket_notation",""); - currentElement->get("performance_summary")->get("raw_pointer_bracket_notation")->add("time",times[2]); - currentElement->get("performance_summary")->get("raw_pointer_bracket_notation")->add("flops",fnops); - currentElement->get("performance_summary")->get("raw_pointer_bracket_notation")->add("mflops",fnops/times[2]/1.0E6); - - currentElement->get("performance_summary")->add("raw_pointer_deref_plusplus_notation",""); - currentElement->get("performance_summary")->get("raw_pointer_deref_plusplus_notation")->add("time",times[3]); - currentElement->get("performance_summary")->get("raw_pointer_deref_plusplus_notation")->add("flops",fnops); - 
currentElement->get("performance_summary")->get("raw_pointer_deref_plusplus_notation")->add("mflops",fnops/times[3]/1.0E6); - - return; -} diff --git a/kokkos/common/main.cpp b/kokkos/common/main.cpp deleted file mode 100644 index 9724f69..0000000 --- a/kokkos/common/main.cpp +++ /dev/null @@ -1,292 +0,0 @@ - -//@HEADER -// ************************************************************************ -// -// HPCCG: Simple Conjugate Gradient Benchmark Code -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -// Changelog -// -// Version 0.3 -// - Added timing of setup time for sparse MV -// - Corrected percentages reported for sparse MV with overhead -// -///////////////////////////////////////////////////////////////////////// - -// Main routine of a program that reads a sparse matrix, right side -// vector, solution vector and initial guess from a file in HPC -// format. This program then calls the HPCCG conjugate gradient -// solver to solve the problem, and then prints results. - -// Calling sequence: - -// test_HPCCG linear_system_file - -// Routines called: - -// read_HPC_row - Reads in linear system - -// mytimer - Timing routine (compile with -DWALL to get wall clock -// times - -// HPCCG - CG Solver - -// compute_residual - Compares HPCCG solution to known solution. - -#include -using std::cout; -using std::cerr; -using std::endl; -#include -#include -#include -#include -#include -#include -#ifdef USING_MPI -#include // If this routine is compiled with -DUSING_MPI - // then include mpi.h -#include "make_local_matrix.hpp" // Also include this function -#endif -#include "generate_matrix.hpp" -#include "read_HPC_row.hpp" -#include "mytimer.hpp" -#include "HPC_sparsemv.hpp" -#include "compute_residual.hpp" -#include "HPCCG.hpp" -#include "HPC_Sparse_Matrix.hpp" -#include "YAML_generator.hpp" -//#include "YAML_generator.cpp" - -#undef DEBUG -int main(int argc, char *argv[]) -{ - HPC_Sparse_Matrix *A; - double *x, *b, *xexact; - double norm, d; - int ierr = 0; - int i, j; - int ione = 1; - double times[7]; - double t6 = 0.0; - YAML_Doc doc("HPCCG","1.0"); - -#ifdef USING_MPI - - // Initialize MPI - MPI_Init(&argc, &argv); - int size, rank; // Number of MPI processes, My process ID - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - // I'm alive !!! 
- - if (size < 100) cout << "Process "<> junk; - } - - MPI_Barrier(MPI_COMM_WORLD); -#endif - - - if(argc != 2 && argc!=4) { - if (rank==0) - cerr << "Usage:" << endl - << "Mode 1: " << argv[0] << " nx ny nz" << endl - << " where nx, ny and nz are the local sub-block dimensions, or" << endl - << "Mode 2: " << argv[0] << " HPC_data_file " << endl - << " where HPC_data_file is a globally accessible file containing matrix data." << endl; - exit(1); - } - - if (argc==4) { - int nx = atoi(argv[1]); - int ny = atoi(argv[2]); - int nz = atoi(argv[3]); - doc.add("nx",argv[1]); - doc.add("ny",argv[2]); - doc.add("nz",argv[3]); - generate_matrix(nx, ny, nz, &A, &x, &b, &xexact); - } - else - read_HPC_row(argv[1], &A, &x, &b, &xexact); - -#ifdef USING_MPI - - // Transform matrix indices from global to local values. - // Define number of columns for the local matrix. - - t6 = mytimer(); make_local_matrix(A); t6 = mytimer() - t6; - times[6] = t6; - -#endif - - double t1 = mytimer(); // Initialize it (if needed) - int niters = 0; - double normr = 0.0; - int max_iter = 300; - double tolerance = 0.0; // Set tolerance to zero to make all runs do max_iter iterations - ierr = HPCCG( A, b, x, max_iter, tolerance, niters, normr, times); - - if (ierr) cerr << "Error in call to CG: " << ierr << ".\n" << endl; - -#ifdef USING_MPI - double t4 = times[4]; - double t4min = 0.0; - double t4max = 0.0; - double t4avg = 0.0; - MPI_Allreduce(&t4, &t4min, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); - MPI_Allreduce(&t4, &t4max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(&t4, &t4avg, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - t4avg = t4avg/((double) size); -#endif - - if (rank==0) // Only PE 0 needs to compute and report timing results - { - doc.add("time_spent_in_CG",times[0]); - - cout << "Time spent in CG = " << times[0] << ".\n" << endl; - double fniters = niters; - double fnrow = A->total_nrow; double fnnz = A->total_nnz; - double fnops_ddot = fniters*4*fnrow; - double fnops_waxpby = fniters*6*fnrow; - double fnops_sparsemv = fniters*2*fnnz; - double fnops = fnops_ddot+fnops_waxpby+fnops_sparsemv; - - doc.add("number_of_iterations",fniters); - doc.add("final_residual",normr); - doc.add("performance_summary",""); - doc.get("performance_summary")->add("total",""); - doc.get("performance_summary")->get("total")->add("time",times[0]); - doc.get("performance_summary")->get("total")->add("flops",fnops); - doc.get("performance_summary")->get("total")->add("mflops",fnops/times[0]/1.0E6); - - doc.get("performance_summary")->add("ddot",""); - doc.get("performance_summary")->get("ddot")->add("time",times[1]); - doc.get("performance_summary")->get("ddot")->add("flops",fnops_ddot); - doc.get("performance_summary")->get("ddot")->add("mflops",fnops_ddot/times[1]/1.0E6); - - doc.get("performance_summary")->add("waxpby",""); - doc.get("performance_summary")->get("waxpby")->add("time",times[2]); - doc.get("performance_summary")->get("waxpby")->add("flops",fnops_waxpby); - doc.get("performance_summary")->get("waxpby")->add("mflops",fnops_waxpby/times[2]/1.0E6); - - doc.get("performance_summary")->add("sparsemv",""); - doc.get("performance_summary")->get("sparsemv")->add("time",times[3]); - doc.get("performance_summary")->get("sparsemv")->add("flops",fnops_sparsemv); - doc.get("performance_summary")->get("sparsemv")->add("mflops",fnops_sparsemv/times[3]/1.0E6); - - cout << "Number of iterations = " << niters << ".\n" << endl; - cout << "Final residual = " << normr << ".\n" << endl; - cout << "********** Performance Summary 
(times in sec) ***********" << endl << endl; - cout << "Total Time/FLOPS/MFLOPS = " - << times[0] << "/" << fnops << "/" - << fnops/times[0]/1.0E6 << "." << endl; - cout << "DDOT Time/FLOPS/MFLOPS = " - << times[1] << "/" << fnops_ddot << "/" - << fnops_ddot/times[1]/1.0E6 << "." << endl; -#ifdef USING_MPI - doc.get("performance_summary")->get("ddot")->add("min_MPI_Allreduce_time",t4min); - doc.get("performance_summary")->get("ddot")->add("max_MPI_Allreduce_time",t4max); - doc.get("performance_summary")->get("ddot")->add("avg_MPI_Allreduce_time",t4avg); - - cout << " Minimum DDOT MPI_Allreduce time (over all processors) = " << t4min << endl; - cout << " Maximum DDOT MPI_Allreduce time (over all processors) = " << t4max << endl; - cout << " Average DDOT MPI_Allreduce time (over all processors) = " << t4avg << endl; -#endif - cout << "WAXPBY Time/FLOPS/MFLOPS = " - << times[2] << "/" << fnops_waxpby << "/" - << fnops_waxpby/times[2]/1.0E6 << "." << endl; - cout << "SPARSEMV Time/FLOPS/MFLOPS = " - << times[3] << "/" << fnops_sparsemv << "/" - << fnops_sparsemv/(times[3])/1.0E6 << "." << endl; -#ifdef USING_MPI - double totalSparseMVTime = times[3] + times[5]+ times[6]; - - double mflops_w_overhead = fnops_sparsemv/(totalSparseMVTime)/1.0E6; - double po_time = (times[5]+times[6]); - double po_perc = po_time/totalSparseMVTime*100.0; - double po_set_time = times[6]; - double po_set_perc = po_set_time/totalSparseMVTime*100.0; - double po_Bdry_exch_time = times[5]; - double po_Bdry_exch_perc = po_Bdry_exch_time/totalSparseMVTime*100.0; - - YAML_Element* currnet_elem = doc.get("performance_summary")->get("sparsemv"); - currnet_elem->add("mflops_w_overhead",mflops_w_overhead); - currnet_elem->add("parallel_overhead",""); - currnet_elem->get("parallel_overhead")->add("time",po_time); - currnet_elem->get("parallel_overhead")->add("percentage",po_perc); - currnet_elem->get("parallel_overhead")->add("setup",""); - currnet_elem->get("parallel_overhead")->get("setup")->add("time",po_set_time); - currnet_elem->get("parallel_overhead")->get("setup")->add("percentage",po_set_perc); - currnet_elem->get("parallel_overhead")->add("Bdry_exchange",""); - currnet_elem->get("parallel_overhead")->get("Bdry_exchange")->add("time",po_Bdry_exch_time); - currnet_elem->get("parallel_overhead")->get("Bdry_exchange")->add("percentage",po_Bdry_exch_perc); - - cout << "SPARSEMV MFLOPS W OVRHEAD = " - << mflops_w_overhead << "." << endl; - cout << "SPARSEMV PARALLEL OVERHEAD Time = " - << po_time << " ( " << po_perc << " % )." << endl; - cout << " SPARSEMV PARALLEL OVERHEAD (Setup) Time = " - << po_set_time << " ( " << po_set_perc << " % )." << endl; - cout << " SPARSEMV PARALLEL OVERHEAD (Bdry Exchange) Time = " - << po_Bdry_exch_time << " ( " << po_Bdry_exch_perc << " % )." << endl; -#endif - } - - // Compute difference between known exact solution and computed solution - // All processors are needed here. 
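// The residual reported here comes from compute_residual(n, x, xexact, &residual), whose
// implementation is not part of this diff. A plausible serial sketch, consistent with the
// call signature used below (the comment above notes that under MPI all processors take
// part, presumably via a max-reduction of the local value):
#include <cmath>

static int compute_residual_sketch(int n, const double* const x,
                                   const double* const xexact, double* residual)
{
  double local_max = 0.0;
  for (int i = 0; i < n; ++i) {     // max componentwise |x - xexact|
    double diff = std::fabs(x[i] - xexact[i]);
    if (diff > local_max) local_max = diff;
  }
  *residual = local_max;            // an MPI build would MPI_Allreduce(MPI_MAX) this value
  return 0;                         // 0 == success, matching the caller's error check
}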
- - double residual = 0; - if ((ierr = compute_residual(A->local_nrow, x, xexact, &residual))) - cerr << "Error in call to compute_residual: " << ierr << ".\n" << endl; - - if (rank==0){ - cout << "Difference between computed and exact = " - << residual << ".\n" << endl; - doc.add("diff_between_computed_and_exact",residual); - string yaml = doc.generateYAML(); - cout << yaml; - } - // Finish up -#ifdef USING_MPI - MPI_Finalize(); -#endif - return 0 ; -} diff --git a/kokkos/common/mytimer.cpp b/kokkos/common/mytimer.cpp deleted file mode 100644 index 71e19e6..0000000 --- a/kokkos/common/mytimer.cpp +++ /dev/null @@ -1,109 +0,0 @@ - -//@HEADER -// ************************************************************************ -// -// HPCCG: Simple Conjugate Gradient Benchmark Code -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER - -///////////////////////////////////////////////////////////////////////// - -// Function to return time in seconds. -// If compiled with no flags, return CPU time (user and system). -// If compiled with -DWALL, returns elapsed time. 
- -///////////////////////////////////////////////////////////////////////// -#ifdef HAVE_MPI -#include // If this routine is compiled with -DHAVE_MPI - // then include mpi.h -double mytimer(void) -{ - return(MPI_Wtime()); -} - - -#elif defined(UseClock) - -#include -double mytimer(void) -{ - clock_t t1; - static clock_t t0=0; - static double CPS = CLOCKS_PER_SEC; - double d; - - if (t0 == 0) t0 = clock(); - t1 = clock() - t0; - d = t1 / CPS; - return(d); -} - -#elif defined(WALL) - -#include -#include -#include -double mytimer(void) -{ - struct timeval tp; - static long start=0, startu; - if (!start) - { - gettimeofday(&tp, NULL); - start = tp.tv_sec; - startu = tp.tv_usec; - return(0.0); - } - gettimeofday(&tp, NULL); - return( ((double) (tp.tv_sec - start)) + (tp.tv_usec-startu)/1000000.0 ); -} - -#elif defined(UseTimes) - -#include -#include -#include -double mytimer(void) -{ - struct tms ts; - static double ClockTick=0.0; - - if (ClockTick == 0.0) ClockTick = (double) sysconf(_SC_CLK_TCK); - times(&ts); - return( (double) ts.tms_utime / ClockTick ); -} - -#else - -#include -#include -#include -double mytimer(void) -{ - struct rusage ruse; - getrusage(RUSAGE_SELF, &ruse); - return( (double)(ruse.ru_utime.tv_sec+ruse.ru_utime.tv_usec / 1000000.0) ); -} - -#endif diff --git a/kokkos/common/mytimer.hpp b/kokkos/common/mytimer.hpp deleted file mode 100644 index 94226a3..0000000 --- a/kokkos/common/mytimer.hpp +++ /dev/null @@ -1,32 +0,0 @@ - -//@HEADER -// ************************************************************************ -// -// HPCCG: Simple Conjugate Gradient Benchmark Code -// Copyright (2006) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER -#ifndef MYTIMER_H -#define MYTIMER_H -double mytimer(void); -#endif // MYTIMER_H diff --git a/kokkos/common/param_utils.cpp b/kokkos/common/param_utils.cpp deleted file mode 100644 index 0d9cbf3..0000000 --- a/kokkos/common/param_utils.cpp +++ /dev/null @@ -1,32 +0,0 @@ - -#include - -#include -#include - -namespace Mantevo { - -//------------------------------------------------------------- -void read_args_into_string(int argc, char** argv, std::string& arg_string) -{ - arg_string = argv[0]; - for(int i=1; i -#include - -//Parameter-parsing Utilities: -// -//The functions declared below are intended to assist with parsing -//input-parameters which may be command-line arguments and/or lines in a -//text file. 
-// -// Scenario: You want your program to accept parameters that are specified -// as command-line arguments and/or as lines in a text file (such -// as a YAML output file). i.e., your program can be run like this: -// % program.exe foo=3.14159 bar: 42 -// or -// % program.exe input_file=params.txt -// or -// % program.exe foo=3.14159 input_file = params.txt -// -//Example: -// Here is example code to obtain parameters using the 3 functions -// 'read_args_into_string', 'read_file_into_string' and 'parse_parameter': -// -// std::string arg_string; -// -// //put command-line-arguments into 'arg_string': -// read_args_into_string(argc, argv, arg_string); -// -// //do the command-line-arguments specify an 'input_file'? -// std::string filename = -// parse_parameter(arg_string,"input_file","none-specified"); -// -// if (filename != "none-specified") { -// std::string tmp; -// read_file_into_string(filename, tmp); -// arg_string += tmp; -// } -// -// //now parse the parameters: -// float foo = parse_parameter(arg_string, "foo", -9.9); -// int bar = parse_parameter(arg_string, "bar", -1); -// -//See the comments below for parse_parameter, for formatting requirements of -//named parameter-value pairs. -// - -namespace Mantevo { - -/** - * Concatenate command-line arguments into a single string. - * - * Note: this function is purely serial. If argc and argv have different - * values on different MPI processes, then you need to resolve that by - * broadcasting arg_string's contents. - */ -void read_args_into_string(int argc, char** argv, std::string& arg_string); - -/** - * Read the contents of a text-file into a single string. - * - * Note: this function is purely serial. If you want file_contents on multiple - * MPI processes, you need to broadcast it (or call this function on each - * MPI process...). - */ -void read_file_into_string(const std::string& filename, - std::string& file_contents); - -/** - * Parse a named parameter value from input 'arg_string'. - * - * Search 'arg_string' for an occurrence of param_name and attempt to parse - * a value into the return-type. If param_name is not found, then default_value - * is returned. - * - * Example: - * arg_string = "foo = 3.14159"; - * float foo = parse_parameter(arg_string, "foo", -999.9); - * //foo should now contain the value 3.14159; if 'foo' was not found in - * //arg_string, then -999.9 would have been returned. - * - * Other legal name-value separators are ':' and ' '. Extra spaces are also ok, - * e.g. "foo : 3.114159". - * - * Note that if a YAML file is read into a string, that would be a valid input - * string for this function. 
- */ -template -T parse_parameter(const std::string& arg_string, - const std::string& param_name, - const T& default_value) -{ - std::string::size_type pos = arg_string.find(param_name); - if (pos == std::string::npos) { - //if param_name is not found in arg_string, return default_value: - return default_value; - } - - pos += param_name.size(); - - if (arg_string.size() <= pos) return default_value; - - //skip past ' ', '=' or ':': - while(pos < arg_string.size() && - (arg_string[pos] == ' ' || - arg_string[pos] == '=' || - arg_string[pos] == ':')) - { - ++pos; - } - - if (arg_string[pos] == '=' || arg_string[pos] == ':') ++pos; - - std::string str = arg_string.substr(pos); - - std::istringstream isstr(str); - - T return_val = default_value; - - //parse value into return_val: - isstr >> return_val; - - //if parse failed, return default_value: - if (!isstr) return default_value; - - return return_val; -} - -}//namespace Mantevo - -#endif - diff --git a/kokkos/common/vectorTests.hpp b/kokkos/common/vectorTests.hpp deleted file mode 100644 index 53ce682..0000000 --- a/kokkos/common/vectorTests.hpp +++ /dev/null @@ -1,66 +0,0 @@ - -//@HEADER -// ************************************************************************ -// -// Mantevo: A collection of mini-applications for HPC -// Copyright (2008) Sandia Corporation -// -// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive -// license for use of this work by or on behalf of the U.S. Government. -// -// This library is free software; you can redistribute it and/or modify -// it under the terms of the GNU Lesser General Public License as -// published by the Free Software Foundation; either version 2.1 of the -// License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, but -// WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 -// USA -// Questions? Contact Michael A. Heroux (maherou@sandia.gov) -// -// ************************************************************************ -//@HEADER -#include -#include "mytimer.hpp" -template -int vectorTests(int numTrials, const std::vector & x, std::vector & y, - std::vector & times) { - - Scalar alpha = 2.0; - double t0; - - size_t n = x.size(); - - double tstart = mytimer(); // Initial time - - t0 = mytimer(); - for (int j=0;jcollective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) - * \ingroup BlockModule - * - * \par Overview - * A set of "head flags" (or "tail flags") is often used to indicate corresponding items - * that differ from their predecessors (or successors). For example, head flags are convenient - * for demarcating disjoint data segments as part of a segmented scan or reduction. 
- * - * \tparam T The data type to be flagged. - * \tparam BLOCK_THREADS The thread block size in threads. - * - * \par A Simple Example - * \blockcollective{BlockDiscontinuity} - * \par - * The code snippet below illustrates the head flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \par Performance Considerations - * - Zero bank conflicts for most types. - * - */ -template < - typename T, - int BLOCK_THREADS> -class BlockDiscontinuity -{ -private: - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type (last element from each thread's input) - typedef T _TempStorage[BLOCK_THREADS]; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /// Specialization for when FlagOp has third index param - template ::HAS_PARAM> - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(a, b, idx); - } - }; - - /// Specialization for when FlagOp does not have a third index param - template - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(a, b); - } - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - -public: - - /// \smemstorage{BlockDiscontinuity} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. 
- */ - __device__ __forceinline__ BlockDiscontinuity() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockDiscontinuity( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. - */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Head flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. - * - * The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * Furthermore, head_flagsi is always set for - * input>0 in thread0. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
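Read sequentially, the head-flag rule documented above reduces to a short reference loop. A host-side sketch of what this FlagHeads overload computes over the concatenated tile (illustration of the semantics only, not the shared-memory implementation that follows):

    #include <vector>

    // Reference semantics for FlagHeads without a tile predecessor: the first
    // item is always flagged; every other item is flagged when
    // flag_op(previous_item, current_item) is true (here: cub::Inequality()).
    std::vector<int> flag_heads_reference(const std::vector<int> &tile)
    {
        std::vector<int> head_flags(tile.size(), 0);
        if (tile.empty()) return head_flags;
        head_flags[0] = 1;                                      // no predecessor: always flagged
        for (std::size_t i = 1; i < tile.size(); ++i)
            head_flags[i] = (tile[i - 1] != tile[i]) ? 1 : 0;
        return head_flags;
    }

For the documented input {0,0,1,1, 1,1,1,1, 2,3,3,3, ...} this produces {1,0,1,0, 0,0,0,0, 1,1,0,0, ...}, matching the snippet above.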
- * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share last item - temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - __syncthreads(); - - // Set flag for first item - head_flags[0] = (linear_tid == 0) ? - 1 : // First thread - ApplyOp::Flag( - flag_op, - temp_storage[linear_tid - 1], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set head_flags for remaining items - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++) - { - head_flags[ITEM] = ApplyOp::Flag( - flag_op, - input[ITEM - 1], - input[ITEM], - (linear_tid * ITEMS_PER_THREAD) + ITEM); - } - } - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. - * - * The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * For thread0, item input0 is compared - * against \p tile_predecessor_item. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads( - * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, - * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be - * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
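The tile_predecessor_item overload is what makes multi-tile processing seamless: thread0 supplies the last element of the previous tile, so segment heads are not re-flagged at tile boundaries. A hypothetical kernel sketch under assumed sizes (128 threads, 4 items per thread); the kernel name and the choice to fetch the predecessor from global memory are illustrative, not from the original source:

    #include <cub/cub.cuh>   // umbrella header for BlockDiscontinuity and cub::Inequality

    __global__ void MultiTileFlagHeads(const int *d_in, int *d_head_flags, int num_tiles)
    {
        typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuityT;
        __shared__ typename BlockDiscontinuityT::TempStorage temp_storage;

        const int TILE_ITEMS = 128 * 4;

        for (int tile = 0; tile < num_tiles; ++tile)
        {
            int tile_base = tile * TILE_ITEMS;

            // Blocked load of this thread's 4 consecutive items
            int thread_data[4];
            for (int i = 0; i < 4; ++i)
                thread_data[i] = d_in[tile_base + threadIdx.x * 4 + i];

            // Predecessor for thread0: the last item of the previous tile; for the
            // very first tile, use a value guaranteed to differ so item 0 is flagged.
            int tile_predecessor = (tile == 0) ? d_in[0] - 1 : d_in[tile_base - 1];

            int head_flags[4];
            BlockDiscontinuityT(temp_storage).FlagHeads(
                head_flags, thread_data, cub::Inequality(), tile_predecessor);

            for (int i = 0; i < 4; ++i)
                d_head_flags[tile_base + threadIdx.x * 4 + i] = head_flags[i];

            __syncthreads();   // temp_storage is reused by the next iteration
        }
    }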
- * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - { - // Share last item - temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - __syncthreads(); - - // Set flag for first item - int predecessor = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage[linear_tid - 1]; - - head_flags[0] = ApplyOp::Flag( - flag_op, - predecessor, - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for remaining items - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ITEM++) - { - head_flags[ITEM] = ApplyOp::Flag( - flag_op, - input[ITEM - 1], - input[ITEM], - (linear_tid * ITEMS_PER_THREAD) + ITEM); - } - } - - - //@} end member group - /******************************************************************//** - * \name Tail flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. - * - * The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * Furthermore, tail_flagsITEMS_PER_THREAD-1 is always - * set for threadBLOCK_THREADS-1. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. - * The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
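Tail flags are the mirror image of head flags, and they directly yield run lengths, which is how discontinuity flagging feeds segmented reductions and the sort-based histogram later in this hunk. A host-side reference of the FlagTails rule documented above, plus the run-length extraction it enables (illustration only):

    #include <vector>

    // Reference semantics for FlagTails without a tile successor: the last item is
    // always flagged; every other item is flagged when flag_op(item, next_item) is true.
    std::vector<int> flag_tails_reference(const std::vector<int> &tile)
    {
        std::vector<int> tail_flags(tile.size(), 0);
        if (tile.empty()) return tail_flags;
        for (std::size_t i = 0; i + 1 < tile.size(); ++i)
            tail_flags[i] = (tile[i] != tile[i + 1]) ? 1 : 0;   // cub::Inequality()
        tail_flags[tile.size() - 1] = 1;                        // no successor: always flagged
        return tail_flags;
    }

    // A run ends wherever a tail flag is set, so run lengths fall out directly.
    std::vector<int> run_lengths_from_tails(const std::vector<int> &tail_flags)
    {
        std::vector<int> lengths;
        int current = 0;
        for (std::size_t i = 0; i < tail_flags.size(); ++i)
        {
            ++current;
            if (tail_flags[i]) { lengths.push_back(current); current = 0; }
        }
        return lengths;
    }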
- * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first item - temp_storage[linear_tid] = input[0]; - - __syncthreads(); - - // Set flag for last item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::Flag( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1)); - - // Set flags for remaining items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) - { - tail_flags[ITEM] = ApplyOp::Flag( - flag_op, - input[ITEM], - input[ITEM + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEM); - } - } - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. - * - * The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_predecessor_item. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... - * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * tail_flags, thread_data, cub::Inequality(), tile_successor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - { - // Share first item - temp_storage[linear_tid] = input[0]; - - __syncthreads(); - - // Set flag for last item - int successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::Flag( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1)); - - // Set flags for remaining items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ITEM++) - { - tail_flags[ITEM] = ApplyOp::Flag( - flag_op, - input[ITEM], - input[ITEM + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEM); - } - } - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/block/block_exchange.cuh b/kokkos/kokkos/TPL/cub/block/block_exchange.cuh deleted file mode 100644 index b7b9534..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_exchange.cuh +++ /dev/null @@ -1,918 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_arch.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) - * \ingroup BlockModule - * - * \par Overview - * It is commonplace for blocks of threads to rearrange data items between - * threads. For example, the global memory subsystem prefers access patterns - * where data items are "striped" across threads (where consecutive threads access consecutive items), - * yet most block-wide operations prefer a "blocked" partitioning of items across threads - * (where consecutive items belong to a single thread). - * - * \par - * BlockExchange supports the following types of data exchanges: - * - Transposing between [blocked](index.html#sec5sec4) and [striped](index.html#sec5sec4) arrangements - * - Transposing between [blocked](index.html#sec5sec4) and [warp-striped](index.html#sec5sec4) arrangements - * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec4) - * - Scattering ranked items to a [striped arrangement](index.html#sec5sec4) - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. - * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) - * - * \par A Simple Example - * \blockcollective{BlockExchange} - * \par - * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of data striped across threads - * int thread_data[4]; - * cub::LoadStriped(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).StripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - * \par Performance Considerations - * - Proper device-specific padding ensures zero bank conflicts for most types. - * - */ -template < - typename T, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - bool WARP_TIME_SLICING = false> -class BlockExchange -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - enum - { - LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, - - LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding if the number of items per thread is a power of two - INSERT_PADDING = ((ITEMS_PER_THREAD & (ITEMS_PER_THREAD - 1)) == 0), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS]; - -public: - - /// \smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - int warp_lane; - int warp_id; - int warp_offset; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. 
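The difference between the arrangements that BlockExchange converts between is purely an index mapping over the tile. A small host-side sketch that prints which tile positions a given thread owns in each arrangement, using the 128-thread / 4-item configuration from the snippet above (the printed values are only an illustration of the mapping):

    #include <cstdio>

    int main()
    {
        const int BLOCK_THREADS    = 128;
        const int ITEMS_PER_THREAD = 4;

        int tid = 5;   // pick any thread id to inspect

        // Blocked arrangement: consecutive items belong to a single thread.
        std::printf("blocked: ");
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf("%d ", tid * ITEMS_PER_THREAD + i);   // 20 21 22 23

        // Striped arrangement: consecutive threads own consecutive items.
        std::printf("\nstriped: ");
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf("%d ", i * BLOCK_THREADS + tid);      // 5 133 261 389

        std::printf("\n");
        return 0;
    }

This is exactly the { [0,128,256,384], [1,129,257,385], ... } versus { [0,1,2,3], [4,5,6,7], ... } correspondence shown in the documentation example above.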
- */ - __device__ __forceinline__ void BlockedToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and striped arrangements. - Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. - */ - __device__ __forceinline__ void BlockedToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and striped arrangements. - Int2Type time_slicing) - { - T temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - __syncthreads(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - } - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing - */ - __device__ __forceinline__ void BlockedToWarpStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. - Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing - */ - __device__ __forceinline__ void BlockedToWarpStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. 
- Int2Type time_slicing) - { - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) - { - __syncthreads(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. - */ - __device__ __forceinline__ void StripedToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - __syncthreads(); - - // No timeslicing - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. - */ - __device__ __forceinline__ void StripedToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - Int2Type time_slicing) - { - // Warp time-slicing - T temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - } - } - - __syncthreads(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing - */ - __device__ __forceinline__ void WarpStripedToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. 
- Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (warp_lane * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing - */ - __device__ __forceinline__ void WarpStripedToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. - Int2Type time_slicing) - { - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) - { - __syncthreads(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + warp_lane; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage[item_offset] = items[ITEM]; - } - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (warp_lane * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - items[ITEM] = temp_storage[item_offset]; - } - } - } - } - - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. - */ - __device__ __forceinline__ void ScatterToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage[item_offset] = items[ITEM]; - } - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - items[ITEM] = temp_storage[item_offset]; - } - } - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. 
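Each transpose variant above optionally pads its shared-memory offsets (INSERT_PADDING) so that the power-of-two strides produced by ITEMS_PER_THREAD do not land repeatedly in the same shared-memory bank; the adjustment is always the same, shift the offset right by LOG_SMEM_BANKS and add it back in (SHR_ADD fuses this on the device). A host-side illustration, assuming 4-byte elements and 32 banks (PtxArchProps supplies the real values at compile time):

    #include <cstdio>

    // Mirrors: item_offset += item_offset >> LOG_SMEM_BANKS
    int padded_offset(int item_offset, int log_smem_banks)
    {
        return item_offset + (item_offset >> log_smem_banks);
    }

    int main()
    {
        const int LOG_SMEM_BANKS = 5;   // 32 banks assumed
        // Raw offsets 0,32,64,96 would all hit bank 0; padding spreads them out.
        for (int k = 0; k < 4; ++k)
        {
            int raw = k * 32;
            int pad = padded_offset(raw, LOG_SMEM_BANKS);
            std::printf("raw %3d -> padded %3d (bank %d)\n", raw, pad, pad % 32);
        }
        return 0;
    }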
- */ - __device__ __forceinline__ void ScatterToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type time_slicing) - { - T temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - __syncthreads(); - - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage[item_offset] = items[ITEM]; - } - } - - __syncthreads(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (warp_lane * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_items[ITEM] = temp_storage[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. - */ - __device__ __forceinline__ void ScatterToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type time_slicing) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage[item_offset] = items[ITEM]; - } - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - items[ITEM] = temp_storage[item_offset]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. 
- */ - __device__ __forceinline__ void ScatterToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type time_slicing) - { - T temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage[item_offset] = items[ITEM]; - } - } - - __syncthreads(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = temp_items[ITEM]; - } - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockExchange() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x), - warp_lane(linear_tid & (WARP_THREADS - 1)), - warp_id(linear_tid >> LOG_WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x), - warp_lane(linear_tid & (WARP_THREADS - 1)), - warp_id(linear_tid >> LOG_WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockExchange( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid), - warp_lane(linear_tid & (WARP_THREADS - 1)), - warp_id(linear_tid >> LOG_WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
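As with BlockDiscontinuity, the four constructors that follow differ only in where the temporary storage comes from and how the calling thread derives its linear id; for anything other than a 1D block the caller supplies the linearization itself. A hypothetical sketch for a 16x8 (2D) thread block, using the linearization formula suggested in the parameter comments; the kernel name and sizes are assumptions:

    #include <cub/cub.cuh>

    // Hypothetical: a 16x8 thread block still presents 128 linear threads to
    // BlockExchange by passing an explicit linear_tid.
    __global__ void TwoDimBlockKernel(int *d_data)
    {
        typedef cub::BlockExchange<int, 128, 4> BlockExchangeT;
        __shared__ typename BlockExchangeT::TempStorage temp_storage;

        int linear_tid = threadIdx.y * blockDim.x + threadIdx.x;   // 0..127 for blockDim = (16,8)

        // Striped load keyed on the linear id
        int thread_data[4];
        for (int i = 0; i < 4; ++i)
            thread_data[i] = d_data[i * 128 + linear_tid];

        // Pass both the user-provided storage and the explicit linear id
        BlockExchangeT(temp_storage, linear_tid).StripedToBlocked(thread_data);

        // thread_data now holds 4 consecutive tile elements per linear thread id.
    }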
- */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid), - warp_lane(linear_tid & (WARP_THREADS - 1)), - warp_id(linear_tid >> LOG_WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - //@} end member group - /******************************************************************//** - * \name Structured exchanges - *********************************************************************/ - //@{ - - /** - * \brief Transposes data items from striped arrangement to blocked arrangement. - * - * \smemreuse - * - * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a striped arrangement across block threads - * int thread_data[4]; - * cub::LoadStriped(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).StripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from global memory. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void StripedToBlocked( - T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - StripedToBlocked(items, Int2Type()); - } - - /** - * \brief Transposes data items from blocked arrangement to striped arrangement. - * - * \smemreuse - * - * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively exchange data into a striped arrangement across threads - * BlockExchange(temp_storage).BlockedToStriped(thread_data); - * - * // Store data striped across block threads into an ordered tile - * cub::StoreStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
- * The corresponding output \p thread_data in those threads will be - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in - * preparation for storing to global memory. - * - */ - __device__ __forceinline__ void BlockedToStriped( - T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between blocked and striped arrangements. - { - BlockedToStriped(items, Int2Type()); - } - - - /** - * \brief Transposes data items from warp-striped arrangement to blocked arrangement. - * - * \smemreuse - * - * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a warp-striped arrangement across warp threads - * int thread_data[4]; - * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of warp-striped input \p thread_data across the block of threads is - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * after loading from global memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void WarpStripedToBlocked( - T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. - { - WarpStripedToBlocked(items, Int2Type()); - } - - /** - * \brief Transposes data items from blocked arrangement to warp-striped arrangement. - * - * \smemreuse - * - * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively exchange data into a warp-striped arrangement across threads - * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data); - * - * // Store data striped across warp threads into an ordered tile - * cub::StoreStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * in preparation for storing to global memory. 
(The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * - */ - __device__ __forceinline__ void BlockedToWarpStriped( - T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. - { - BlockedToWarpStriped(items, Int2Type()); - } - - - //@} end member group - /******************************************************************//** - * \name Scatter exchanges - *********************************************************************/ - //@{ - - - /** - * \brief Exchanges data items annotated by rank into blocked arrangement. - * - * \smemreuse - */ - __device__ __forceinline__ void ScatterToBlocked( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToBlocked(items, ranks, Int2Type()); - } - - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. - * - * \smemreuse - */ - __device__ __forceinline__ void ScatterToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - int ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStriped(items, ranks, Int2Type()); - } - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_histogram.cuh b/kokkos/kokkos/TPL/cub/block/block_histogram.cuh deleted file mode 100644 index dd346e3..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_histogram.cuh +++ /dev/null @@ -1,414 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
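The scatter exchanges at the end of BlockExchange take a per-item rank and are typically fed by a block-wide ranking step (for example, radix ranking during a sort). A self-contained hypothetical use: reversing a 512-item tile by treating each element's reversed position as its scatter rank. The kernel name and the reversal use-case are illustrative only:

    #include <cub/cub.cuh>

    __global__ void ReverseTileKernel(const int *d_in, int *d_out)
    {
        const int BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4;
        const int TILE_ITEMS    = BLOCK_THREADS * ITEMS_PER_THREAD;

        typedef cub::BlockExchange<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeT;
        __shared__ typename BlockExchangeT::TempStorage temp_storage;

        int items[ITEMS_PER_THREAD];
        int ranks[ITEMS_PER_THREAD];

        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int blocked_idx = threadIdx.x * ITEMS_PER_THREAD + i;   // this item's tile position
            items[i] = d_in[blocked_idx];
            ranks[i] = TILE_ITEMS - 1 - blocked_idx;                // where it should end up
        }

        // After the scatter, thread t holds positions [t*4, t*4+3] of the reversed tile.
        BlockExchangeT(temp_storage).ScatterToBlocked(items, ranks);

        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            d_out[threadIdx.x * ITEMS_PER_THREAD + i] = items[i];
    }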
- * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_histogram_sort.cuh" -#include "specializations/block_histogram_atomic.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. - */ -enum BlockHistogramAlgorithm -{ - - /** - * \par Overview - * Sorting followed by differentiation. Execution is comprised of two phases: - * -# Sort the data using efficient radix sort - * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. - * - * \par Performance Considerations - * Delivers consistent throughput regardless of sample bin distribution. - */ - BLOCK_HISTO_SORT, - - - /** - * \par Overview - * Use atomic addition to update byte counts directly - * - * \par Performance Considerations - * Performance is strongly tied to the hardware implementation of atomic - * addition, and may be significantly degraded for non uniformly-random - * input distributions where many concurrent updates are likely to be - * made to the same bin counter. - */ - BLOCK_HISTO_ATOMIC, -}; - - - -/****************************************************************************** - * Block histogram - ******************************************************************************/ - - -/** - * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) - * \ingroup BlockModule - * - * \par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * \par - * Optionally, BlockHistogram can be specialized to use different algorithms: - * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) - * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) - * - * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam BINS The number bins within the histogram - * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) - * - * \par A Simple Example - * \blockcollective{BlockHistogram} - * \par - * The code snippet below illustrates a 256-bin histogram of 512 integer samples that - * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char data[4]; - * ... - * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(data, smem_histogram); - * - * \endcode - * - * \par Performance and Usage Considerations - * - The histogram output can be constructed in shared or global memory - * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives - * - */ -template < - typename T, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - int BINS, - BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT> -class BlockHistogram -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /** - * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used - * regardless. - */ - static const BlockHistogramAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (CUB_PTX_ARCH < 120)) ? - BLOCK_HISTO_SORT : - ALGORITHM; - - /// Internal specialization. - typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), - BlockHistogramSort, - BlockHistogramAtomic >::Type InternalBlockHistogram; - - /// Shared memory storage layout type for BlockHistogram - typedef typename InternalBlockHistogram::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{BlockHistogram} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockHistogram() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. 
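The ALGORITHM template parameter described above is only a request: as the SAFE_ALGORITHM logic shows, BLOCK_HISTO_ATOMIC silently degrades to BLOCK_HISTO_SORT when compiling for devices older than sm_12 (no shared-memory atomics). A hypothetical kernel that asks for the atomic variant explicitly, with sizes mirroring the documentation snippets; the kernel name and the final write-out loop are assumptions:

    #include <cub/cub.cuh>

    __global__ void AtomicHistogramKernel(const unsigned char *d_samples, unsigned int *d_histogram)
    {
        // Request BLOCK_HISTO_ATOMIC; pre-sm_12 targets fall back to BLOCK_HISTO_SORT.
        typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> BlockHistogramT;

        __shared__ typename BlockHistogramT::TempStorage temp_storage;
        __shared__ unsigned int smem_histogram[256];

        // Blocked load of 4 samples per thread
        unsigned char samples[4];
        for (int i = 0; i < 4; ++i)
            samples[i] = d_samples[threadIdx.x * 4 + i];

        // Zero the bins, then composite this tile's samples into them
        BlockHistogramT(temp_storage).Histogram(samples, smem_histogram);

        __syncthreads();

        // Write the block-wide counts out (2 bins per thread for 256 bins / 128 threads)
        for (int bin = threadIdx.x; bin < 256; bin += 128)
            d_histogram[bin] = smem_histogram[bin];
    }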
- */ - __device__ __forceinline__ BlockHistogram( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockHistogram( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. - */ - __device__ __forceinline__ BlockHistogram( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - //@} end member group - /******************************************************************//** - * \name Histogram operations - *********************************************************************/ - //@{ - - - /** - * \brief Initialize the shared histogram counters to zero. - * - * The code snippet below illustrates a the initialization and update of a - * histogram of 512 integer samples that are partitioned across 128 threads - * where each thread owns 4 samples. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... - * - * // Initialize the block-wide histogram - * BlockHistogram(temp_storage).InitHistogram(smem_histogram); - * - * // Update the block-wide histogram - * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam HistoCounter [inferred] Histogram counter type - */ - template - __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS]) - { - // Initialize histogram bin counts to zeros - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - histogram[histo_offset + linear_tid] = 0; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - histogram[histo_offset + linear_tid] = 0; - } - } - - - /** - * \brief Constructs a block-wide histogram in shared/global memory. Each thread contributes an array of input elements. - * - * \smemreuse - * - * The code snippet below illustrates a 256-bin histogram of 512 integer samples that - * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... - * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam HistoCounter [inferred] Histogram counter type - */ - template < - typename HistoCounter> - __device__ __forceinline__ void Histogram( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram - { - // Initialize histogram bin counts to zeros - InitHistogram(histogram); - - // Composite the histogram - InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram); - } - - - - /** - * \brief Updates an existing block-wide histogram in shared/global memory. Each thread composites an array of input elements. - * - * \smemreuse - * - * The code snippet below illustrates a the initialization and update of a - * histogram of 512 integer samples that are partitioned across 128 threads - * where each thread owns 4 samples. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... - * - * // Initialize the block-wide histogram - * BlockHistogram(temp_storage).InitHistogram(smem_histogram); - * - * // Update the block-wide histogram - * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam HistoCounter [inferred] Histogram counter type - */ - template < - typename HistoCounter> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram - { - InternalBlockHistogram(temp_storage, linear_tid).Composite(items, histogram); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_load.cuh b/kokkos/kokkos/TPL/cub/block/block_load.cuh deleted file mode 100644 index e645bcd..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_load.cuh +++ /dev/null @@ -1,1122 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Operations for reading linear tiles of data into the CUDA thread block. - */ - -#pragma once - -#include - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_vector.cuh" -#include "../thread/thread_load.cuh" -#include "block_exchange.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup IoModule - * @{ - */ - - -/******************************************************************//** - * \name Blocked I/O - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier. - * - * \blocked - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - // Load directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ThreadLoad(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM); - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier, guarded by range. - * - * \blocked - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (ITEM < bounds) - { - items[ITEM] = ThreadLoad(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM); - } - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements.. - * - * \blocked - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items -{ - int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = (ITEM < bounds) ? - ThreadLoad(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM) : - oob_default; - } -} - - - -//@} end member group -/******************************************************************//** - * \name Striped I/O - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier. - * - * \striped - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
- */ -template < - PtxLoadModifier MODIFIER, - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ThreadLoad(block_itr + (ITEM * BLOCK_THREADS) + linear_tid); - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier, guarded by range - * - * \striped - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - int bounds = valid_items - linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (ITEM * BLOCK_THREADS < bounds) - { - items[ITEM] = ThreadLoad(block_itr + linear_tid + (ITEM * BLOCK_THREADS)); - } - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \striped - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items -{ - int bounds = valid_items - linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = (ITEM * BLOCK_THREADS < bounds) ? 
- ThreadLoad(block_itr + linear_tid + (ITEM * BLOCK_THREADS)) : - oob_default; - } -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped I/O - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); - int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; - int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ThreadLoad(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)); - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). 
- */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); - int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; - int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; - int bounds = valid_items - warp_offset - tid; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * PtxArchProps::WARP_THREADS) < bounds) - { - items[ITEM] = ThreadLoad(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)); - } - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block using the specified cache modifier, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorRA [inferred] The random-access iterator type for input (may be a simple pointer type). - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename InputIteratorRA> -__device__ __forceinline__ void LoadWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items -{ - int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); - int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; - int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; - int bounds = valid_items - warp_offset - tid; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ((ITEM * PtxArchProps::WARP_THREADS) < bounds) ? - ThreadLoad(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS)) : - oob_default; - } -} - - - -//@} end member group -/******************************************************************//** - * \name Blocked, vectorized I/O - *********************************************************************/ -//@{ - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block using the specified cache modifier. 
- * - * \blocked - * - * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned - * - * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - * - * \tparam MODIFIER cub::PtxLoadModifier cache modifier. - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ -template < - PtxLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void LoadBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - enum - { - // Maximum CUDA vector size is 4 elements - MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), - - // Vector size must be a power of two and an even divisor of the items per thread - VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? - MAX_VEC_SIZE : - 1, - - VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, - }; - - // Vector type - typedef typename VectorHelper::Type Vector; - - // Alias local data (use raw_items array here which should get optimized away to prevent conservative PTXAS lmem spilling) - T raw_items[ITEMS_PER_THREAD]; - - // Direct-load using vector types - LoadBlocked( - linear_tid, - reinterpret_cast(block_ptr), - reinterpret_cast(raw_items)); - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = raw_items[ITEM]; - } -} - - -//@} end member group - -/** @} */ // end group IoModule - - - -//----------------------------------------------------------------------------- -// Generic BlockLoad abstraction -//----------------------------------------------------------------------------- - -/** - * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. - */ -enum BlockLoadAlgorithm -{ - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec4) of data is read - * directly from memory. The thread block reads items in a parallel "raking" fashion: threadi - * reads the ith segment of consecutive elements. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) decreases as the - * access stride between threads increases (i.e., the number items per thread). - */ - BLOCK_LOAD_DIRECT, - - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec4) of data is read directly - * from memory using CUDA's built-in vectorized loads as a coalescing optimization. - * The thread block reads items in a parallel "raking" fashion: threadi uses vector loads to - * read the ith segment of consecutive elements. - * - * For example, ld.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high until the the - * access stride between threads (i.e., the number items per thread) exceeds the - * maximum vector load width (typically 4 items or 64B, whichever is lower). 
- * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p InputIteratorRA is not a simple pointer type - * - The block input offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - */ - BLOCK_LOAD_VECTORIZE, - - /** - * \par Overview - * - * A [striped arrangement](index.html#sec5sec4) of data is read - * directly from memory and then is locally transposed into a - * [blocked arrangement](index.html#sec5sec4). The thread block - * reads items in a parallel "strip-mining" fashion: - * threadi reads items having stride \p BLOCK_THREADS - * between them. cub::BlockExchange is then used to locally reorder the items - * into a [blocked arrangement](index.html#sec5sec4). - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. - */ - BLOCK_LOAD_TRANSPOSE, - - - /** - * \par Overview - * - * A [warp-striped arrangement](index.html#sec5sec4) of data is read - * directly from memory and then is locally transposed into a - * [blocked arrangement](index.html#sec5sec4). Each warp reads its own - * contiguous segment in a parallel "strip-mining" fashion: lanei - * reads items having stride \p WARP_THREADS between them. cub::BlockExchange - * is then used to locally reorder the items into a - * [blocked arrangement](index.html#sec5sec4). - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. - */ - BLOCK_LOAD_WARP_TRANSPOSE, -}; - - -/** - * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec4) across a CUDA thread block. ![](block_load_logo.png) - * \ingroup BlockModule - * - * \par Overview - * The BlockLoad class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - * \par - * Optionally, BlockLoad can be specialized by different data movement strategies: - * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec4) - * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec4) - * of data is read directly from memory using CUDA's built-in vectorized loads as a - * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec4) - * of data is read directly from memory and is then locally transposed into a - * [blocked arrangement](index.html#sec5sec4). [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. 
A [warp-striped arrangement](index.html#sec5sec4) - * of data is read directly from memory and is then locally transposed into a - * [blocked arrangement](index.html#sec5sec4). [More...](\ref cub::BlockLoadAlgorithm) - * - * \tparam InputIteratorRA The input iterator type (may be a simple pointer type). - * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. - * \tparam MODIFIER [optional] cub::PtxLoadModifier cache modifier. default: cub::LOAD_DEFAULT. - * \tparam WARP_TIME_SLICING [optional] For transposition-based cub::BlockLoadAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false) - * - * \par A Simple Example - * \blockcollective{BlockLoad} - * \par - * The code snippet below illustrates the loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockLoad for 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
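For reference, the simple example above can be written as a fully self-contained kernel. This is an illustrative sketch: the include path is assumed, and the explicit specialization just fills in the <InputIteratorRA, BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM> parameters of the class template that follows with the values the example describes (an int* iterator, 128 threads, 4 items per thread, warp-transpose loading).

#include <cub/cub.cuh>

// Launch with 128 threads per block.
__global__ void ExampleKernel(int *d_data)
{
    // Specialize BlockLoad for 128 threads owning 4 consecutive ints each,
    // loaded warp-striped and then locally transposed into a blocked arrangement
    typedef cub::BlockLoad<int*, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;

    // Shared memory for the collective's temporary storage
    __shared__ typename BlockLoad::TempStorage temp_storage;

    // Load a segment of consecutive items that are blocked across threads
    int thread_data[4];
    BlockLoad(temp_storage).Load(d_data, thread_data);

    // With d_data = 0, 1, 2, ..., thread 0 now holds [0,1,2,3], thread 1 holds [4,5,6,7], etc.
}

The guarded overloads described later follow the same pattern, taking an additional valid_items count and, optionally, an out-of-bounds default value.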
- * - */ -template < - typename InputIteratorRA, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, - PtxLoadModifier MODIFIER = LOAD_DEFAULT, - bool WARP_TIME_SLICING = false> -class BlockLoad -{ -private: - - /****************************************************************************** - * Constants and typed definitions - ******************************************************************************/ - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - - /****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - /// Load helper - template - struct LoadInternal; - - - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, guarded by range - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Load( - T *block_ptr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) - template < - typename T, - typename _InputIteratorRA> - __device__ __forceinline__ void Load( - _InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, 
guarded by range (skips vectorization) - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); - } - - /// Load a linear segment of items from memory, guarded by range - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage, linear_tid).StripedToBlocked(items); - } - - }; - - - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - enum - { - WARP_THREADS = PtxArchProps::WARP_THREADS - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; - - /// Alias 
wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); - } - - /// Load a linear segment of items from memory, guarded by range - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); - } - - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage, linear_tid).WarpStripedToBlocked(items); - } - }; - - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef LoadInternal InternalLoad; - - - /// Shared memory storage layout type - typedef typename InternalLoad::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - /// \smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. 
- */ - __device__ __forceinline__ BlockLoad() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockLoad( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. - */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Load a linear segment of items from memory. - * - * \blocked - * - * The code snippet below illustrates the loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockLoad for 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range. - * - * \blocked - * - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. 
The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads - * being unmasked to load portions of valid data (and other items remaining unassigned). - * - */ - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - * - * \blocked - * - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., - * \p valid_items is \p 5, and the out-of-bounds default is \p -1. 
- * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads - * being unmasked to load portions of valid data (and other items are assigned \p -1) - * - */ - __device__ __forceinline__ void Load( - InputIteratorRA block_itr, ///< [in] The thread block's base input iterator for loading from - T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - T oob_default) ///< [in] Default value to assign out-of-bound items - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); - } - - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_radix_rank.cuh b/kokkos/kokkos/TPL/cub/block/block_radix_rank.cuh deleted file mode 100644 index 149a62c..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_radix_rank.cuh +++ /dev/null @@ -1,479 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock - */ - -#pragma once - -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../thread/thread_reduce.cuh" -#include "../thread/thread_scan.cuh" -#include "../block/block_scan.cuh" -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock. - * \ingroup BlockModule - * - * \par Overview - * Blah... 
- * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 5 bits) - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * - * \par Usage Considerations - * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). - * - Assumes a [blocked arrangement](index.html#sec5sec4) of elements across threads - * - \smemreuse{BlockRadixRank::TempStorage} - * - * \par Performance Considerations - * - * \par Algorithm - * These parallel radix ranking variants have O(n) work complexity and are implemented in XXX phases: - * -# blah - * -# blah - * - * \par Examples - * \par - * - Example 1: Simple radix rank of 32-bit integer keys - * \code - * #include - * - * template - * __global__ void ExampleKernel(...) - * { - * - * \endcode - */ -template < - int BLOCK_THREADS, - int RADIX_BITS, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte> -class BlockRadixRank -{ -private: - - /****************************************************************************** - * Type definitions and constants - ******************************************************************************/ - - // Integer type for digit counters (to be packed into words of type PackedCounters) - typedef unsigned short DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), - unsigned long long, - unsigned int>::Type PackedCounter; - - enum - { - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // The number of packed counters per thread (plus one for padding) - RAKING_SEGMENT = COUNTER_LANES + 1, - - LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - }; - - - /// BlockScan type - typedef BlockScan BlockScan; - - - /// Shared memory storage layout type for BlockRadixRank - struct _TempStorage - { - // Storage for scanning local ranks - typename BlockScan::TempStorage block_scan; - - union - { - DigitCounter digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; - }; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int 
linear_tid; - - /// Copy of raking segment, promoted to registers - PackedCounter cached_segment[RAKING_SEGMENT]; - - - /****************************************************************************** - * Templated iteration - ******************************************************************************/ - - // General template iteration - template - struct Iterate - { - /** - * Decode keys. Decodes the radix digit from the current digit place - * and increments the thread's corresponding counter in shared - * memory for that digit. - * - * Saves both (1) the prior value of that counter (the key's - * thread-local exclusive prefix sum for that digit), and (2) the shared - * memory offset of the counter (for later use). - */ - template - static __device__ __forceinline__ void DecodeKeys( - BlockRadixRank &cta, // BlockRadixRank instance - UnsignedBits (&keys)[KEYS_PER_THREAD], // Key to decode - DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value (out parameter) - DigitCounter* (&digit_counters)[KEYS_PER_THREAD], // Counter smem offset (out parameter) - int current_bit) // The least-significant bit position of the current digit to extract - { - // Add in sub-counter offset - UnsignedBits sub_counter = BFE(keys[COUNT], current_bit + LOG_COUNTER_LANES, LOG_PACKING_RATIO); - - // Add in row offset - UnsignedBits row_offset = BFE(keys[COUNT], current_bit, LOG_COUNTER_LANES); - - // Pointer to smem digit counter - digit_counters[COUNT] = &cta.temp_storage.digit_counters[row_offset][cta.linear_tid][sub_counter]; - - // Load thread-exclusive prefix - thread_prefixes[COUNT] = *digit_counters[COUNT]; - - // Store inclusive prefix - *digit_counters[COUNT] = thread_prefixes[COUNT] + 1; - - // Iterate next key - Iterate::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit); - } - - - // Termination - template - static __device__ __forceinline__ void UpdateRanks( - int (&ranks)[KEYS_PER_THREAD], // Local ranks (out parameter) - DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value - DigitCounter* (&digit_counters)[KEYS_PER_THREAD]) // Counter smem offset - { - // Add in threadblock exclusive prefix - ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT]; - - // Iterate next key - Iterate::UpdateRanks(ranks, thread_prefixes, digit_counters); - } - }; - - - // Termination - template - struct Iterate - { - // DecodeKeys - template - static __device__ __forceinline__ void DecodeKeys( - BlockRadixRank &cta, - UnsignedBits (&keys)[KEYS_PER_THREAD], - DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], - DigitCounter* (&digit_counters)[KEYS_PER_THREAD], - int current_bit) {} - - - // UpdateRanks - template - static __device__ __forceinline__ void UpdateRanks( - int (&ranks)[KEYS_PER_THREAD], - DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], - DigitCounter *(&digit_counters)[KEYS_PER_THREAD]) {} - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal storage allocator - */ - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Performs upsweep raking reduction, returning the aggregate - */ - __device__ __forceinline__ PackedCounter Upsweep() - { - PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid]; - PackedCounter *raking_ptr; - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data into 
registers - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - cached_segment[i] = smem_raking_ptr[i]; - } - raking_ptr = cached_segment; - } - else - { - raking_ptr = smem_raking_ptr; - } - - return ThreadReduce(raking_ptr, Sum()); - } - - - /// Performs exclusive downsweep raking scan - __device__ __forceinline__ void ExclusiveDownsweep( - PackedCounter raking_partial) - { - PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid]; - - PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? - cached_segment : - smem_raking_ptr; - - // Exclusive raking downsweep scan - ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data back to smem - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - smem_raking_ptr[i] = cached_segment[i]; - } - } - } - - - /** - * Reset shared memory digit counters - */ - __device__ __forceinline__ void ResetCounters() - { - // Reset shared memory digit counters - #pragma unroll - for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++) - { - *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0; - } - } - - - /** - * Scan shared memory digit counters. - */ - __device__ __forceinline__ void ScanCounters() - { - // Upsweep scan - PackedCounter raking_partial = Upsweep(); - - // Compute inclusive sum - PackedCounter inclusive_partial; - PackedCounter packed_aggregate; - BlockScan(temp_storage.block_scan, linear_tid).InclusiveSum(raking_partial, inclusive_partial, packed_aggregate); - - // Propagate totals in packed fields - #pragma unroll - for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) - { - inclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED); - } - - // Downsweep scan with exclusive partial - PackedCounter exclusive_partial = inclusive_partial - raking_partial; - ExclusiveDownsweep(exclusive_partial); - } - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockRadixRank() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockRadixRank( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
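Because the usage example in the BlockRadixRank class documentation above is left as a stub, the following hedged sketch spells out one plausible call sequence. The kernel name, key indexing, and the <128, 4> specialization (128 threads, 4 radix bits per digit place) are illustrative assumptions; the RankKeys signature is the one declared just below.

#include <cub/cub.cuh>

// Launch with 128 threads per block.
__global__ void RankKernel(const unsigned int *d_keys)
{
    // Rank keys across 128 threads, 4 radix bits per digit place (illustrative)
    typedef cub::BlockRadixRank<128, 4> BlockRadixRank;

    __shared__ typename BlockRadixRank::TempStorage temp_storage;

    // Each thread's tile of 4 keys in a blocked arrangement
    unsigned int keys[4];
    for (int i = 0; i < 4; ++i)
        keys[i] = d_keys[(threadIdx.x * 4) + i];

    // Local rank of each key within the tile, for the digit starting at bit 0
    int ranks[4];
    BlockRadixRank(temp_storage).RankKeys(keys, ranks, 0);

    // ranks[] can then drive a block-wide scatter (e.g., via BlockExchange)
    // to reorder the tile by the current digit.
}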
- */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Raking - *********************************************************************/ - //@{ - - /** - * \brief Rank keys. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit) ///< [in] The least-significant bit position of the current digit to extract - { - DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit - DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem - - // Reset shared memory digit counters - ResetCounters(); - - // Decode keys and update digit counters - Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit); - - __syncthreads(); - - // Scan shared memory counters - ScanCounters(); - - __syncthreads(); - - // Extract the local ranks of each key - Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters); - } - - - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int &inclusive_digit_prefix) ///< [out] The incluisve prefix sum for the digit threadIdx.x - { - // Rank keys - RankKeys(keys, ranks, current_bit); - - // Get the inclusive and exclusive digit totals corresponding to the calling thread. - if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS)) - { - // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the - // first counter column, resulting in unavoidable bank conflicts.) - int counter_lane = (linear_tid & (COUNTER_LANES - 1)); - int sub_counter = linear_tid >> (LOG_COUNTER_LANES); - inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter]; - } - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/block/block_radix_sort.cuh b/kokkos/kokkos/TPL/cub/block/block_radix_sort.cuh deleted file mode 100644 index 873d401..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_radix_sort.cuh +++ /dev/null @@ -1,608 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. - */ - - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "block_exchange.cuh" -#include "block_radix_rank.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. ![](sorting_logo.png) - * \ingroup BlockModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Within each key, the implementation treats fixed-length - * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. 
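// A minimal sketch of the order-preserving bit-wise transformations mentioned
// above, for 32-bit keys. These are illustrative stand-ins, not the library's
// exact KeyTraits code, but they capture the idea behind the TwiddleIn/TwiddleOut
// calls that appear in the sorting passes further down in this file.
__device__ __forceinline__ unsigned int TwiddleFloatKey(float key)
{
    unsigned int bits = __float_as_uint(key);
    // Negative floats: flip every bit so more-negative values order first.
    // Non-negative floats: flip only the sign bit so they follow the negatives.
    unsigned int mask = (bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return bits ^ mask;
}

__device__ __forceinline__ unsigned int TwiddleSignedKey(int key)
{
    // Flipping the sign bit maps INT_MIN..INT_MAX monotonically onto 0..UINT_MAX.
    return static_cast<unsigned int>(key) ^ 0x80000000u;
}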
- * - * \tparam Key Key type - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam Value [optional] Value type (default: cub::NullType) - * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * - * \par A Simple Example - * \blockcollective{BlockRadixSort} - * \par - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for 128 threads owning 4 integer items each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * ... - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ -template < - typename Key, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - typename Value = NullType, - int RADIX_BITS = 4, - bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte> -class BlockRadixSort -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - // Key traits and unsigned bits type - typedef NumericTraits KeyTraits; - typedef typename KeyTraits::UnsignedBits UnsignedBits; - - /// BlockRadixRank utility type - typedef BlockRadixRank BlockRadixRank; - - /// BlockExchange utility type for keys - typedef BlockExchange BlockExchangeKeys; - - /// BlockExchange utility type for values - typedef BlockExchange BlockExchangeValues; - - /// Shared memory storage layout type - struct _TempStorage - { - union - { - typename BlockRadixRank::TempStorage ranking_storage; - typename BlockExchangeKeys::TempStorage exchange_keys; - typename BlockExchangeValues::TempStorage exchange_values; - }; - }; - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockRadixSort() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockRadixSort( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
- */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangements) - *********************************************************************/ - //@{ - - /** - * \brief Performs a block-wide radix sort over a [blocked arrangement](index.html#sec5sec4) of keys. - * - * \smemreuse - * - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive keys. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - */ - __device__ __forceinline__ void Sort( - Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); - begin_bit += RADIX_BITS; - - __syncthreads(); - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); - - // Quit if done - if (begin_bit >= end_bit) break; - - __syncthreads(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - - - /** - * \brief Performs a block-wide radix sort across a [blocked arrangement](index.html#sec5sec4) of keys and values. - * - * BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. 
The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - * \smemreuse - * - * The code snippet below illustrates a sort of 512 integer keys and values that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive pairs. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void Sort( - Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); - begin_bit += RADIX_BITS; - - __syncthreads(); - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); - - __syncthreads(); - - // Exchange values through shared memory in blocked arrangement - BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks); - - // Quit if done - if (begin_bit >= end_bit) break; - - __syncthreads(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangement -> striped arrangement) - *********************************************************************/ - //@{ - - - /** - * \brief Performs a radix sort across a [blocked arrangement](index.html#sec5sec4) of keys, leaving them in a [striped arrangement](index.html#sec5sec4). - * - * \smemreuse - * - * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); - begin_bit += RADIX_BITS; - - __syncthreads(); - - // Check if this is the last pass - if (begin_bit >= end_bit) - { - // Last pass exchanges keys through shared memory in striped arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks); - - // Quit - break; - } - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); - - __syncthreads(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - - - /** - * \brief Performs a radix sort across a [blocked arrangement](index.html#sec5sec4) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec4). - * - * BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - * \smemreuse - * - * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockRadixSort for 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - BlockRadixRank(temp_storage.ranking_storage, linear_tid).RankKeys(unsigned_keys, ranks, begin_bit); - begin_bit += RADIX_BITS; - - __syncthreads(); - - // Check if this is the last pass - if (begin_bit >= end_bit) - { - // Last pass exchanges keys through shared memory in striped arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToStriped(keys, ranks); - - __syncthreads(); - - // Last pass exchanges through shared memory in striped arrangement - BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToStriped(values, ranks); - - // Quit - break; - } - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys, linear_tid).ScatterToBlocked(keys, ranks); - - __syncthreads(); - - // Exchange values through shared memory in blocked arrangement - BlockExchangeValues(temp_storage.exchange_values, linear_tid).ScatterToBlocked(values, ranks); - - __syncthreads(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - - - //@} end member group - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_raking_layout.cuh b/kokkos/kokkos/TPL/cub/block/block_raking_layout.cuh deleted file mode 100644 index 878a786..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_raking_layout.cuh +++ /dev/null @@ -1,145 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. - */ - - -#pragma once - -#include "../util_macro.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for raking across thread block data. ![](raking.png) - * \ingroup BlockModule - * - * \par Overview - * This type facilitates a shared memory usage pattern where a block of CUDA - * threads places elements into shared memory and then reduces the active - * parallelism to one "raking" warp of threads for serially aggregating consecutive - * sequences of shared items. Padding is inserted to eliminate bank conflicts - * (for most data types). - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. 
- * \tparam BLOCK_STRIPS When strip-mining, the number of threadblock-strips per tile - */ -template < - typename T, - int BLOCK_THREADS, - int BLOCK_STRIPS = 1> -struct BlockRakingLayout -{ - //--------------------------------------------------------------------- - // Constants and typedefs - //--------------------------------------------------------------------- - - enum - { - /// The total number of elements that need to be cooperatively reduced - SHARED_ELEMENTS = - BLOCK_THREADS * BLOCK_STRIPS, - - /// Maximum number of warp-synchronous raking threads - MAX_RAKING_THREADS = - CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS), - - /// Number of raking elements per warp-synchronous raking thread (rounded up) - SEGMENT_LENGTH = - (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, - - /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) - RAKING_THREADS = - (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, - - /// Pad each segment length with one element if it evenly divides the number of banks - SEGMENT_PADDING = - (PtxArchProps::SMEM_BANKS % SEGMENT_LENGTH == 0) ? 1 : 0, - - /// Total number of elements in the raking grid - GRID_ELEMENTS = - RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), - - /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the warp size) - UNGUARDED = - (SHARED_ELEMENTS % RAKING_THREADS == 0), - }; - - - /** - * \brief Shared memory storage type - */ - typedef T TempStorage[BlockRakingLayout::GRID_ELEMENTS]; - - - /** - * \brief Returns the location for the calling thread to place data into the grid - */ - static __device__ __forceinline__ T* PlacementPtr( - TempStorage &temp_storage, - int linear_tid, - int block_strip = 0) - { - // Offset for partial - unsigned int offset = (block_strip * BLOCK_THREADS) + linear_tid; - - // Add in one padding element for every segment - if (SEGMENT_PADDING > 0) - { - offset += offset / SEGMENT_LENGTH; - } - - // Incorporating a block of padding partials every shared memory segment - return temp_storage + offset; - } - - - /** - * \brief Returns the location for the calling thread to begin sequential raking - */ - static __device__ __forceinline__ T* RakingPtr( - TempStorage &temp_storage, - int linear_tid) - { - return temp_storage + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_reduce.cuh b/kokkos/kokkos/TPL/cub/block/block_reduce.cuh deleted file mode 100644 index ffdff73..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_reduce.cuh +++ /dev/null @@ -1,563 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_reduce_raking.cuh" -#include "specializations/block_reduce_warp_reductions.cuh" -#include "../util_type.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * BlockReduceAlgorithm enumerates alternative algorithms for parallel - * reduction across a CUDA threadblock. - */ -enum BlockReduceAlgorithm -{ - - /** - * \par Overview - * An efficient "raking" reduction algorithm. Execution is comprised of - * three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a - * single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. - * - * \par - * \image html block_reduce.png - *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
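// A minimal sketch of the three raking phases described above, assuming a
// hypothetical 128-thread block raked by one 32-thread warp summing ints.
// This is illustrative only; the tuned implementation lives in the library's
// raking specialization.
__global__ void RakingReduceSketch(const int *d_in, int *d_out)
{
    __shared__ int partials[128];
    int tid = threadIdx.x;

    // Phase 1: each thread places its partial into shared memory.
    partials[tid] = d_in[blockIdx.x * 128 + tid];
    __syncthreads();

    if (tid < 32)
    {
        // Phase 2: each raking thread serially reduces a 4-element segment.
        int sum = 0;
        for (int i = 0; i < 4; ++i)
            sum += partials[tid * 4 + i];

        // Phase 3: warp-synchronous reduction among the 32 raking threads
        // (shuffle-based here for brevity; assumes a shuffle-capable GPU).
        for (int offset = 16; offset > 0; offset /= 2)
            sum += __shfl_down_sync(0xFFFFFFFFu, sum, offset);

        if (tid == 0)
            d_out[blockIdx.x] = sum;
    }
}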
- * - * \par Performance Considerations - * - Although this variant may suffer longer turnaround latencies when the - * GPU is under-occupied, it can often provide higher overall throughput - * across the GPU when suitably occupied. - */ - BLOCK_REDUCE_RAKING, - - - /** - * \par Overview - * A quick "tiled warp-reductions" reduction algorithm. Execution is - * comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style - * reduction within each warp. - * -# A propagation phase where the warp reduction outputs in each warp are - * updated with the aggregate from each preceding warp. - * - * \par - * \image html block_scan_warpscans.png - *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
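// Choosing between the two variants is just the ALGORITHM template argument; a
// minimal usage sketch (the kernel and buffer names below are hypothetical):
#include <cub/cub.cuh>

__global__ void WarpReductionsSumKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // The returned aggregate is only defined in thread0.
    int aggregate = BlockReduceT(temp_storage).Sum(thread_data);
    if (threadIdx.x == 0)
        d_out[blockIdx.x] = aggregate;
}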
- * - * \par Performance Considerations - * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warp-reductions, it - * can often provide lower turnaround latencies when the GPU is - * under-occupied. - */ - BLOCK_REDUCE_WARP_REDUCTIONS, -}; - - -/****************************************************************************** - * Block reduce - ******************************************************************************/ - -/** - * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) - * \ingroup BlockModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - * \par - * Optionally, BlockReduce can be specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm. [More...](\ref cub::BlockReduceAlgorithm) - * - * \tparam T Data type being reduced - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_RAKING) - * - * \par Performance Considerations - * - Very efficient (only one synchronization barrier). - * - Zero bank conflicts for most types. - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size - * - Every thread has a valid input (i.e., full vs. partial-tiles) - * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives - * - * \par A Simple Example - * \blockcollective{BlockReduce} - * \par - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ -template < - typename T, - int BLOCK_THREADS, - BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_RAKING> -class BlockReduce -{ -private: - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Internal specialization. 
- typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), - BlockReduceWarpReductions, - BlockReduceRaking >::Type InternalBlockReduce; - - /// Shared memory storage layout type for BlockReduce - typedef typename InternalBlockReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - -public: - - /// \smemstorage{BlockReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockReduce() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockReduce( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. - */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. - * - * The return value is undefined in threads other than thread0. - * - * Supports non-commutative reduction operators. - * - * \smemreuse - * - * The code snippet below illustrates a max reduction of 128 integer items that - * are partitioned across 128 threads. 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... - * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - return InternalBlockReduce(temp_storage, linear_tid).template Reduce(input, BLOCK_THREADS, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. - * - * The return value is undefined in threads other than thread0. - * - * Supports non-commutative reduction operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a max reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - // Reduce partials - T partial = ThreadReduce(inputs, reduction_op); - return Reduce(partial, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. - * - * The return value is undefined in threads other than thread0. - * - * Supports non-commutative reduction operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a max reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * if (threadIdx.x < num_valid) thread_data = ... 
- * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction operator - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage, linear_tid).template Reduce(input, num_valid, reduction_op); - } - else - { - return InternalBlockReduce(temp_storage, linear_tid).template Reduce(input, num_valid, reduction_op); - } - } - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. - * - * The return value is undefined in threads other than thread0. - * - * \smemreuse - * - * The code snippet below illustrates a sum reduction of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalBlockReduce(temp_storage, linear_tid).template Sum(input, BLOCK_THREADS); - } - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. - * - * The return value is undefined in threads other than thread0. - * - * \smemreuse - * - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- */ - template - __device__ __forceinline__ T Sum( - T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment - { - // Reduce partials - T partial = ThreadReduce(inputs, cub::Sum()); - return Sum(partial); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. - * - * The return value is undefined in threads other than thread0. - * - * \smemreuse - * - * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item (up to num_items) - * int thread_data; - * if (threadIdx.x < num_valid) - * thread_data = ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage, linear_tid).template Sum(input, num_valid); - } - else - { - return InternalBlockReduce(temp_storage, linear_tid).template Sum(input, num_valid); - } - } - - - //@} end member group -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_scan.cuh b/kokkos/kokkos/TPL/cub/block/block_scan.cuh deleted file mode 100644 index 1c1a2da..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_scan.cuh +++ /dev/null @@ -1,2233 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_scan_raking.cuh" -#include "specializations/block_scan_warp_scans.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. - */ -enum BlockScanAlgorithm -{ - - /** - * \par Overview - * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. - * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_raking.png - *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
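// A minimal sketch of the register-level phases (1 and 5) described above, for
// a hypothetical 4-item thread segment. The shared-memory raking and warp scan
// (phases 2-4) that produce 'exclusive_prefix' are elided; names are illustrative.
template <int ITEMS>
__device__ __forceinline__ int UpsweepInRegisters(const int (&items)[ITEMS])
{
    // Phase 1: each thread reduces its own segment before raking begins.
    int partial = 0;
    for (int i = 0; i < ITEMS; ++i)
        partial += items[i];
    return partial;
}

template <int ITEMS>
__device__ __forceinline__ void DownsweepInRegisters(
    int (&items)[ITEMS],
    int exclusive_prefix)   // this thread's prefix from the block-wide scan
{
    // Phase 5: seed a local exclusive scan with the raking scan output.
    int running = exclusive_prefix;
    for (int i = 0; i < ITEMS; ++i)
    {
        int value = items[i];
        items[i] = running;
        running += value;
    }
}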
- * - * \par Performance Considerations - * - Although this variant may suffer longer turnaround latencies when the - * GPU is under-occupied, it can often provide higher overall throughput - * across the GPU when suitably occupied. - */ - BLOCK_SCAN_RAKING, - - - /** - * \par Overview - * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at - * the expense of higher register pressure. Raking threads preserve their - * "upsweep" segment of values in registers while performing warp-synchronous - * scan, allowing the "downsweep" not to re-read them from shared memory. - */ - BLOCK_SCAN_RAKING_MEMOIZE, - - - /** - * \par Overview - * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. - * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_warpscans.png - *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
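// As with BlockReduce, the scan variant is selected by the ALGORITHM template
// argument; a minimal usage sketch (hypothetical kernel and buffer names):
#include <cub/cub.cuh>

__global__ void WarpScansPrefixSumKernel(int *d_data)
{
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int idx = blockIdx.x * 128 + threadIdx.x;
    int thread_data = d_data[idx];

    // Block-wide exclusive prefix sum, written back in place.
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);
    d_data[idx] = thread_data;
}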
- * - * \par Performance Considerations - * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warpscans, it can - * often provide lower turnaround latencies when the GPU is under-occupied. - */ - BLOCK_SCAN_WARP_SCANS, -}; - - -/****************************************************************************** - * Block scan - ******************************************************************************/ - -/** - * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) - * \ingroup BlockModule - * - * \par Overview - * Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - * \par - * Optionally, BlockScan can be specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_SCAN_RAKING. An efficient "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_WARP_SCANS. A quick "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * - * \tparam T Data type being scanned - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) - * - * \par A Simple Example - * \blockcollective{BlockScan} - * \par - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Uses only one or two block-wide synchronization barriers (depending on - * algorithm selection) - * - Zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Prefix sum variants (vs. generic scan) - * - Exclusive variants (vs. 
inclusive) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size - * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives - * - */ -template < - typename T, - int BLOCK_THREADS, - BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING> -class BlockScan -{ -private: - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /** - * Ensure the template parameterization meets the requirements of the - * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy - * cannot be used with threadblock sizes not a multiple of the - * architectural warp size. - */ - static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % PtxArchProps::WARP_THREADS != 0)) ? - BLOCK_SCAN_RAKING : - ALGORITHM; - - /// Internal specialization. - typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), - BlockScanWarpScans, - BlockScanRaking >::Type InternalBlockScan; - - - /// Shared memory storage layout type for BlockScan - typedef typename InternalBlockScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockScan() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockScan( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. 
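For non-1D thread blocks, the caller flattens the thread index before handing it to one of the linear_tid constructors above. A minimal sketch of that pattern, assuming a 16x8 thread block; the kernel and pointer names are illustrative only:

#include <cub/cub.cuh>

__global__ void TwoDimBlockKernel(const int *d_in, int *d_out)   // launched with dim3(16, 8)
{
    typedef cub::BlockScan<int, 16 * 8> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    // Flatten the 2D thread index into the 1D identifier expected by the collective
    int linear_tid = (threadIdx.y * blockDim.x) + threadIdx.x;

    int thread_data = d_in[linear_tid];
    BlockScan(temp_storage, linear_tid).ExclusiveSum(thread_data, thread_data);
    d_out[linear_tid] = thread_data;
}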
- */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T block_aggregate; - InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
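The aggregate returned by this overload is what makes patterns such as per-block stream compaction straightforward: each thread votes whether to keep its item, the exclusive sum of the votes gives scatter offsets, and \p block_aggregate gives the number of surviving items. A hedged sketch of that use, one 128-item tile per block; all names are illustrative:

#include <cub/cub.cuh>

__global__ void CompactPositives(const int *d_in, int *d_out, int *d_block_counts)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int item = d_in[blockIdx.x * 128 + threadIdx.x];
    int keep = (item > 0) ? 1 : 0;                  // vote: 1 if this thread keeps its item

    // Exclusive sum of the votes yields each keeper's scatter offset;
    // block_aggregate is the total number of kept items in this block.
    int offset, block_aggregate;
    BlockScan(temp_storage).ExclusiveSum(keep, offset, block_aggregate);

    if (keep)
        d_out[blockIdx.x * 128 + offset] = item;
    if (threadIdx.x == 0)
        d_block_counts[blockIdx.x] = block_aggregate;
}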
- * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, ..., 127. - * The output for the second segment will be 128, 129, ..., 255. Furthermore, - * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. 
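Because only the value returned by lane0 seeds the scan, the callback can wrap any prefix source, not just a running total. A hedged sketch in which a hypothetical CarryInOp seeds the block-wide sum with a value handed in by the host; all names here are illustrative:

#include <cub/cub.cuh>

// Hypothetical callback: seeds the block-wide scan with a caller-supplied starting value
struct CarryInOp
{
    int carry_in;

    __device__ CarryInOp(int carry_in) : carry_in(carry_in) {}

    // Invoked by the first warp; lane0's return value becomes the block-wide prefix
    __device__ int operator()(int block_aggregate)
    {
        int prefix = carry_in;
        carry_in += block_aggregate;   // maintain a running total in case of reuse
        return prefix;
    }
};

__global__ void SeededScanKernel(const int *d_in, int *d_out, int seed)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    CarryInOp prefix_op(seed);
    int thread_data = d_in[threadIdx.x];

    int block_aggregate;
    BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate, prefix_op);

    d_out[threadIdx.x] = thread_data;
}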
- * - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveSum(input, output, block_aggregate, block_prefix_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec4) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
- * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. Furthermore, - * the value \p 512 will be stored in \p block_aggregate for all threads after each scan. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * Supports non-commutative scan operators. 
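Non-commutative here means that \p scan_op need only be associative. A hedged sketch of such an operator: prefix products of 2x2 matrices (associative but order-sensitive), scanned with the identity matrix as the identity value; the types and kernel below are illustrative, not part of the library:

#include <cub/cub.cuh>

// Illustrative 2x2 matrix type; matrix multiplication is associative but not commutative
struct Mat2
{
    float m00, m01, m10, m11;
};

struct Mat2Multiply
{
    __device__ Mat2 operator()(const Mat2 &a, const Mat2 &b) const
    {
        Mat2 c;
        c.m00 = a.m00 * b.m00 + a.m01 * b.m10;
        c.m01 = a.m00 * b.m01 + a.m01 * b.m11;
        c.m10 = a.m10 * b.m00 + a.m11 * b.m10;
        c.m11 = a.m10 * b.m01 + a.m11 * b.m11;
        return c;
    }
};

__global__ void MatrixPrefixProductKernel(const Mat2 *d_in, Mat2 *d_out)
{
    typedef cub::BlockScan<Mat2, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    Mat2 identity = {1.0f, 0.0f, 0.0f, 1.0f};   // 2x2 identity matrix
    Mat2 thread_data = d_in[threadIdx.x];

    // Exclusive prefix product: thread i receives the product of matrices 0..i-1
    BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, identity, Mat2Multiply());

    d_out[threadIdx.x] = thread_data;
}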
- * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. 
Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. Furthermore, - * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second - * scan, etc. 
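The same shape works for any operator/identity pair. The sketch below defines a hypothetical MinOp functor with the required T operator()(const T&, const T&) signature and uses INT_MAX as its identity; the kernel and pointer names are illustrative only:

#include <climits>
#include <cub/cub.cuh>

// Hypothetical user-defined scan operator: minimum of two ints
struct MinOp
{
    __device__ int operator()(const int &a, const int &b) const
    {
        return (b < a) ? b : a;
    }
};

__global__ void RunningMinKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Exclusive prefix min scan: INT_MAX serves as the identity for MinOp
    int block_aggregate;
    BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MAX, MinOp(), block_aggregate);

    d_out[threadIdx.x] = thread_data;
}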
- * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_op); - } - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. 
Furthermore, - * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second - * scan, etc. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - //@} end member group - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /******************************************************************//** - * \name Exclusive prefix scan operations (identityless, single datum per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no identity value, the output computed for thread0 is undefined. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - * - * Supports non-commutative scan operators. 
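Since these undocumented overloads take no identity value, thread0's result is undefined and the caller typically ignores or overwrites it. A minimal sketch of that convention (the guard is the caller's responsibility, not something these overloads provide; names are illustrative):

#include <climits>
#include <cub/cub.cuh>

__global__ void IdentitylessScanKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];
    int exclusive;

    // No identity value supplied: the result computed for thread0 is undefined
    BlockScan(temp_storage).ExclusiveScan(thread_data, exclusive, cub::Max());

    // Overwrite the undefined thread0 result with a caller-chosen value
    if (threadIdx.x == 0)
        exclusive = INT_MIN;

    d_out[threadIdx.x] = exclusive;
}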
- * - * \blocked - * - * \smemreuse - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage, linear_tid).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scan operations (identityless, multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no identity value, the output computed for thread0 is undefined. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - //@} end member group - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /******************************************************************//** - * \name Inclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T block_aggregate; - InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate); - } - - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, ..., 128. - * The output for the second segment will be 129, 130, ..., 256. Furthermore, - * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. - * - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage, linear_tid).InclusiveSum(input, output, block_aggregate, block_prefix_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0]); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be - * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs.
- *
- * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate).
- * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
- * The functor will be invoked by the first warp of threads in the block; however, only the return value from
- * lane0 is applied as the block-wide prefix. Can be stateful.
- *
- * \blocked
- *
- * \smemreuse
- *
- * The code snippet below illustrates a single thread block that progressively
- * computes an inclusive prefix sum over multiple "tiles" of input using a
- * prefix functor to maintain a running total between block-wide scans. Each tile consists
- * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec4)
- * across 128 threads where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * // A stateful callback functor that maintains a running prefix to be applied
- * // during consecutive scan operations.
- * struct BlockPrefixOp
- * {
- *     // Running prefix
- *     int running_total;
- *
- *     // Constructor
- *     __device__ BlockPrefixOp(int running_total) : running_total(running_total) {}
- *
- *     // Callback operator to be entered by the first warp of threads in the block.
- *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
- *     __device__ int operator()(int block_aggregate)
- *     {
- *         int old_prefix = running_total;
- *         running_total += block_aggregate;
- *         return old_prefix;
- *     }
- * };
- *
- * __global__ void ExampleKernel(int *d_data, int num_items, ...)
- * {
- *     // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread
- *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
- *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
- *     typedef cub::BlockScan<int, 128>                             BlockScan;
- *
- *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
- *     __shared__ union {
- *         typename BlockLoad::TempStorage  load;
- *         typename BlockScan::TempStorage  scan;
- *         typename BlockStore::TempStorage store;
- *     } temp_storage;
- *
- *     // Initialize running total
- *     BlockPrefixOp prefix_op(0);
- *
- *     // Have the block iterate over segments of items
- *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
- *     {
- *         // Load a segment of consecutive items that are blocked across threads
- *         int thread_data[4];
- *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- *         __syncthreads();
- *
- *         // Collectively compute the block-wide inclusive prefix sum
- *         int block_aggregate;
- *         BlockScan(temp_storage.scan).InclusiveSum(
- *             thread_data, thread_data, block_aggregate, prefix_op);
- *         __syncthreads();
- *
- *         // Store scanned items to output segment
- *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- *         __syncthreads();
- *     }
- * \endcode
- * \par
- * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
- * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
- * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. Furthermore,
- * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
- *
- * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
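// To make the running-total bookkeeping above concrete, here is a hedged host-side
// rendition of the same callback contract (names are illustrative): each call receives
// one tile's block_aggregate, returns the old total as that tile's seed, and folds the
// aggregate into the total for the next tile. With all-ones input and 512-item tiles,
// successive calls return 0, 512, 1024, ...
struct RunningPrefix
{
    int running_total;

    explicit RunningPrefix(int initial) : running_total(initial) {}

    int operator()(int block_aggregate)
    {
        int old_prefix = running_total;   // seed applied to the current tile's scan
        running_total += block_aggregate; // carry this tile's sum into the next tile
        return old_prefix;
    }
};
// Usage: RunningPrefix p(0); p(512) returns 0, then 512, then 1024, ...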
- * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixOp> - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate, block_prefix_op); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial); - } - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. Furthermore, - * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage, linear_tid).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec4) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
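// The ScanOp contract documented in these methods only requires a callable with
// member T operator()(const T&, const T&); the operator must be associative but need
// not be commutative. As a hedged sketch (not part of the original file), a hand-written
// max functor that could be passed wherever cub::Max() appears in the snippets:
struct MaxOp
{
    __host__ __device__ int operator()(const int &a, const int &b) const
    {
        return (b > a) ? b : a; // associative; suitable as a block-wide ScanOp
    }
};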
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \blocked - * - * \smemreuse - * - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage.scan).InclusiveScan( - * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. Furthermore, - * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second - * scan, etc. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
- { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial); - } - } - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/block_store.cuh b/kokkos/kokkos/TPL/cub/block/block_store.cuh deleted file mode 100644 index fb990de..0000000 --- a/kokkos/kokkos/TPL/cub/block/block_store.cuh +++ /dev/null @@ -1,926 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Operations for writing linear segments of data from the CUDA thread block - */ - -#pragma once - -#include - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_vector.cuh" -#include "../thread/thread_store.cuh" -#include "block_exchange.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup IoModule - * @{ - */ - - -/******************************************************************//** - * \name Blocked I/O - *********************************************************************/ -//@{ - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier. - * - * \blocked - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam T [inferred] The data type to store. 
- * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - */ -template < - PtxStoreModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - ThreadStore(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]); - } -} - - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier, guarded by range - * - * \blocked - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - */ -template < - PtxStoreModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) - { - ThreadStore(block_itr + (linear_tid * ITEMS_PER_THREAD) + ITEM, items[ITEM]); - } - } -} - - - -//@} end member group -/******************************************************************//** - * \name Striped I/O - *********************************************************************/ -//@{ - - -/** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier. - * - * \striped - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). 
- */ -template < - PtxStoreModifier MODIFIER, - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - // Store directly in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - ThreadStore(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]); - } -} - - -/** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range - * - * \striped - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - */ -template < - PtxStoreModifier MODIFIER, - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - // Store directly in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) - { - ThreadStore(block_itr + (ITEM * BLOCK_THREADS) + linear_tid, items[ITEM]); - } - } -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped I/O - *********************************************************************/ -//@{ - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). 
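// The direct-store layouts above differ only in how a (thread, item) pair maps to a
// global offset. A compact reference, using illustrative helper functions that are not
// part of the original file (the warp width is assumed to be 32 here; the real code
// queries PtxArchProps::WARP_THREADS):
__host__ __device__ inline int blocked_offset(int linear_tid, int item, int items_per_thread)
{
    return linear_tid * items_per_thread + item;  // StoreBlocked: each thread owns consecutive items
}

__host__ __device__ inline int striped_offset(int linear_tid, int item, int block_threads)
{
    return item * block_threads + linear_tid;     // StoreStriped: stride of BLOCK_THREADS between a thread's items
}

__host__ __device__ inline int warp_striped_offset(int linear_tid, int item, int items_per_thread)
{
    const int warp_threads = 32;                  // assumption, see note above
    int lane        = linear_tid & (warp_threads - 1);
    int warp        = linear_tid / warp_threads;
    int warp_offset = warp * warp_threads * items_per_thread;
    return warp_offset + lane + item * warp_threads; // StoreWarpStriped: stride of WARP_THREADS within a warp's segment
}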
- */ -template < - PtxStoreModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); - int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; - int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - ThreadStore(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]); - } -} - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items using the specified cache modifier, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorRA [inferred] The random-access iterator type for output (may be a simple pointer type). - */ -template < - PtxStoreModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorRA> -__device__ __forceinline__ void StoreWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - int tid = linear_tid & (PtxArchProps::WARP_THREADS - 1); - int wid = linear_tid >> PtxArchProps::LOG_WARP_THREADS; - int warp_offset = wid * PtxArchProps::WARP_THREADS * ITEMS_PER_THREAD; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS) < valid_items) - { - ThreadStore(block_itr + warp_offset + tid + (ITEM * PtxArchProps::WARP_THREADS), items[ITEM]); - } - } -} - - - -//@} end member group -/******************************************************************//** - * \name Blocked, vectorized I/O - *********************************************************************/ -//@{ - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items using the specified cache modifier. - * - * \blocked - * - * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, - * which is the default starting offset returned by \p cudaMalloc() - * - * \par - * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - * - * \tparam MODIFIER cub::PtxStoreModifier cache modifier. - * \tparam T [inferred] The data type to store. 
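// A hedged sketch of the idea behind the vectorized path documented here: when
// ITEMS_PER_THREAD is a multiple of 4 and T is a 4-byte built-in type, four items can
// be packed into one CUDA vector value and written with a single wide store. The
// helper name is illustrative, and 16-byte alignment of the destination is assumed,
// as the surrounding documentation requires.
__device__ inline void store_four_ints_vectorized(int *dst, const int (&items)[4])
{
    int4 vec = make_int4(items[0], items[1], items[2], items[3]); // pack 4 ints into a 16-byte value
    *reinterpret_cast<int4 *>(dst) = vec;                         // one vector store instead of four scalar stores
}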
- * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * - */ -template < - PtxStoreModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void StoreBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - enum - { - // Maximum CUDA vector size is 4 elements - MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), - - // Vector size must be a power of two and an even divisor of the items per thread - VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? - MAX_VEC_SIZE : - 1, - - VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, - }; - - // Vector type - typedef typename VectorHelper::Type Vector; - - // Alias global pointer - Vector *block_ptr_vectors = reinterpret_cast(block_ptr); - - // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) - Vector raw_vector[VECTORS_PER_THREAD]; - T *raw_items = reinterpret_cast(raw_vector); - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - raw_items[ITEM] = items[ITEM]; - } - - // Direct-store using vector types - StoreBlocked(linear_tid, block_ptr_vectors, raw_vector); -} - - -//@} end member group - - -/** @} */ // end group IoModule - - -//----------------------------------------------------------------------------- -// Generic BlockStore abstraction -//----------------------------------------------------------------------------- - -/** - * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. - */ -enum BlockStoreAlgorithm -{ - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec4) of data is written - * directly to memory. The thread block writes items in a parallel "raking" fashion: - * threadi writes the ith segment of consecutive elements. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) decreases as the - * access stride between threads increases (i.e., the number items per thread). - */ - BLOCK_STORE_DIRECT, - - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec4) of data is written directly - * to memory using CUDA's built-in vectorized stores as a coalescing optimization. - * The thread block writes items in a parallel "raking" fashion: threadi uses vector stores to - * write the ith segment of consecutive elements. - * - * For example, st.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high until the the - * access stride between threads (i.e., the number items per thread) exceeds the - * maximum vector store width (typically 4 items or 64B, whichever is lower). 
- * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p OutputIteratorRA is not a simple pointer type - * - The block output offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - */ - BLOCK_STORE_VECTORIZE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec4) is locally - * transposed into a [striped arrangement](index.html#sec5sec4) - * which is then written to memory. More specifically, cub::BlockExchange - * used to locally reorder the items into a - * [striped arrangement](index.html#sec5sec4), after which the - * thread block writes items in a parallel "strip-mining" fashion: consecutive - * items owned by threadi are written to memory with - * stride \p BLOCK_THREADS between them. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. - */ - BLOCK_STORE_TRANSPOSE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec4) is locally - * transposed into a [warp-striped arrangement](index.html#sec5sec4) - * which is then written to memory. More specifically, cub::BlockExchange used - * to locally reorder the items into a - * [warp-striped arrangement](index.html#sec5sec4), after which - * each warp writes its own contiguous segment in a parallel "strip-mining" fashion: - * consecutive items owned by lanei are written to memory - * with stride \p WARP_THREADS between them. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. - */ - BLOCK_STORE_WARP_TRANSPOSE, -}; - - - -/** - * \addtogroup BlockModule - * @{ - */ - - -/** - * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec4) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) - * - * \par Overview - * The BlockStore class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - * \par Optionally, BlockStore can be specialized by different data movement strategies: - * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec4) of data is written - * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec4) - * of data is written directly to memory using CUDA's built-in vectorized stores as a - * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec4) - * is locally transposed into a [striped arrangement](index.html#sec5sec4) which is - * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_WARP_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec4)
- * is locally transposed into a [warp-striped arrangement](index.html#sec5sec4) which is
- * then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
- *
- * \tparam OutputIteratorRA The output iterator type (may be a simple pointer type).
- * \tparam BLOCK_THREADS The thread block size in threads.
- * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT.
- * \tparam MODIFIER [optional] cub::PtxStoreModifier cache modifier. default: cub::STORE_DEFAULT.
- * \tparam WARP_TIME_SLICING [optional] For transposition-based cub::BlockStoreAlgorithm parameterizations that utilize shared memory: When \p true, only use enough shared memory for a single warp's worth of data, time-slicing the block-wide exchange over multiple synchronized rounds (default: false)
- *
- * \par A Simple Example
- * \blockcollective{BlockStore}
- * \par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
- * meaning items are locally reordered among threads so that memory references will be
- * efficiently coalesced using a warp-striped access pattern.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockStore for 128 threads owning 4 integer items each
- *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
- *
- *     // Allocate shared memory for BlockStore
- *     __shared__ typename BlockStore::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     BlockStore(temp_storage).Store(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of \p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
- * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
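// A minimal compilable variant of the snippet above (hedged sketch: the kernel name,
// fill pattern, and launch line are illustrative, not part of the original file; the
// destination is assumed to hold at least 512 ints).
#include <cub/cub.cuh>

__global__ void BlockStoreExampleKernel(int *d_data)
{
    // Specialize BlockStore for 128 threads owning 4 integer items each,
    // using the warp-transpose strategy described above
    typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;

    // Allocate shared memory for BlockStore
    __shared__ typename BlockStore::TempStorage temp_storage;

    // Fabricate the blocked arrangement: thread t owns items 4t .. 4t+3
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = threadIdx.x * 4 + i;

    // Store items to linear memory; d_data[0..511] receives 0, 1, 2, 3, ...
    BlockStore(temp_storage).Store(d_data, thread_data);
}

// Possible launch with a single 128-thread block:
//     BlockStoreExampleKernel<<<1, 128>>>(d_data);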
- * - */ -template < - typename OutputIteratorRA, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, - PtxStoreModifier MODIFIER = STORE_DEFAULT, - bool WARP_TIME_SLICING = false> -class BlockStore -{ -private: - /****************************************************************************** - * Constants and typed definitions - ******************************************************************************/ - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - - /****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - /// Store helper - template - struct StoreInternal; - - - /** - * BLOCK_STORE_DIRECT specialization of store helper - */ - template - struct StoreInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreBlocked(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - StoreBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_VECTORIZE specialization of store helper - */ - template - struct StoreInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Store( - T *block_ptr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) - template - __device__ __forceinline__ void Store( - _OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreBlocked(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - StoreBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared 
memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToStriped(items); - StoreStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToStriped(items); - StoreStriped(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - enum - { - WARP_THREADS = PtxArchProps::WARP_THREADS - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreWarpStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreWarpStriped(linear_tid, block_itr, items, valid_items); - } - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef StoreInternal InternalStore; - - - /// Shared memory storage layout type - typedef typename InternalStore::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - 
******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - - /// \smemstorage{BlockStore} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockStore() - : - temp_storage(PrivateStorage()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Threads are identified using threadIdx.x. - */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(threadIdx.x) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Each thread is identified using the supplied linear thread identifier - */ - __device__ __forceinline__ BlockStore( - int linear_tid) ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(PrivateStorage()), - linear_tid(linear_tid) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Each thread is identified using the supplied linear thread identifier. - */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int linear_tid) ///< [in] [optional] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Store items into a linear segment of memory. - * - * \blocked - * - * The code snippet below illustrates the storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockStore for 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... - * - */ - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items); - } - - /** - * \brief Store items into a linear segment of memory, guarded by range. - * - * \blocked - * - * The code snippet below illustrates the guarded storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockStore for 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. - * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with - * only the first two threads being unmasked to store portions of valid data. - * - */ - __device__ __forceinline__ void Store( - OutputIteratorRA block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); - } -}; - -/** @} */ // end group BlockModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh deleted file mode 100644 index ecc9800..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_atomic.cuh +++ /dev/null @@ -1,85 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ -template < - typename T, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - int BINS> -struct BlockHistogramAtomic -{ - /// Shared memory storage layout type - struct TempStorage {}; - - - /// Constructor - __device__ __forceinline__ BlockHistogramAtomic( - TempStorage &temp_storage, - int linear_tid) - {} - - - /// Composite data onto an existing histogram - template < - typename HistoCounter> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram - { - // Update histogram - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { - atomicAdd(histogram + items[i], 1); - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh deleted file mode 100644 index e81edec..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_histogram_sort.cuh +++ /dev/null @@ -1,197 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../../block/block_radix_sort.cuh" -#include "../../block/block_discontinuity.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
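- *
- * Composite() below proceeds in three steps: the tile of samples is sorted
- * with BlockRadixSort, a BlockDiscontinuity pass flags the head of each run
- * of equal bin values while recording per-bin run begin/end offsets in
- * shared memory, and each bin's run length is then added into the caller's
- * histogram.
- *
- * \par
- * A minimal usage sketch (the sample type, block size, items per thread,
- * and bin count below are illustrative assumptions):
- * \code
- * typedef cub::BlockHistogramSort<unsigned char, 128, 4, 256> BlockHistogramSortT;
- * __shared__ typename BlockHistogramSortT::TempStorage temp_storage;
- * __shared__ unsigned int smem_histogram[256];
- * unsigned char samples[4];
- * ...
- * BlockHistogramSortT(temp_storage, threadIdx.x).Composite(samples, smem_histogram);
- * \endcode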
- */ -template < - typename T, - int BLOCK_THREADS, - int ITEMS_PER_THREAD, - int BINS> -struct BlockHistogramSort -{ - // Parameterize BlockRadixSort type for our thread block - typedef BlockRadixSort BlockRadixSortT; - - // Parameterize BlockDiscontinuity type for our thread block - typedef BlockDiscontinuity BlockDiscontinuityT; - - // Shared memory - union _TempStorage - { - // Storage for sorting bin values - typename BlockRadixSortT::TempStorage sort; - - struct - { - // Storage for detecting discontinuities in the tile of sorted bin values - typename BlockDiscontinuityT::TempStorage flag; - - // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values - unsigned int run_begin[BINS]; - unsigned int run_end[BINS]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockHistogramSort( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - // Discontinuity functor - struct DiscontinuityOp - { - // Reference to temp_storage - _TempStorage &temp_storage; - - // Constructor - __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : - temp_storage(temp_storage) - {} - - // Discontinuity predicate - __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) - { - if (a != b) - { - // Note the begin/end offsets in shared storage - temp_storage.run_begin[b] = b_index; - temp_storage.run_end[a] = b_index; - - return true; - } - else - { - return false; - } - } - }; - - - // Composite data onto an existing histogram - template < - typename HistoCounter> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram - { - enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; - - // Sort bytes in blocked arrangement - BlockRadixSortT(temp_storage.sort, linear_tid).Sort(items); - - __syncthreads(); - - // Initialize the shared memory's run_begin and run_end for each bin - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - - __syncthreads(); - - int flags[ITEMS_PER_THREAD]; // unused - - // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile - DiscontinuityOp flag_op(temp_storage); - BlockDiscontinuityT(temp_storage.flag, linear_tid).FlagHeads(flags, items, flag_op); - - // Update begin for first item - if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; - - __syncthreads(); - - // Composite into histogram - histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - int thread_offset = histo_offset + linear_tid; - HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - // Finish 
up with guarded composition if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - int thread_offset = histo_offset + linear_tid; - HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh deleted file mode 100644 index 434d25a..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_raking.cuh +++ /dev/null @@ -1,214 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock - */ - -#pragma once - -#include "../../block/block_raking_layout.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA threadblock - */ -template < - typename T, ///< Data type being reduced - int BLOCK_THREADS> ///< The thread block size in threads -struct BlockReduceRaking -{ - /// Layout type for padded threadblock raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - /// Constants - enum - { - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), - - /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two - WARP_SYNCHRONOUS_UNGUARDED = ((RAKING_THREADS & (RAKING_THREADS - 1)) == 0), - - /// Whether or not accesses into smem are unguarded - RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, - - }; - - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockReduceRaking( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - - /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum( - partial, - num_valid); - } - else - { - // Place partial into shared memory grid. 
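- // Each thread first writes its partial into the padded raking grid; after
- // the barrier below, the first RAKING_THREADS threads each serially reduce
- // their SEGMENT_LENGTH-element grid segment and the surviving partials are
- // reduced warp-synchronously. For example (illustrative sizes only), a
- // 128-thread block raked by one 32-thread warp would fold 4 grid entries
- // per raking thread before the final warp reduction.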
- *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - - __syncthreads(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = raking_segment[0]; - - #pragma unroll - for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++) - { - // Update partial if addend is in range - if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid)) - { - partial = reduction_op(partial, raking_segment[ITEM]); - } - } - - partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Sum( - partial, - num_valid); - } - } - - return partial; - } - - - /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce( - partial, - num_valid, - reduction_op); - } - else - { - // Place partial into shared memory grid. - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - - __syncthreads(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = raking_segment[0]; - - #pragma unroll - for (int ITEM = 1; ITEM < SEGMENT_LENGTH; ITEM++) - { - // Update partial if addend is in range - if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITEM < num_valid)) - { - partial = reduction_op(partial, raking_segment[ITEM]); - } - } - - partial = WarpReduce(temp_storage.warp_storage, 0, linear_tid).template Reduce( - partial, - num_valid, - reduction_op); - } - } - - return partial; - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh deleted file mode 100644 index 0e316dd..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_reduce_warp_reductions.cuh +++ /dev/null @@ -1,198 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock - */ - -#pragma once - -#include "../../warp/warp_reduce.cuh" -#include "../../util_arch.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock - */ -template < - typename T, ///< Data type being reduced - int BLOCK_THREADS> ///< The thread block size in threads -struct BlockReduceWarpReductions -{ - /// Constants - enum - { - /// Number of active warps - WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, - - /// The logical warp size for warp reductions - LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, PtxArchProps::WARP_THREADS), - - /// Whether or not the logical warp size evenly divides the threadblock size - EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) - }; - - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpReduce::TempStorage warp_reduce; ///< Buffer for warp-synchronous scan - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire threadblock - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - int warp_id; - int lane_id; - - - /// Constructor - __device__ __forceinline__ BlockReduceWarpReductions( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid), - warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? - 0 : - linear_tid / PtxArchProps::WARP_THREADS), - lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? - linear_tid : - linear_tid % PtxArchProps::WARP_THREADS) - {} - - - /// Returns block-wide aggregate in thread0. 
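- /// Lane 0 of each warp first publishes its warp-wide aggregate to shared
- /// memory; after a barrier, thread0 folds the aggregates of the successor
- /// warps into its own, skipping warps that lie entirely beyond \p num_valid
- /// when the tile is not full.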
- template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0s only] Warp-wide aggregate reduction of input items - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - // Share lane aggregates - if (lane_id == 0) - { - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - } - - __syncthreads(); - - // Update total aggregate in warp 0, lane 0 - if (linear_tid == 0) - { - #pragma unroll - for (int SUCCESSOR_WARP = 1; SUCCESSOR_WARP < WARPS; SUCCESSOR_WARP++) - { - if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) - { - warp_aggregate = reduction_op(warp_aggregate, temp_storage.warp_aggregates[SUCCESSOR_WARP]); - } - } - } - - return warp_aggregate; - } - - - /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? - LOGICAL_WARP_SIZE : - (warp_offset < num_valid) ? - num_valid - warp_offset : - 0; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( - input, - warp_num_valid); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - - - /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE); - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? - LOGICAL_WARP_SIZE : - (warp_offset < num_valid) ? - num_valid - warp_offset : - 0; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce, warp_id, lane_id).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( - input, - warp_num_valid, - reduction_op); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh deleted file mode 100644 index 75e15d9..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_scan_raking.cuh +++ /dev/null @@ -1,761 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - - -/** - * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. - */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../block/block_raking_layout.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../thread/thread_scan.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. 
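- *
- * The scan proceeds in three phases: every thread deposits its partial into
- * a padded raking grid, a single warp of raking threads performs a serial
- * upsweep reduction over its grid segment followed by a warp-synchronous
- * scan of the raking partials, and a downsweep pass re-walks each segment
- * seeded with its exclusive partial. When MEMOIZE is set, the upsweep
- * caches its grid segment in registers so the downsweep can avoid
- * re-reading shared memory, at the cost of higher register pressure.
- *
- * \par
- * A minimal usage sketch (the element type and block size are illustrative
- * assumptions):
- * \code
- * typedef cub::BlockScanRaking<int, 128, true> BlockScanT;
- * __shared__ typename BlockScanT::TempStorage temp_storage;
- * int thread_data;
- * ...
- * int block_aggregate;
- * BlockScanT(temp_storage, threadIdx.x).ExclusiveSum(thread_data, thread_data, block_aggregate);
- * \endcode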
- */ -template < - typename T, ///< Data type being scanned - int BLOCK_THREADS, ///< The thread block size in threads - bool MEMOIZE> ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure -struct BlockScanRaking -{ - /// Layout type for padded threadblock raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// Constants - enum - { - /// Number of active warps - WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, - - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), - }; - - /// WarpScan utility type - typedef WarpScan WarpScan; - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid - T block_aggregate; ///< Block aggregate - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - T cached_segment[SEGMENT_LENGTH]; - - - /// Constructor - __device__ __forceinline__ BlockScanRaking( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Performs upsweep raking reduction, returning the aggregate - template - __device__ __forceinline__ T Upsweep( - ScanOp scan_op) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - T *raking_ptr; - - if (MEMOIZE) - { - // Copy data into registers - #pragma unroll - for (int i = 0; i < SEGMENT_LENGTH; i++) - { - cached_segment[i] = smem_raking_ptr[i]; - } - raking_ptr = cached_segment; - } - else - { - raking_ptr = smem_raking_ptr; - } - - T raking_partial = raking_ptr[0]; - - #pragma unroll - for (int i = 1; i < SEGMENT_LENGTH; i++) - { - if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + i) < BLOCK_THREADS)) - { - raking_partial = scan_op(raking_partial, raking_ptr[i]); - } - } - - return raking_partial; - } - - - /// Performs exclusive downsweep raking scan - template - __device__ __forceinline__ void ExclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - T *raking_ptr = (MEMOIZE) ? - cached_segment : - smem_raking_ptr; - - ThreadScanExclusive(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix); - - if (MEMOIZE) - { - // Copy data back to smem - #pragma unroll - for (int i = 0; i < SEGMENT_LENGTH; i++) - { - smem_raking_ptr[i] = cached_segment[i]; - } - } - } - - - /// Performs inclusive downsweep raking scan - template - __device__ __forceinline__ void InclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - T *raking_ptr = (MEMOIZE) ? 
- cached_segment : - smem_raking_ptr; - - ThreadScanInclusive(raking_ptr, raking_ptr, scan_op, raking_partial, apply_prefix); - - if (MEMOIZE) - { - // Copy data back to smem - #pragma unroll - for (int i = 0; i < SEGMENT_LENGTH; i++) - { - smem_raking_ptr[i] = cached_segment[i]; - } - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - input, - output, - identity, - scan_op, - block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - identity, - scan_op, - temp_storage.block_aggregate); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
- { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - input, - output, - identity, - scan_op, - block_aggregate, - block_prefix_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - identity, - scan_op, - temp_storage.block_aggregate, - block_prefix_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - input, - output, - scan_op, - block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - input, - output, - scan_op, - block_aggregate, - block_prefix_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate, - block_prefix_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - input, - output, - block_aggregate); - } - else - { - // Raking scan - Sum scan_op; - - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. 
Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - input, - output, - block_aggregate, - block_prefix_op); - } - else - { - // Raking scan - Sum scan_op; - - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate, - block_prefix_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan( - input, - output, - scan_op, - block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveScan( - input, - output, - scan_op, - block_aggregate, - block_prefix_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate, - block_prefix_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. 
Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum( - input, - output, - block_aggregate); - } - else - { - // Raking scan - Sum scan_op; - - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
- { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).InclusiveSum( - input, - output, - block_aggregate, - block_prefix_op); - } - else - { - // Raking scan - Sum scan_op; - - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Warp synchronous scan - WarpScan(temp_storage.warp_scan, 0, linear_tid).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate, - block_prefix_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh b/kokkos/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh deleted file mode 100644 index f7af361..0000000 --- a/kokkos/kokkos/TPL/cub/block/specializations/block_scan_warp_scans.cuh +++ /dev/null @@ -1,342 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. 
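- *
- * Each warp performs an independent warp-synchronous scan over its segment
- * of the tile; the per-warp aggregates are then shared through shared
- * memory and folded, in warp order, into the partials of later warps,
- * which also yields the block-wide aggregate.
- *
- * A minimal usage sketch (the element type and block size are illustrative
- * assumptions):
- * \code
- * typedef cub::BlockScanWarpScans<int, 128> BlockScanT;
- * __shared__ typename BlockScanT::TempStorage temp_storage;
- * int thread_data;
- * ...
- * int block_aggregate;
- * BlockScanT(temp_storage, threadIdx.x).ExclusiveSum(thread_data, thread_data, block_aggregate);
- * \endcode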
- */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. - */ -template < - typename T, - int BLOCK_THREADS> -struct BlockScanWarpScans -{ - /// Constants - enum - { - /// Number of active warps - WARPS = (BLOCK_THREADS + PtxArchProps::WARP_THREADS - 1) / PtxArchProps::WARP_THREADS, - }; - - /// WarpScan utility type - typedef WarpScan WarpScan; - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire threadblock - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - int warp_id; - int lane_id; - - - /// Constructor - __device__ __forceinline__ BlockScanWarpScans( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid), - warp_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? - 0 : - linear_tid / PtxArchProps::WARP_THREADS), - lane_id((BLOCK_THREADS <= PtxArchProps::WARP_THREADS) ? - linear_tid : - linear_tid % PtxArchProps::WARP_THREADS) - {} - - - /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in thread0. - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &partial, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0s only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid - { - // Share lane aggregates - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - - __syncthreads(); - - block_aggregate = temp_storage.warp_aggregates[0]; - - #pragma unroll - for (int WARP = 1; WARP < WARPS; WARP++) - { - if (warp_id == WARP) - { - partial = (lane_valid) ? - scan_op(block_aggregate, partial) : // fold it in our valid partial - block_aggregate; // replace our invalid partial with the aggregate - } - - block_aggregate = scan_op(block_aggregate, temp_storage.warp_aggregates[WARP]); - } - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - T warp_aggregate; - WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate); - - // Update outputs and block_aggregate with warp-wide aggregates - ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate); - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - ExclusiveScan(input, output, identity, scan_op, block_aggregate); - - // Compute and share threadblock prefix - if (warp_id == 0) - { - temp_storage.block_prefix = block_prefix_op(block_aggregate); - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - output = scan_op(temp_storage.block_prefix, output); - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - T warp_aggregate; - WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate); - - // Update outputs and block_aggregate with warp-wide aggregates - ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate, (lane_id > 0)); - } - - - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - ExclusiveScan(input, output, scan_op, block_aggregate); - - // Compute and share threadblock prefix - if (warp_id == 0) - { - temp_storage.block_prefix = block_prefix_op(block_aggregate); - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - output = (linear_tid == 0) ? - temp_storage.block_prefix : - scan_op(temp_storage.block_prefix, output); - } - - - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - T warp_aggregate; - WarpScan(temp_storage.warp_scan, warp_id, lane_id).ExclusiveSum(input, output, warp_aggregate); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate); - } - - - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - ExclusiveSum(input, output, block_aggregate); - - // Compute and share threadblock prefix - if (warp_id == 0) - { - temp_storage.block_prefix = block_prefix_op(block_aggregate); - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - Sum scan_op; - output = scan_op(temp_storage.block_prefix, output); - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - T warp_aggregate; - WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - ApplyWarpAggregates(output, scan_op, warp_aggregate, block_aggregate); - - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - InclusiveScan(input, output, scan_op, block_aggregate); - - // Compute and share threadblock prefix - if (warp_id == 0) - { - temp_storage.block_prefix = block_prefix_op(block_aggregate); - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - output = scan_op(temp_storage.block_prefix, output); - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - T warp_aggregate; - WarpScan(temp_storage.warp_scan, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - ApplyWarpAggregates(output, Sum(), warp_aggregate, block_aggregate); - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_op value) - BlockPrefixOp &block_prefix_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - InclusiveSum(input, output, block_aggregate); - - // Compute and share threadblock prefix - if (warp_id == 0) - { - temp_storage.block_prefix = block_prefix_op(block_aggregate); - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - Sum scan_op; - output = scan_op(temp_storage.block_prefix, output); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/cub.cuh b/kokkos/kokkos/TPL/cub/cub.cuh deleted file mode 100644 index dbb77da..0000000 --- a/kokkos/kokkos/TPL/cub/cub.cuh +++ /dev/null @@ -1,84 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
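// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted files): the exclusive scans in
// BlockScanWarpScans above compose a block-wide result from independent warp
// scans.  Each warp publishes its warp_aggregate, and every thread of warp w
// then folds the combined aggregate of warps 0..w-1 into its warp-local
// partial, while the running block_aggregate accumulates across all warps.
// A minimal host-side C++ sketch of that composition step (plain loops stand
// in for __syncthreads() and the unrolled warp loop; names are hypothetical):
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

int main()
{
    const int WARPS = 4;
    // Per-warp totals as they would sit in temp_storage.warp_aggregates.
    std::vector<int> warp_aggregates = {10, 20, 30, 40};

    // warp_prefix[w] is what ApplyWarpAggregates adds to every partial of
    // warp w: the running total of all preceding warps (warp 0 adds nothing).
    std::vector<int> warp_prefix(WARPS, 0);
    int block_aggregate = warp_aggregates[0];
    for (int warp = 1; warp < WARPS; ++warp)
    {
        warp_prefix[warp] = block_aggregate;      // aggregate of warps 0..warp-1
        block_aggregate  += warp_aggregates[warp];
    }

    for (int warp = 0; warp < WARPS; ++warp)
        printf("warp %d: add %d to each warp-local partial\n", warp, warp_prefix[warp]);
    printf("block_aggregate = %d\n", block_aggregate);   // 100
    return 0;
}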
- * - ******************************************************************************/ - -/** - * \file - * CUB umbrella include file - */ - -#pragma once - - -// Block -#include "block/block_histogram.cuh" -#include "block/block_discontinuity.cuh" -#include "block/block_exchange.cuh" -#include "block/block_load.cuh" -#include "block/block_radix_rank.cuh" -#include "block/block_radix_sort.cuh" -#include "block/block_reduce.cuh" -#include "block/block_scan.cuh" -#include "block/block_store.cuh" - -// Device -#include "device/device_histogram.cuh" -#include "device/device_radix_sort.cuh" -#include "device/device_reduce.cuh" -#include "device/device_scan.cuh" - -// Grid -//#include "grid/grid_barrier.cuh" -#include "grid/grid_even_share.cuh" -#include "grid/grid_mapping.cuh" -#include "grid/grid_queue.cuh" - -// Host -#include "host/spinlock.cuh" - -// Thread -#include "thread/thread_load.cuh" -#include "thread/thread_operators.cuh" -#include "thread/thread_reduce.cuh" -#include "thread/thread_scan.cuh" -#include "thread/thread_store.cuh" - -// Warp -#include "warp/warp_reduce.cuh" -#include "warp/warp_scan.cuh" - -// Util -#include "util_allocator.cuh" -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_device.cuh" -#include "util_macro.cuh" -#include "util_ptx.cuh" -#include "util_type.cuh" -#include "util_iterator.cuh" -#include "util_vector.cuh" - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_histo_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_histo_tiles.cuh deleted file mode 100644 index e1165d6..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_histo_tiles.cuh +++ /dev/null @@ -1,322 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
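// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted files): the umbrella header
// above exists so a consumer can pull in every CUB primitive with a single
// include.  A hedged sketch of such usage, assuming the BlockScan interface
// defined in block_scan.cuh of this same snapshot; the kernel, buffer, and
// launch parameters are hypothetical.
// ---------------------------------------------------------------------------
#include <cub/cub.cuh>          // or the relative "cub.cuh" path as vendored here
#include <cuda_runtime.h>
#include <cstdio>

__global__ void BlockPrefixSumKernel(int *d_data)
{
    typedef cub::BlockScan<int, 128> BlockScan;            // 128 threads per block
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_data[threadIdx.x];
    BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
    d_data[threadIdx.x] = thread_data;                     // exclusive running sum
}

int main()
{
    int h_data[128], *d_data;
    for (int i = 0; i < 128; ++i) h_data[i] = 1;
    cudaMalloc(&d_data, sizeof(h_data));
    cudaMemcpy(d_data, h_data, sizeof(h_data), cudaMemcpyHostToDevice);

    BlockPrefixSumKernel<<<1, 128>>>(d_data);

    cudaMemcpy(h_data, d_data, sizeof(h_data), cudaMemcpyDeviceToHost);
    printf("last prefix = %d\n", h_data[127]);             // expect 127
    cudaFree(d_data);
    return 0;
}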
- * - ******************************************************************************/ - -/** - * \file - * cub::BlockHistogramTiles implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. - */ - -#pragma once - -#include - -#include "specializations/block_histo_tiles_gatomic.cuh" -#include "specializations/block_histo_tiles_satomic.cuh" -#include "specializations/block_histo_tiles_sort.cuh" -#include "../../util_type.cuh" -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - -/** - * \brief BlockHistogramTilesAlgorithm enumerates alternative algorithms for BlockHistogramTiles. - */ -enum BlockHistogramTilesAlgorithm -{ - - /** - * \par Overview - * A two-kernel approach in which: - * -# Thread blocks in the first kernel aggregate their own privatized - * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT). - * -# A single thread block in the second kernel reduces them into the output histogram(s). - * - * \par Performance Considerations - * Delivers consistent throughput regardless of sample bin distribution. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - */ - GRID_HISTO_SORT, - - - /** - * \par Overview - * A two-kernel approach in which: - * -# Thread blocks in the first kernel aggregate their own privatized - * histograms using shared-memory \p atomicAdd(). - * -# A single thread block in the second kernel reduces them into the - * output histogram(s). - * - * \par Performance Considerations - * Performance is strongly tied to the hardware implementation of atomic - * addition, and may be significantly degraded for non uniformly-random - * input distributions where many concurrent updates are likely to be - * made to the same bin counter. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - */ - GRID_HISTO_SHARED_ATOMIC, - - - /** - * \par Overview - * A single-kernel approach in which thread blocks update the output histogram(s) directly - * using global-memory \p atomicAdd(). - * - * \par Performance Considerations - * Performance is strongly tied to the hardware implementation of atomic - * addition, and may be significantly degraded for non uniformly-random - * input distributions where many concurrent updates are likely to be - * made to the same bin counter. - * - * Performance is not significantly impacted when computing histograms having large - * numbers of bins (e.g., thousands). 
- */ - GRID_HISTO_GLOBAL_ATOMIC, - -}; - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * Tuning policy for BlockHistogramTiles - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - BlockHistogramTilesAlgorithm _GRID_ALGORITHM, - GridMappingStrategy _GRID_MAPPING, - int _SM_OCCUPANCY> -struct BlockHistogramTilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - SM_OCCUPANCY = _SM_OCCUPANCY, - }; - - static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = _GRID_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; -}; - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - - -/** - * Implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics - */ -template < - typename BlockHistogramTilesPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type for offsets -struct BlockHistogramTiles -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Histogram grid algorithm - static const BlockHistogramTilesAlgorithm GRID_ALGORITHM = BlockHistogramTilesPolicy::GRID_ALGORITHM; - - // Alternative internal implementation types - typedef BlockHistogramTilesSort< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSortT; - typedef BlockHistogramTilesSharedAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesSharedAtomicT; - typedef BlockHistogramTilesGlobalAtomic< BlockHistogramTilesPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIteratorRA, HistoCounter, SizeT> BlockHistogramTilesGlobalAtomicT; - - // Internal block sweep histogram type - typedef typename If<(GRID_ALGORITHM == GRID_HISTO_SORT), - BlockHistogramTilesSortT, - typename If<(GRID_ALGORITHM == GRID_HISTO_SHARED_ATOMIC), - BlockHistogramTilesSharedAtomicT, - BlockHistogramTilesGlobalAtomicT>::Type>::Type InternalBlockDelegate; - - enum - { - TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS, - }; - - - // Temporary storage type - typedef typename InternalBlockDelegate::TempStorage TempStorage; - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Internal block delegate - InternalBlockDelegate internal_delegate; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * 
Constructor - */ - __device__ __forceinline__ BlockHistogramTiles( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - internal_delegate(temp_storage, d_in, d_out_histograms) - {} - - - /** - * \brief Reduce a consecutive segment of input tiles - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) - SizeT block_oob) ///< [in] Threadblock end offset (exclusive) - { - // Consume subsequent full tiles of input - while (block_offset + TILE_ITEMS <= block_oob) - { - internal_delegate.ConsumeTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (block_offset < block_oob) - { - int valid_items = block_oob - block_offset; - internal_delegate.ConsumeTile(block_offset, valid_items); - } - - // Aggregate output - internal_delegate.AggregateOutput(); - } - - - /** - * Reduce a consecutive segment of input tiles - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT num_items, ///< [in] Total number of global input items - GridEvenShare &even_share, ///< [in] GridEvenShare descriptor - GridQueue &queue, ///< [in,out] GridQueue descriptor - Int2Type is_even_share) ///< [in] Marker type indicating this is an even-share mapping - { - even_share.BlockInit(); - ConsumeTiles(even_share.block_offset, even_share.block_oob); - } - - - /** - * Dequeue and reduce tiles of items as part of a inter-block scan - */ - __device__ __forceinline__ void ConsumeTiles( - int num_items, ///< Total number of input items - GridQueue queue) ///< Queue descriptor for assigning tiles of work to thread blocks - { - // Shared block offset - __shared__ SizeT shared_block_offset; - - // We give each thread block at least one tile of input. - SizeT block_offset = blockIdx.x * TILE_ITEMS; - SizeT even_share_base = gridDim.x * TILE_ITEMS; - - // Process full tiles of input - while (block_offset + TILE_ITEMS <= num_items) - { - internal_delegate.ConsumeTile(block_offset); - - // Dequeue up to TILE_ITEMS - if (threadIdx.x == 0) - shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base; - - __syncthreads(); - - block_offset = shared_block_offset; - - __syncthreads(); - } - - // Consume a partially-full tile - if (block_offset < num_items) - { - int valid_items = num_items - block_offset; - internal_delegate.ConsumeTile(block_offset, valid_items); - } - - // Aggregate output - internal_delegate.AggregateOutput(); - } - - - /** - * Dequeue and reduce tiles of items as part of a inter-block scan - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT num_items, ///< [in] Total number of global input items - GridEvenShare &even_share, ///< [in] GridEvenShare descriptor - GridQueue &queue, ///< [in,out] GridQueue descriptor - Int2Type is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping - { - ConsumeTiles(num_items, queue); - } - - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_partition_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_partition_tiles.cuh deleted file mode 100644 index 4597773..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_partition_tiles.cuh +++ /dev/null @@ -1,381 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
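// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted files): a self-contained CUDA
// sketch of the GRID_HISTO_SHARED_ATOMIC strategy documented earlier in this
// file -- each block privatizes a histogram in shared memory with cheap
// shared-memory atomics, then merges it into the global histogram with one
// round of global atomics.  This is a hedged stand-in, not the deleted
// BlockHistogramTilesSharedAtomic specialization; all names are hypothetical.
// ---------------------------------------------------------------------------
#include <cuda_runtime.h>

template <int BINS, int BLOCK_THREADS>
__global__ void PrivatizedHistogramKernel(
    const unsigned char *d_samples,      // input samples in [0, BINS)
    int                  num_samples,
    unsigned int        *d_histogram)    // output histogram of length BINS
{
    __shared__ unsigned int smem_histo[BINS];

    // Zero the block-private histogram.
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        smem_histo[bin] = 0;
    __syncthreads();

    // Grid-stride loop: concurrent updates hit shared memory, not DRAM.
    for (int i = blockIdx.x * BLOCK_THREADS + threadIdx.x;
         i < num_samples;
         i += gridDim.x * BLOCK_THREADS)
    {
        atomicAdd(&smem_histo[d_samples[i]], 1u);
    }
    __syncthreads();

    // One merge pass per block into the global output bins.
    for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
        atomicAdd(&d_histogram[bin], smem_histo[bin]);
}

// Example launch (assumes d_samples/d_histogram already allocated and
// d_histogram zero-initialized):
//   PrivatizedHistogramKernel<256, 128><<<64, 128>>>(d_samples, n, d_histogram);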
- * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning. 
- */ - -#pragma once - -#include - -#include "scan_tiles_types.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_store.cuh" -#include "../../block/block_scan.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Tuning policy for BlockPartitionTiles - */ -template < - int _PARTITIONS, - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - PtxLoadModifier _LOAD_MODIFIER, - BlockScanAlgorithm _SCAN_ALGORITHM> -struct BlockPartitionTilesPolicy -{ - enum - { - PARTITIONS = _PARTITIONS, - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - }; - - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; -}; - - - -/** - * Tuple type for scanning partition membership flags - */ -template < - typename SizeT, - int PARTITIONS> -struct PartitionScanTuple; - - -/** - * Tuple type for scanning partition membership flags (specialized for 1 output partition) - */ -template -struct PartitionScanTuple : VectorHelper::Type -{ - __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) - { - PartitionScanTuple retval; - retval.x = x + other.x; - return retval; - } - - template - __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val) - { - this->x = pred_op(val); - } - - template - __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items) - { - if (pred_op(val)) - d_out[this->x - 1] = val; - } - -}; - - -/** - * Tuple type for scanning partition membership flags (specialized for 2 output partitions) - */ -template -struct PartitionScanTuple : VectorHelper::Type -{ - __device__ __forceinline__ PartitionScanTuple operator+(const PartitionScanTuple &other) - { - PartitionScanTuple retval; - retval.x = x + other.x; - retval.y = y + other.y; - return retval; - } - - template - __device__ __forceinline__ void SetFlags(PredicateOp pred_op, T val) - { - bool pred = pred_op(val); - this->x = pred; - this->y = !pred; - } - - template - __device__ __forceinline__ void Scatter(PredicateOp pred_op, T val, OutputIteratorRA d_out, SizeT num_items) - { - SizeT scatter_offset = (pred_op(val)) ? - this->x - 1 : - num_items - this->y; - - d_out[scatter_offset] = val; - } -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockPartitionTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide list partitioning. - * - * Implements a single-pass "domino" strategy with adaptive prefix lookback. 
- */ -template < - typename BlockPartitionTilesPolicy, ///< Tuning policy - typename InputIteratorRA, ///< Input iterator type - typename OutputIteratorRA, ///< Output iterator type - typename PredicateOp, ///< Partition predicate functor type - typename SizeT> ///< Offset integer type -struct BlockPartitionTiles -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Constants - enum - { - PARTITIONS = BlockPartitionTilesPolicy::PARTITIONS, - BLOCK_THREADS = BlockPartitionTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockPartitionTilesPolicy::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Load modifier - static const PtxLoadModifier LOAD_MODIFIER = BlockPartitionTilesPolicy::LOAD_MODIFIER; - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Tuple type for scanning partition membership flags - typedef PartitionScanTuple PartitionScanTuple; - - // Tile status descriptor type - typedef ScanTileDescriptor ScanTileDescriptorT; - - // Block scan type for scanning membership flag scan_tuples - typedef BlockScan< - PartitionScanTuple, - BlockPartitionTilesPolicy::BLOCK_THREADS, - BlockPartitionTilesPolicy::SCAN_ALGORITHM> BlockScanT; - - // Callback type for obtaining inter-tile prefix during block scan - typedef DeviceScanBlockPrefixOp InterblockPrefixOp; - - // Shared memory type for this threadblock - struct TempStorage - { - typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - SizeT tile_idx; // Shared tile index - }; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - TempStorage &temp_storage; ///< Reference to temp_storage - InputIteratorRA d_in; ///< Input data - OutputIteratorRA d_out; ///< Output data - ScanTileDescriptorT *d_tile_status; ///< Global list of tile status - PredicateOp pred_op; ///< Unary predicate operator indicating membership in the first partition - SizeT num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockPartitionTiles( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data - OutputIteratorRA d_out, ///< Output data - ScanTileDescriptorT *d_tile_status, ///< Global list of tile status - PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition - SizeT num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out(d_out), - d_tile_status(d_tile_status), - pred_op(pred_op), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Domino scan - //--------------------------------------------------------------------- - - /** - * Process a tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - int tile_idx, ///< Tile index - SizeT block_offset, ///< Tile offset - PartitionScanTuple &partition_ends) ///< Running total - { - T items[ITEMS_PER_THREAD]; - PartitionScanTuple scan_tuples[ITEMS_PER_THREAD]; - - // Load 
items - int valid_items = num_items - block_offset; - if (FULL_TILE) - LoadStriped(threadIdx.x, d_in + block_offset, items); - else - LoadStriped(threadIdx.x, d_in + block_offset, items, valid_items); - - // Prevent hoisting -// __syncthreads(); -// __threadfence_block(); - - // Set partition membership flags in scan scan_tuples - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scan_tuples[ITEM].SetFlags(pred_op, items[ITEM]); - } - - // Perform inclusive scan over scan scan_tuples - PartitionScanTuple block_aggregate; - if (tile_idx == 0) - { - BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate); - partition_ends = block_aggregate; - - // Update tile status if there are successor tiles - if (FULL_TILE && (threadIdx.x == 0)) - ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); - } - else - { - InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, Sum(), tile_idx); - BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, Sum(), block_aggregate, prefix_op); - partition_ends = prefix_op.inclusive_prefix; - } - - // Scatter items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Scatter if not out-of-bounds - if (FULL_TILE || (threadIdx.x + (ITEM * BLOCK_THREADS) < valid_items)) - { - scan_tuples[ITEM].Scatter(pred_op, items[ITEM], d_out, num_items); - } - } - } - - - /** - * Dequeue and scan tiles of items as part of a domino scan - */ - __device__ __forceinline__ void ConsumeTiles( - GridQueue queue, ///< [in] Queue descriptor for assigning tiles of work to thread blocks - SizeT num_tiles, ///< [in] Total number of input tiles - PartitionScanTuple &partition_ends, ///< [out] Running partition end offsets - bool &is_last_tile) ///< [out] Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input) - { -#if CUB_PTX_ARCH < 200 - - // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks) - int tile_idx = blockIdx.x; - SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; - - if (block_offset + TILE_ITEMS <= num_items) - { - ConsumeTile(tile_idx, block_offset, partition_ends); - } - else if (block_offset < num_items) - { - ConsumeTile(tile_idx, block_offset, partition_ends); - } - is_last_tile = (tile_idx == num_tiles - 1); - -#else - - // Get first tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - int tile_idx = temp_storage.tile_idx; - SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; - - while (block_offset + TILE_ITEMS <= num_items) - { - // Consume full tile - ConsumeTile(tile_idx, block_offset, partition_ends); - is_last_tile = (tile_idx == num_tiles - 1); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; - block_offset = SizeT(TILE_ITEMS) * tile_idx; - } - - // Consume a partially-full tile - if (block_offset < num_items) - { - ConsumeTile(tile_idx, block_offset, partition_ends); - is_last_tile = (tile_idx == num_tiles - 1); - } -#endif - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh deleted file mode 100644 index 91d628e..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_downsweep_tiles.cuh +++ 
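// ---------------------------------------------------------------------------
// Illustrative aside (not part of the deleted files): the two-partition
// PartitionScanTuple above scatters a selected item to (inclusive count of
// selected items) - 1 and a rejected item to num_items - (inclusive count of
// rejected items), so the second partition grows backward from the tail.
// A host-side C++ sketch of that addressing rule with hypothetical names
// (a sequential loop stands in for the tile-wide inclusive scan):
// ---------------------------------------------------------------------------
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> in  = {5, 2, 9, 4, 7, 1};
    std::vector<int> out(in.size());
    auto pred = [](int v) { return v < 5; };     // membership in the first partition

    int selected = 0;                            // running tuple .x
    int rejected = 0;                            // running tuple .y
    const int num_items = static_cast<int>(in.size());

    for (int v : in)
    {
        if (pred(v))
            out[++selected - 1] = v;             // first partition fills from the front
        else
            out[num_items - ++rejected] = v;     // second partition fills from the back
    }

    for (int v : out) printf("%d ", v);          // 2 4 1 7 9 5
    printf("\n");
    return 0;
}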
/dev/null @@ -1,713 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * BlockRadixSortDownsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep. - */ - - -#pragma once - -#include "../../thread/thread_load.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_store.cuh" -#include "../../block/block_radix_rank.cuh" -#include "../../block/block_exchange.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Types of scattering strategies - */ -enum RadixSortScatterAlgorithm -{ - RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins - RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins -}; - - -/** - * Tuning policy for BlockRadixSortDownsweepTiles - */ -template < - int _BLOCK_THREADS, ///< The number of threads per CTA - int _ITEMS_PER_THREAD, ///< The number of consecutive downsweep keys to process per thread - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - PtxLoadModifier _LOAD_MODIFIER, ///< The PTX cache-modifier to use for loads - bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure - bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. 
See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. - BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The cub::BlockScanAlgorithm algorithm to use - RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use - cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct BlockRadixSortDownsweepTilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, - RADIX_BITS = _RADIX_BITS, - MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; - static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; - static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; - - typedef BlockRadixSortDownsweepTilesPolicy< - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM, - LOAD_MODIFIER, - EXCHANGE_TIME_SLICING, - MEMOIZE_OUTER_SCAN, - INNER_SCAN_ALGORITHM, - SCATTER_ALGORITHM, - SMEM_CONFIG, - CUB_MAX(1, RADIX_BITS - 1)> AltPolicy; -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * CTA-wide "downsweep" abstraction for distributing keys from - * a range of input tiles. - */ -template < - typename BlockRadixSortDownsweepTilesPolicy, - typename Key, - typename Value, - typename SizeT> -struct BlockRadixSortDownsweepTiles -{ - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - // Appropriate unsigned-bits representation of Key - typedef typename Traits::UnsignedBits UnsignedBits; - - static const UnsignedBits MIN_KEY = Traits::MIN_KEY; - static const UnsignedBits MAX_KEY = Traits::MAX_KEY; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::LOAD_ALGORITHM; - static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortDownsweepTilesPolicy::LOAD_MODIFIER; - static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::INNER_SCAN_ALGORITHM; - static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRadixSortDownsweepTilesPolicy::SCATTER_ALGORITHM; - static const cudaSharedMemConfig SMEM_CONFIG = BlockRadixSortDownsweepTilesPolicy::SMEM_CONFIG; - - enum - { - BLOCK_THREADS = BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRadixSortDownsweepTilesPolicy::ITEMS_PER_THREAD, - EXCHANGE_TIME_SLICING = BlockRadixSortDownsweepTilesPolicy::EXCHANGE_TIME_SLICING, - RADIX_BITS = BlockRadixSortDownsweepTilesPolicy::RADIX_BITS, - MEMOIZE_OUTER_SCAN = BlockRadixSortDownsweepTilesPolicy::MEMOIZE_OUTER_SCAN, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - KEYS_ONLY = Equals::VALUE, - - WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - BYTES_PER_SIZET = sizeof(SizeT), - LOG_BYTES_PER_SIZET = Log2::VALUE, - - LOG_SMEM_BANKS = PtxArchProps::LOG_SMEM_BANKS, - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - DIGITS_PER_SCATTER_PASS = 
BLOCK_THREADS / SMEM_BANKS, - SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS, - - LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS, - STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS, - }; - - // BlockRadixRank type - typedef BlockRadixRank< - BLOCK_THREADS, - RADIX_BITS, - MEMOIZE_OUTER_SCAN, - INNER_SCAN_ALGORITHM, - SMEM_CONFIG> BlockRadixRank; - - // BlockLoad type (keys) - typedef BlockLoad< - UnsignedBits*, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM, - LOAD_MODIFIER, - EXCHANGE_TIME_SLICING> BlockLoadKeys; - - // BlockLoad type (values) - typedef BlockLoad< - Value*, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM, - LOAD_MODIFIER, - EXCHANGE_TIME_SLICING> BlockLoadValues; - - // BlockExchange type (keys) - typedef BlockExchange< - UnsignedBits, - BLOCK_THREADS, - ITEMS_PER_THREAD, - EXCHANGE_TIME_SLICING> BlockExchangeKeys; - - // BlockExchange type (values) - typedef BlockExchange< - Value, - BLOCK_THREADS, - ITEMS_PER_THREAD, - EXCHANGE_TIME_SLICING> BlockExchangeValues; - - - /** - * Shared memory storage layout - */ - struct _TempStorage - { - SizeT relative_bin_offsets[RADIX_DIGITS + 1]; - bool short_circuit; - - union - { - typename BlockRadixRank::TempStorage ranking; - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadValues::TempStorage load_values; - typename BlockExchangeKeys::TempStorage exchange_keys; - typename BlockExchangeValues::TempStorage exchange_values; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Input and output device pointers - UnsignedBits *d_keys_in; - UnsignedBits *d_keys_out; - Value *d_values_in; - Value *d_values_out; - - // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) - SizeT bin_offset; - - // The least-significant bit position of the current digit to extract - int current_bit; - - // Whether to short-ciruit - bool short_circuit; - - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /** - * Decodes given keys to lookup digit offsets in shared memory - */ - __device__ __forceinline__ void DecodeRelativeBinOffsets( - UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD]) - { - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, RADIX_BITS); - - // Lookup base digit offset from shared memory - relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit]; - } - } - - - /** - * Scatter ranked items to global memory - */ - template - __device__ __forceinline__ void ScatterItems( - T (&items)[ITEMS_PER_THREAD], - int (&local_ranks)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - T *d_out, - SizeT valid_items) - { - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Scatter if not out-of-bounds - if (FULL_TILE || (local_ranks[ITEM] < valid_items)) - { - d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM]; - } - } - } - - - /** - * Scatter ranked keys directly to global memory - */ - template - __device__ __forceinline__ void ScatterKeys( - UnsignedBits 
(&twiddled_keys)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT valid_items, - Int2Type scatter_algorithm) - { - // Compute scatter offsets - DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets); - - // Untwiddle keys before outputting - UnsignedBits keys[ITEMS_PER_THREAD]; - - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - keys[KEY] = Traits::TwiddleOut(twiddled_keys[KEY]); - } - - // Scatter to global - ScatterItems(keys, ranks, relative_bin_offsets, d_keys_out, valid_items); - } - - - /** - * Scatter ranked keys through shared memory, then to global memory - */ - template - __device__ __forceinline__ void ScatterKeys( - UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT valid_items, - Int2Type scatter_algorithm) - { - // Exchange keys through shared memory - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks); - - // Compute striped local ranks - int local_ranks[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); - } - - // Scatter directly - ScatterKeys( - twiddled_keys, - relative_bin_offsets, - local_ranks, - valid_items, - Int2Type()); - } - - - /** - * Scatter ranked values directly to global memory - */ - template - __device__ __forceinline__ void ScatterValues( - Value (&values)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT valid_items, - Int2Type scatter_algorithm) - { - // Scatter to global - ScatterItems(values, ranks, relative_bin_offsets, d_values_out, valid_items); - } - - - /** - * Scatter ranked values through shared memory, then to global memory - */ - template - __device__ __forceinline__ void ScatterValues( - Value (&values)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT valid_items, - Int2Type scatter_algorithm) - { - __syncthreads(); - - // Exchange keys through shared memory - BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); - - // Compute striped local ranks - int local_ranks[ITEMS_PER_THREAD]; - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); - } - - // Scatter directly - ScatterValues( - values, - relative_bin_offsets, - local_ranks, - valid_items, - Int2Type()); - } - - - /** - * Load a tile of items (specialized for full tile) - */ - template - __device__ __forceinline__ void LoadItems( - BlockLoadT &block_loader, - T (&items)[ITEMS_PER_THREAD], - T *d_in, - SizeT valid_items, - Int2Type is_full_tile) - { - block_loader.Load(d_in, items); - } - - - /** - * Load a tile of items (specialized for partial tile) - */ - template - __device__ __forceinline__ void LoadItems( - BlockLoadT &block_loader, - T (&items)[ITEMS_PER_THREAD], - T *d_in, - SizeT valid_items, - Int2Type is_full_tile) - { - block_loader.Load(d_in, items, valid_items); - } - - - /** - * Truck along associated values - */ - template - __device__ __forceinline__ void GatherScatterValues( - _Value (&values)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT block_offset, - SizeT valid_items) - { - BlockLoadValues loader(temp_storage.load_values); - LoadItems( - loader, - values, - d_values_in + block_offset, - 
valid_items, - Int2Type()); - - ScatterValues( - values, - relative_bin_offsets, - ranks, - valid_items, - Int2Type()); - } - - - /** - * Truck along associated values (specialized for key-only sorting) - */ - template - __device__ __forceinline__ void GatherScatterValues( - NullType (&values)[ITEMS_PER_THREAD], - SizeT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - SizeT block_offset, - SizeT valid_items) - {} - - - /** - * Process tile - */ - template - __device__ __forceinline__ void ProcessTile( - SizeT block_offset, - const SizeT &valid_items = TILE_ITEMS) - { - // Per-thread tile data - UnsignedBits keys[ITEMS_PER_THREAD]; // Keys - UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys - int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA - SizeT relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit - - if (LOAD_ALGORITHM != BLOCK_LOAD_DIRECT) __syncthreads(); - - // Assign max-key to all keys - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - keys[ITEM] = MAX_KEY; - } - - // Load tile of keys - BlockLoadKeys loader(temp_storage.load_keys); - LoadItems( - loader, - keys, - d_keys_in + block_offset, - valid_items, - Int2Type()); - - __syncthreads(); - - // Twiddle key bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - twiddled_keys[KEY] = Traits::TwiddleIn(keys[KEY]); - } - - // Rank the twiddled keys - int inclusive_digit_prefix; - BlockRadixRank(temp_storage.ranking).RankKeys( - twiddled_keys, - ranks, - current_bit, - inclusive_digit_prefix); - - // Update global scatter base offsets for each digit - if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS)) - { - int exclusive_digit_prefix; - - // Get exclusive digit prefix from inclusive prefix -#if CUB_PTX_ARCH >= 300 - exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1); - if (threadIdx.x == 0) - exclusive_digit_prefix = 0; -#else - volatile int* exchange = reinterpret_cast(temp_storage.relative_bin_offsets); - exchange[threadIdx.x] = 0; - exchange[threadIdx.x + 1] = inclusive_digit_prefix; - exclusive_digit_prefix = exchange[threadIdx.x]; -#endif - - bin_offset -= exclusive_digit_prefix; - temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset; - bin_offset += inclusive_digit_prefix; - } - - __syncthreads(); - - // Scatter keys - ScatterKeys(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type()); - - // Gather/scatter values - Value values[ITEMS_PER_THREAD]; - GatherScatterValues(values, relative_bin_offsets, ranks, block_offset, valid_items); - } - - - /** - * Copy tiles within the range of input - */ - template - __device__ __forceinline__ void Copy( - T *d_in, - T *d_out, - SizeT block_offset, - SizeT block_oob) - { - // Simply copy the input - while (block_offset + TILE_ITEMS <= block_oob) - { - T items[ITEMS_PER_THREAD]; - - LoadStriped(threadIdx.x, d_in + block_offset, items); - __syncthreads(); - StoreStriped(threadIdx.x, d_out + block_offset, items); - - block_offset += TILE_ITEMS; - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_oob) - { - SizeT valid_items = block_oob - block_offset; - - T items[ITEMS_PER_THREAD]; - - LoadStriped(threadIdx.x, d_in + block_offset, items, valid_items); - __syncthreads(); - StoreStriped(threadIdx.x, d_out + block_offset, items, valid_items); - } - } - - - /** - * Copy tiles within the range of input (specialized for NullType) 
- */ - __device__ __forceinline__ void Copy( - NullType *d_in, - NullType *d_out, - SizeT block_offset, - SizeT block_oob) - {} - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockRadixSortDownsweepTiles( - TempStorage &temp_storage, - SizeT bin_offset, - Key *d_keys_in, - Key *d_keys_out, - Value *d_values_in, - Value *d_values_out, - int current_bit) - : - temp_storage(temp_storage.Alias()), - bin_offset(bin_offset), - d_keys_in(reinterpret_cast(d_keys_in)), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_in(d_values_in), - d_values_out(d_values_out), - current_bit(current_bit), - short_circuit(false) - {} - - - /** - * Constructor - */ - __device__ __forceinline__ BlockRadixSortDownsweepTiles( - TempStorage &temp_storage, - SizeT num_items, - SizeT *d_spine, - Key *d_keys_in, - Key *d_keys_out, - Value *d_values_in, - Value *d_values_out, - int current_bit) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_in(d_values_in), - d_values_out(d_values_out), - current_bit(current_bit) - { - // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) - if (threadIdx.x < RADIX_DIGITS) - { - // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size - SizeT first_block_bin_offset = d_spine[gridDim.x * threadIdx.x]; - int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); - this->temp_storage.short_circuit = WarpAll(predicate); - - // Load my block's bin offset for my bin - bin_offset = d_spine[(gridDim.x * threadIdx.x) + blockIdx.x]; - } - - __syncthreads(); - - short_circuit = this->temp_storage.short_circuit; - } - - - /** - * Distribute keys from a segment of input tiles. - */ - __device__ __forceinline__ void ProcessTiles( - SizeT block_offset, - const SizeT &block_oob) - { - if (short_circuit) - { - // Copy keys - Copy(d_keys_in, d_keys_out, block_offset, block_oob); - - // Copy values - Copy(d_values_in, d_values_out, block_offset, block_oob); - } - else - { - // Process full tiles of tile_items - while (block_offset + TILE_ITEMS <= block_oob) - { - ProcessTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_oob) - { - ProcessTile(block_offset, block_oob - block_offset); - } - } - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh deleted file mode 100644 index 22f8c9c..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_radix_sort_upsweep_tiles.cuh +++ /dev/null @@ -1,464 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep. - */ - -#pragma once - -#include "../../thread/thread_reduce.cuh" -#include "../../thread/thread_load.cuh" -#include "../../block/block_load.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Tuning policy for BlockRadixSortUpsweepTiles - */ -template < - int _BLOCK_THREADS, ///< The number of threads per CTA - int _ITEMS_PER_THREAD, ///< The number of items to load per thread per tile - PtxLoadModifier _LOAD_MODIFIER, ///< Load cache-modifier - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct BlockRadixSortUpsweepTilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - RADIX_BITS = _RADIX_BITS, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - - typedef BlockRadixSortUpsweepTilesPolicy< - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_MODIFIER, - CUB_MAX(1, RADIX_BITS - 1)> AltPolicy; -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockRadixSortUpsweepTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep. - * - * Computes radix digit histograms over a range of input tiles. 
- */ -template < - typename BlockRadixSortUpsweepTilesPolicy, - typename Key, - typename SizeT> -struct BlockRadixSortUpsweepTiles -{ - - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - typedef typename Traits::UnsignedBits UnsignedBits; - - // Integer type for digit counters (to be packed into words of PackedCounters) - typedef unsigned char DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef unsigned int PackedCounter; - - static const PtxLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepTilesPolicy::LOAD_MODIFIER; - - enum - { - RADIX_BITS = BlockRadixSortUpsweepTilesPolicy::RADIX_BITS, - BLOCK_THREADS = BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS, - KEYS_PER_THREAD = BlockRadixSortUpsweepTilesPolicy::ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = PtxArchProps::LOG_WARP_THREADS, - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // To prevent counter overflow, we must periodically unpack and aggregate the - // digit counters back into registers. Each counter lane is assigned to a - // warp for aggregation. - - LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), - - // Unroll tiles in batches without risk of counter overflow - UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), - UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, - }; - - - - /** - * Shared memory storage layout - */ - struct _TempStorage - { - union - { - DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS]; - SizeT digit_partials[RADIX_DIGITS][WARP_THREADS + 1]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields (aggregate state bundle) - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Thread-local counters for periodically aggregating composite-counter lanes - SizeT local_counts[LANES_PER_WARP][PACKING_RATIO]; - - // Input and output device pointers - UnsignedBits *d_keys_in; - - // The least-significant bit position of the current digit to extract - int current_bit; - - - - //--------------------------------------------------------------------- - // Helper structure for templated iteration - //--------------------------------------------------------------------- - - // Iterate - template - struct Iterate - { - enum { - HALF = (MAX / 2), - }; - - // BucketKeys - static __device__ __forceinline__ void BucketKeys( - BlockRadixSortUpsweepTiles &cta, - UnsignedBits keys[KEYS_PER_THREAD]) - { - cta.Bucket(keys[COUNT]); - - // Next - Iterate::BucketKeys(cta, keys); - } - - // ProcessTiles - static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset) - { - // Next - Iterate<1, HALF>::ProcessTiles(cta, block_offset); - 
Iterate<1, MAX - HALF>::ProcessTiles(cta, block_offset + (HALF * TILE_ITEMS)); - } - }; - - // Terminate - template - struct Iterate - { - // BucketKeys - static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweepTiles &cta, UnsignedBits keys[KEYS_PER_THREAD]) {} - - // ProcessTiles - static __device__ __forceinline__ void ProcessTiles(BlockRadixSortUpsweepTiles &cta, SizeT block_offset) - { - cta.ProcessFullTile(block_offset); - } - }; - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /** - * Decode a key and increment corresponding smem digit counter - */ - __device__ __forceinline__ void Bucket(UnsignedBits key) - { - // Perform transform op - UnsignedBits converted_key = Traits::TwiddleIn(key); - - // Add in sub-counter offset - UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO); - - // Add in row offset - UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES); - - // Increment counter - temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++; - - } - - - /** - * Reset composite counters - */ - __device__ __forceinline__ void ResetDigitCounters() - { - #pragma unroll - for (int LANE = 0; LANE < COUNTER_LANES; LANE++) - { - temp_storage.packed_counters[LANE][threadIdx.x] = 0; - } - } - - - /** - * Reset the unpacked counters in each thread - */ - __device__ __forceinline__ void ResetUnpackedCounters() - { - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - local_counts[LANE][UNPACKED_COUNTER] = 0; - } - } - } - - - /** - * Extracts and aggregates the digit counters for each counter lane - * owned by this warp - */ - __device__ __forceinline__ void UnpackDigitCounts() - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); - - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - const int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - #pragma unroll - for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - SizeT counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; - local_counts[LANE][UNPACKED_COUNTER] += counter; - } - } - } - } - } - - - /** - * Places unpacked counters into smem for final digit reduction - */ - __device__ __forceinline__ void ReduceUnpackedCounts(SizeT &bin_count) - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); - - // Place unpacked digit counters in shared memory - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - int digit_row = counter_lane << LOG_PACKING_RATIO; - - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] = - local_counts[LANE][UNPACKED_COUNTER]; - } - } - } - - __syncthreads(); - - // Rake-reduce bin_count reductions - if (threadIdx.x < RADIX_DIGITS) - { - bin_count = ThreadReduce( - 
temp_storage.digit_partials[threadIdx.x], - Sum()); - } - } - - - /** - * Processes a single, full tile - */ - __device__ __forceinline__ void ProcessFullTile(SizeT block_offset) - { - // Tile of keys - UnsignedBits keys[KEYS_PER_THREAD]; - - LoadStriped(threadIdx.x, d_keys_in + block_offset, keys); - - // Prevent hoisting -// __threadfence_block(); -// __syncthreads(); - - // Bucket tile of keys - Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); - } - - - /** - * Processes a single load (may have some threads masked off) - */ - __device__ __forceinline__ void ProcessPartialTile( - SizeT block_offset, - const SizeT &block_oob) - { - // Process partial tile if necessary using single loads - block_offset += threadIdx.x; - while (block_offset < block_oob) - { - // Load and bucket key - UnsignedBits key = ThreadLoad(d_keys_in + block_offset); - Bucket(key); - block_offset += BLOCK_THREADS; - } - } - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockRadixSortUpsweepTiles( - TempStorage &temp_storage, - Key *d_keys_in, - int current_bit) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit) - {} - - - /** - * Compute radix digit histograms from a segment of input tiles. - */ - __device__ __forceinline__ void ProcessTiles( - SizeT block_offset, - const SizeT &block_oob, - SizeT &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads) - { - // Reset digit counters in smem and unpacked counters in registers - ResetDigitCounters(); - ResetUnpackedCounters(); - - // Unroll batches of full tiles - while (block_offset + UNROLLED_ELEMENTS <= block_oob) - { - Iterate<0, UNROLL_COUNT>::ProcessTiles(*this, block_offset); - block_offset += UNROLLED_ELEMENTS; - - __syncthreads(); - - // Aggregate back into local_count registers to prevent overflow - UnpackDigitCounts(); - - __syncthreads(); - - // Reset composite counters in lanes - ResetDigitCounters(); - } - - // Unroll single full tiles - while (block_offset + TILE_ITEMS <= block_oob) - { - ProcessFullTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Process partial tile if necessary - ProcessPartialTile( - block_offset, - block_oob); - - __syncthreads(); - - // Aggregate back into local_count registers - UnpackDigitCounts(); - - __syncthreads(); - - // Final raking reduction of counts by bin - ReduceUnpackedCounts(bin_count); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh deleted file mode 100644 index 99e1980..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_reduce_by_key_tiles.cuh +++ /dev/null @@ -1,399 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. - */ - -#pragma once - -#include - -#include "scan_tiles_types.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_discontinuity.cuh" -#include "../../block/block_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Utility data types - ******************************************************************************/ - -/// Scan tuple data type for reduce-value-by-key -template -struct ReduceByKeyuple -{ - Value value; // Initially set as value, contains segment aggregate after prefix scan - SizeT flag; // Initially set as a tail flag, contains scatter offset after prefix scan -}; - - -/// Binary reduce-by-key scan operator -template -struct ReduceByKeyScanOp -{ - /// Reduction functor - ReductionOp reduction_op; - - /// Constructor - ReduceByKeyScanOp(ReductionOp reduction_op) : reduction_op(reduction_op) - {} - - /// Binary scan operator - template - __device__ __forceinline__ ReduceByKeyuple operator()( - const ReduceByKeyuple &first, - const ReduceByKeyuple &second) - { - ReduceByKeyuple retval; - retval.val = (second.flag) ? 
second.val : reduction_op(first.val, second.val); - retval.flag = first.flag + second.flag; - return retval; - } -}; - - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Tuning policy for BlockReduceByKeyiles - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - BlockLoadAlgorithm _LOAD_ALGORITHM, - bool _LOAD_WARP_TIME_SLICING, - PtxLoadModifier _LOAD_MODIFIER, - BlockScanAlgorithm _SCAN_ALGORITHM> -struct BlockReduceByKeyilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockReduceByKeyiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. - */ -template < - typename BlockReduceByKeyilesPolicy, ///< Tuning policy - typename KeyInputIteratorRA, ///< Random-access input iterator type for keys - typename KeyOutputIteratorRA, ///< Random-access output iterator type for keys - typename ValueInputIteratorRA, ///< Random-access input iterator type for values - typename ValueOutputIteratorRA, ///< Random-access output iterator type for values - typename ReductionOp, ///< Reduction functor type - typename SizeT> ///< Offset integer type -struct BlockReduceByKeyiles -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data types of input iterators - typedef typename std::iterator_traits::value_type Key; // Key data type - typedef typename std::iterator_traits::value_type Value; // Value data type - - // Constants - enum - { - BLOCK_THREADS = BlockReduceByKeyilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - STATUS_PADDING = PtxArchProps::WARP_THREADS, - }; - - // Block load type for keys - typedef BlockLoad< - KeyInputIteratorRA, - BlockReduceByKeyilesPolicy::BLOCK_THREADS, - BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, - BlockReduceByKeyilesPolicy::LOAD_ALGORITHM, - BlockReduceByKeyilesPolicy::LOAD_MODIFIER, - BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadKeys; - - // Block load type for values - typedef BlockLoad< - ValueInputIteratorRA, - BlockReduceByKeyilesPolicy::BLOCK_THREADS, - BlockReduceByKeyilesPolicy::ITEMS_PER_THREAD, - BlockReduceByKeyilesPolicy::LOAD_ALGORITHM, - BlockReduceByKeyilesPolicy::LOAD_MODIFIER, - BlockReduceByKeyilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadValues; - - // Block discontinuity type for setting tail flags - typedef BlockDiscontinuity BlockDiscontinuityKeys; - - // Scan tuple type - typedef ReduceByKeyuple ScanTuple; - - // Tile status descriptor type - typedef ScanTileDescriptor ScanTileDescriptorT; - - // Block scan functor type - typedef ReduceByKeyScanOp ScanOp; - - // Block scan prefix callback type - typedef DeviceScanBlockPrefixOp PrefixCallback; - - // Block scan type - typedef BlockScan< - 
ScanTuple, - BlockReduceByKeyilesPolicy::BLOCK_THREADS, - BlockReduceByKeyilesPolicy::SCAN_ALGORITHM> BlockScanT; - - /// Shared memory type for this threadblock - struct _TempStorage - { - union - { - typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading tiles of keys - typename BlockLoadValues::TempStorage load_values; // Smem needed for loading tiles of values - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename PrefixCallback::TempStorage prefix; // Smem needed for cooperative prefix callback - }; - }; - - typename BlockDiscontinuityKeys::TempStorage flagging; // Smem needed for tile scanning - SizeT tile_idx; // Shared tile index - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage &temp_storage; ///< Reference to temp_storage - KeyInputIteratorRA d_keys_in; ///< Key input data - KeyOutputIteratorRA d_keys_out; ///< Key output data - ValueInputIteratorRA d_values_in; ///< Value input data - ValueOutputIteratorRA d_values_out; ///< Value output data - ScanTileDescriptorT *d_tile_status; ///< Global list of tile status - ScanOp scan_op; ///< Binary scan operator - int num_tiles; ///< Total number of input tiles for the entire problem - SizeT num_items; ///< Total number of scan items for the entire problem - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockReduceByKeyiles( - TempStorage &temp_storage, ///< Reference to temp_storage - KeyInputIteratorRA d_keys_in, ///< Key input data - KeyOutputIteratorRA d_keys_out, ///< Key output data - ValueInputIteratorRA d_values_in, ///< Value input data - ValueOutputIteratorRA d_values_out, ///< Value output data - ScanTileDescriptorT *d_tile_status, ///< Global list of tile status - ReductionOp reduction_op, ///< Binary scan operator - int num_tiles, ///< Total number of input tiles for the entire problem - SizeT num_items) ///< Total number of scan items for the entire problem - : - temp_storage(temp_storage.Alias()), - d_keys_in(d_keys_in), - d_keys_out(d_keys_out), - d_values_in(d_values_in), - d_values_out(d_values_out), - d_tile_status(d_tile_status), - scan_op(reduction_op), - num_tiles(num_tiles), - num_items(num_items) - {} - - - /** - * Process a tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - int tile_idx, ///< Tile index - SizeT block_offset, ///< Tile offset - int valid_items = TILE_ITEMS) ///< Number of valid items in the tile - { - Key keys[ITEMS_PER_THREAD]; - Value values[ITEMS_PER_THREAD]; - int tail_flags[ITEMS_PER_THREAD]; - ScanTuple scan_tuples[ITEMS_PER_THREAD]; - - // Load keys - if (FULL_TILE) - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - else - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, valid_items); - - // Set tail flags - if (tile_idx == num_tiles - 1) - { - // Last tile - BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality()); - } - else - { - // Preceding tiles require the first element of the next tile - Key tile_suffix_item; - if (threadIdx.x == 0) - tile_suffix_item = d_keys_in[block_offset + TILE_ITEMS]; - 
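At this point in ConsumeTile the keys are loaded and the first key of the following tile has been fetched, so tail flags can be set correctly across the tile boundary: the last item of every run of equal keys is flagged, the (value, flag) tuples are prefix-scanned so each tail ends up carrying its segment's aggregate and a scatter offset, and only the flagged items are written out. A serial C++ sketch of the same reduce-by-key idea, with illustrative names and a plain loop standing in for the cooperative block-wide scan:

#include <cstdio>
#include <vector>

// Illustrative only: serial reduce-by-key. Equal adjacent keys form a segment;
// one (key, aggregate) pair is emitted per segment, mirroring what the
// tail-flag + tuple-scan + scatter steps accomplish cooperatively on the GPU.
template <typename Key, typename Value, typename ReductionOp>
void ReduceByKeySerial(const std::vector<Key>& keys,
                       const std::vector<Value>& values,
                       std::vector<Key>& keys_out,
                       std::vector<Value>& values_out,
                       ReductionOp reduction_op)
{
    for (std::size_t i = 0; i < keys.size(); ++i)
    {
        Value aggregate = values[i];
        // Keep accumulating while the next key equals the current one
        // (i.e., no tail flag yet).
        while (i + 1 < keys.size() && keys[i + 1] == keys[i])
        {
            aggregate = reduction_op(aggregate, values[i + 1]);
            ++i;
        }
        // Item i is the tail of its segment: scatter key and aggregate.
        keys_out.push_back(keys[i]);
        values_out.push_back(aggregate);
    }
}

int main()
{
    std::vector<int> keys   = {1, 1, 2, 2, 2, 5};
    std::vector<int> values = {1, 2, 3, 4, 5, 6};
    std::vector<int> k_out, v_out;
    ReduceByKeySerial(keys, values, k_out, v_out,
                      [](int a, int b) { return a + b; });
    for (std::size_t i = 0; i < k_out.size(); ++i)
        std::printf("key %d -> %d\n", k_out[i], v_out[i]);   // 1->3, 2->12, 5->6
    return 0;
}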
- BlockDiscontinuityKeys(temp_storage.flagging).FlagTails(tail_flags, keys, Equality(), tile_suffix_item); - } - - __syncthreads(); - - // Load values - if (FULL_TILE) - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); - else - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, valid_items); - - // Assemble scan tuples - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scan_tuples[ITEM].value = values[ITEM]; - scan_tuples[ITEM].flag = tail_flags[ITEM]; - } - - __syncthreads(); - - // Perform inclusive prefix scan - ScanTuple block_aggregate; - if (tile_idx == 0) - { - // Without prefix callback - BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate); - - // Update tile status - if (threadIdx.x == 0) - ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); - } - else - { - // With prefix callback - PrefixCallback prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx); - BlockScanT(temp_storage.scan).InclusiveScan(scan_tuples, scan_tuples, scan_op, block_aggregate, prefix_op); - } - - // Scatter flagged keys and values to output - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int tile_item = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - - // Set the head flag on the last item in a partially-full tile - if (!FULL_TILE && (tile_item == valid_items - 1)) - tail_flags[ITEM] = 1; - - // Decrement scatter offset - scan_tuples[ITEM].flag--; - - // Scatter key and aggregate value if flagged and in range - if ((FULL_TILE || (tile_item < valid_items)) && (tail_flags[ITEM])) - { - d_keys_out[scan_tuples[ITEM].flag] = keys[ITEM]; - d_values_out[scan_tuples[ITEM].flag] = scan_tuples[ITEM].value; - } - } - } - - - - /** - * Dequeue and scan tiles of elements - */ - __device__ __forceinline__ void ProcessTiles(GridQueue queue) ///< Queue descriptor for assigning tiles of work to thread blocks - { - // We give each thread block at least one tile of input - int tile_idx = blockIdx.x; - - // Consume full tiles of input - SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; - while (block_offset + TILE_ITEMS <= num_items) - { - ConsumeTile(tile_idx, block_offset); - - // Get next tile -#if CUB_PTX_ARCH < 200 - // No concurrent kernels allowed, so just stripe tiles - tile_idx += gridDim.x; -#else - // Concurrent kernels are allowed, so we must only use active blocks to dequeue tile indices - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1) + gridDim.x; - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; -#endif - block_offset = SizeT(TILE_ITEMS) * tile_idx; - } - - // Consume a partially-full tile - if (block_offset < num_items) - { - // Consume a partially-full tile - int valid_items = num_items - block_offset; - ConsumeTile(tile_idx, block_offset, valid_items); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh deleted file mode 100644 index a83c098..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_reduce_tiles.cuh +++ /dev/null @@ -1,375 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. - */ - -#pragma once - -#include - -#include "../../block/block_load.cuh" -#include "../../block/block_reduce.cuh" -#include "../../grid/grid_mapping.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../util_vector.cuh" -#include "../../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Tuning policy for BlockReduceTiles - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread per tile of input - int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use - PtxLoadModifier _LOAD_MODIFIER, ///< PTX load modifier - GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks -struct BlockReduceTilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; - static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; -}; - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockReduceTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction. 
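As the comment goes on to note, each thread first folds the values it loads into a private running partial, and only at the very end is one cooperative block-wide reduction performed over those partials. A host-side sketch of that accumulate-privately-then-combine-once shape, with the "threads" replaced by a plain loop and all names chosen here purely for illustration:

#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative only: each "thread" reduces a strided slice of the input into a
// private partial; the partials are combined once at the end, which is the role
// the block-wide reduction plays for the real thread block.
int StripedReduce(const std::vector<int>& items, int num_threads)
{
    std::vector<int> partials(num_threads, 0);
    for (int tid = 0; tid < num_threads; ++tid)
        for (std::size_t i = tid; i < items.size(); i += num_threads)
            partials[tid] += items[i];                 // per-thread running aggregate

    // Final combine of the per-thread partials.
    return std::accumulate(partials.begin(), partials.end(), 0);
}

int main()
{
    std::vector<int> items(100);
    std::iota(items.begin(), items.end(), 1);           // 1..100
    std::printf("sum = %d\n", StripedReduce(items, 8)); // 5050
    return 0;
}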
- * - * Each thread reduces only the values it loads. If \p FIRST_TILE, this - * partial reduction is stored into \p thread_aggregate. Otherwise it is - * accumulated into \p thread_aggregate. - */ -template < - typename BlockReduceTilesPolicy, - typename InputIteratorRA, - typename SizeT, - typename ReductionOp> -struct BlockReduceTiles -{ - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - typedef typename std::iterator_traits::value_type T; // Type of input iterator - typedef VectorHelper VecHelper; // Helper type for vectorizing loads of T - typedef typename VecHelper::Type VectorT; // Vector of T - - // Constants - enum - { - BLOCK_THREADS = BlockReduceTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockReduceTilesPolicy::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH, - - // Can vectorize according to the policy if the input iterator is a native pointer to a built-in primitive - CAN_VECTORIZE = (BlockReduceTilesPolicy::VECTOR_LOAD_LENGTH > 1) && - (IsPointer::VALUE) && - (VecHelper::BUILT_IN), - - }; - - static const PtxLoadModifier LOAD_MODIFIER = BlockReduceTilesPolicy::LOAD_MODIFIER; - static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockReduceTilesPolicy::BLOCK_ALGORITHM; - - // Parameterized BlockReduce primitive - typedef BlockReduce BlockReduceT; - - /// Shared memory type required by this thread block - typedef typename BlockReduceT::TempStorage _TempStorage; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - T thread_aggregate; ///< Each thread's partial reduction - _TempStorage& temp_storage; ///< Reference to temp_storage - InputIteratorRA d_in; ///< Input data to reduce - ReductionOp reduction_op; ///< Binary reduction operator - int first_tile_size; ///< Size of first tile consumed - bool input_aligned; ///< Whether or not input is vector-aligned - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockReduceTiles( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data to reduce - ReductionOp reduction_op) ///< Binary reduction operator - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - reduction_op(reduction_op), - first_tile_size(0), - input_aligned(CAN_VECTORIZE && ((size_t(d_in) & (sizeof(VectorT) - 1)) == 0)) - {} - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - SizeT block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - if (FULL_TILE) - { - T stripe_partial; - - // Load full tile - if (input_aligned) - { - // Alias items as an array of VectorT and load it in striped fashion - enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - - VectorT vec_items[WORDS]; - - // Load striped into vec items - VectorT* alias_ptr = reinterpret_cast(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)); - - #pragma unroll - for (int i = 0; i < WORDS; ++i) - vec_items[i] = 
alias_ptr[BLOCK_THREADS * i]; - - // Reduce items within each thread stripe - stripe_partial = ThreadReduce( - reinterpret_cast(vec_items), - reduction_op); - } - else - { - T items[ITEMS_PER_THREAD]; - - // Load items in striped fashion - LoadStriped(threadIdx.x, d_in + block_offset, items); - - // Reduce items within each thread stripe - stripe_partial = ThreadReduce(items, reduction_op); - } - - // Update running thread aggregate - thread_aggregate = (first_tile_size) ? - reduction_op(thread_aggregate, stripe_partial) : // Update - stripe_partial; // Assign - } - else - { - - // Partial tile - int thread_offset = threadIdx.x; - - if (!first_tile_size && (thread_offset < valid_items)) - { - // Assign thread_aggregate - thread_aggregate = ThreadLoad(d_in + block_offset + thread_offset); - thread_offset += BLOCK_THREADS; - } - - while (thread_offset < valid_items) - { - // Update thread aggregate - T item = ThreadLoad(d_in + block_offset + thread_offset); - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += BLOCK_THREADS; - } - } - - // Set first tile size if necessary - if (!first_tile_size) - first_tile_size = valid_items; - } - - - //--------------------------------------------------------------------- - // Consume a contiguous segment of tiles - //--------------------------------------------------------------------- - - /** - * \brief Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) - SizeT block_oob, ///< [in] Threadblock end offset (exclusive) - T &block_aggregate) ///< [out] Running total - { - // Consume subsequent full tiles of input - while (block_offset + TILE_ITEMS <= block_oob) - { - ConsumeTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (block_offset < block_oob) - { - int valid_items = block_oob - block_offset; - ConsumeTile(block_offset, valid_items); - } - - // Compute block-wide reduction - block_aggregate = (first_tile_size < TILE_ITEMS) ? - BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : - BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); - } - - - /** - * Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT num_items, ///< [in] Total number of global input items - GridEvenShare &even_share, ///< [in] GridEvenShare descriptor - GridQueue &queue, ///< [in,out] GridQueue descriptor - T &block_aggregate, ///< [out] Running total - Int2Type is_even_share) ///< [in] Marker type indicating this is an even-share mapping - { - // Initialize even-share descriptor for this thread block - even_share.BlockInit(); - - // Consume input tiles - ConsumeTiles(even_share.block_offset, even_share.block_oob, block_aggregate); - } - - - //--------------------------------------------------------------------- - // Dynamically consume tiles - //--------------------------------------------------------------------- - - /** - * Dequeue and reduce tiles of items as part of a inter-block scan - */ - __device__ __forceinline__ void ConsumeTiles( - int num_items, ///< Total number of input items - GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks - T &block_aggregate) ///< [out] Running total - { - // Shared dequeue offset - __shared__ SizeT dequeue_offset; - - // We give each thread block at least one tile of input. 
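The queue-based path that follows hands out additional work dynamically: thread 0 of each block atomically "drains" the next chunk from a global counter, shares the resulting offset through shared memory, and the block keeps consuming until the drained offset runs past num_items. A host-side sketch of that atomic work-queue pattern, where std::atomic stands in for the GridQueue drain and the names are illustrative:

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

// Illustrative only: a shared atomic counter hands out tiles to workers on
// demand, the same first-come-first-served pattern the grid queue provides to
// thread blocks. Each worker starts with one guaranteed tile (its own index).
int main()
{
    const int num_tiles   = 20;
    const int num_workers = 4;

    std::atomic<int> next_tile(num_workers);     // tiles 0..num_workers-1 are pre-assigned
    std::vector<int> tiles_done(num_workers, 0);

    auto worker = [&](int id)
    {
        int tile = id;                           // the guaranteed first tile
        while (tile < num_tiles)
        {
            ++tiles_done[id];                    // "consume" the tile
            tile = next_tile.fetch_add(1);       // drain the next tile index
        }
    };

    std::vector<std::thread> pool;
    for (int id = 0; id < num_workers; ++id) pool.emplace_back(worker, id);
    for (std::thread& t : pool) t.join();

    int total = 0;
    for (int id = 0; id < num_workers; ++id) total += tiles_done[id];
    std::printf("tiles consumed: %d of %d\n", total, num_tiles);   // 20 of 20
    return 0;
}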
- SizeT block_offset = blockIdx.x * TILE_ITEMS; - SizeT even_share_base = gridDim.x * TILE_ITEMS; - - if (block_offset + TILE_ITEMS <= num_items) - { - // Consume full tile of input - ConsumeTile(block_offset); - - // Dequeue more tiles - while (true) - { - // Dequeue a tile of items - if (threadIdx.x == 0) - dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base; - - __syncthreads(); - - // Grab tile offset and check if we're done with full tiles - block_offset = dequeue_offset; - - __syncthreads(); - - if (block_offset + TILE_ITEMS > num_items) - break; - - // Consume a full tile - ConsumeTile(block_offset); - } - } - - if (block_offset < num_items) - { - int valid_items = num_items - block_offset; - ConsumeTile(block_offset, valid_items); - } - - // Compute block-wide reduction - block_aggregate = (first_tile_size < TILE_ITEMS) ? - BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : - BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); - } - - - /** - * Dequeue and reduce tiles of items as part of a inter-block scan - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT num_items, ///< [in] Total number of global input items - GridEvenShare &even_share, ///< [in] GridEvenShare descriptor - GridQueue &queue, ///< [in,out] GridQueue descriptor - T &block_aggregate, ///< [out] Running total - Int2Type is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping - { - ConsumeTiles(num_items, queue, block_aggregate); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/block_scan_tiles.cuh b/kokkos/kokkos/TPL/cub/device/block/block_scan_tiles.cuh deleted file mode 100644 index 9802204..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/block_scan_tiles.cuh +++ /dev/null @@ -1,509 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. - */ - -#pragma once - -#include - -#include "scan_tiles_types.cuh" -#include "../../block/block_load.cuh" -#include "../../block/block_store.cuh" -#include "../../block/block_scan.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Tuning policy for BlockScanTiles - */ -template < - int _BLOCK_THREADS, - int _ITEMS_PER_THREAD, - BlockLoadAlgorithm _LOAD_ALGORITHM, - bool _LOAD_WARP_TIME_SLICING, - PtxLoadModifier _LOAD_MODIFIER, - BlockStoreAlgorithm _STORE_ALGORITHM, - bool _STORE_WARP_TIME_SLICING, - BlockScanAlgorithm _SCAN_ALGORITHM> -struct BlockScanTilesPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, - STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const PtxLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockScanTiles implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan. - * - * Implements a single-pass "domino" strategy with adaptive prefix lookback. 
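The single-pass "domino" strategy named here works tile by tile: each tile publishes its local aggregate as soon as it is known (SCAN_TILE_PARTIAL), then looks back across preceding tiles, summing their aggregates until it meets one whose inclusive prefix (SCAN_TILE_PREFIX) is already posted, at which point it can publish its own inclusive prefix and scan its elements. A sequential C++ sketch of that status/lookback bookkeeping follows; the names are illustrative and the parallelism and spin-waiting are deliberately elided.

#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative only: the per-tile status/lookback bookkeeping of a single-pass
// ("domino") scan, run sequentially. On the GPU every tile runs this in its own
// thread block and waits until predecessors post PARTIAL or PREFIX.
enum TileStatus { TILE_INVALID, TILE_PARTIAL, TILE_PREFIX };

struct TileDescriptor { TileStatus status; int value; };

int main()
{
    const int tile_size = 4;
    std::vector<int> items = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
    const int num_tiles = int(items.size()) / tile_size;
    std::vector<TileDescriptor> tile_state(num_tiles, TileDescriptor{TILE_INVALID, 0});

    for (int tile = 0; tile < num_tiles; ++tile)
    {
        int* begin = &items[tile * tile_size];
        int aggregate = std::accumulate(begin, begin + tile_size, 0);

        // 1. Publish the tile aggregate so successors can make progress.
        tile_state[tile] = {TILE_PARTIAL, aggregate};

        // 2. Look back, summing predecessor aggregates until an inclusive
        //    prefix is found (tile 0 needs no prefix at all).
        int exclusive_prefix = 0;
        for (int pred = tile - 1; pred >= 0; --pred)
        {
            exclusive_prefix += tile_state[pred].value;
            if (tile_state[pred].status == TILE_PREFIX) break;
        }

        // 3. Publish this tile's inclusive prefix; later tiles stop here.
        tile_state[tile] = {TILE_PREFIX, exclusive_prefix + aggregate};

        // 4. Scan the tile's items, seeded with the exclusive prefix.
        int running = exclusive_prefix;
        for (int i = 0; i < tile_size; ++i)
        {
            running += begin[i];
            begin[i] = running;                      // inclusive scan output
        }
    }

    std::printf("last element: %d\n", items.back()); // 78 = 1 + 2 + ... + 12
    return 0;
}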
- */ -template < - typename BlockScanTilesPolicy, ///< Tuning policy - typename InputIteratorRA, ///< Input iterator type - typename OutputIteratorRA, ///< Output iterator type - typename ScanOp, ///< Scan functor type - typename Identity, ///< Identity element type (cub::NullType for inclusive scan) - typename SizeT> ///< Offset integer type -struct BlockScanTiles -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Constants - enum - { - INCLUSIVE = Equals::VALUE, // Inclusive scan if no identity type is provided - BLOCK_THREADS = BlockScanTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockScanTilesPolicy::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Block load type - typedef BlockLoad< - InputIteratorRA, - BlockScanTilesPolicy::BLOCK_THREADS, - BlockScanTilesPolicy::ITEMS_PER_THREAD, - BlockScanTilesPolicy::LOAD_ALGORITHM, - BlockScanTilesPolicy::LOAD_MODIFIER, - BlockScanTilesPolicy::LOAD_WARP_TIME_SLICING> BlockLoadT; - - // Block store type - typedef BlockStore< - OutputIteratorRA, - BlockScanTilesPolicy::BLOCK_THREADS, - BlockScanTilesPolicy::ITEMS_PER_THREAD, - BlockScanTilesPolicy::STORE_ALGORITHM, - STORE_DEFAULT, - BlockScanTilesPolicy::STORE_WARP_TIME_SLICING> BlockStoreT; - - // Tile status descriptor type - typedef ScanTileDescriptor ScanTileDescriptorT; - - // Block scan type - typedef BlockScan< - T, - BlockScanTilesPolicy::BLOCK_THREADS, - BlockScanTilesPolicy::SCAN_ALGORITHM> BlockScanT; - - // Callback type for obtaining inter-tile prefix during block scan - typedef DeviceScanBlockPrefixOp InterblockPrefixOp; - - // Shared memory type for this threadblock - struct _TempStorage - { - union - { - typename BlockLoadT::TempStorage load; // Smem needed for tile loading - typename BlockStoreT::TempStorage store; // Smem needed for tile storing - struct - { - typename InterblockPrefixOp::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; - }; - - SizeT tile_idx; // Shared tile index - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage &temp_storage; ///< Reference to temp_storage - InputIteratorRA d_in; ///< Input data - OutputIteratorRA d_out; ///< Output data - ScanOp scan_op; ///< Binary scan operator - Identity identity; ///< Identity element - - - - //--------------------------------------------------------------------- - // Block scan utility methods (first tile) - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); - } - - /** - * Exclusive sum specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).ExclusiveSum(items, items, 
block_aggregate); - } - - /** - * Inclusive scan specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); - } - - /** - * Inclusive sum specialization - */ - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); - } - - //--------------------------------------------------------------------- - // Block scan utility methods (subsequent tiles) - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); - } - - /** - * Exclusive sum specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); - } - - /** - * Inclusive scan specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); - } - - /** - * Inclusive sum specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockScanTiles( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data - OutputIteratorRA d_out, ///< Output data - ScanOp scan_op, ///< Binary scan operator - Identity identity) ///< Identity element - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out(d_out), - scan_op(scan_op), - identity(identity) - {} - - - //--------------------------------------------------------------------- - // Domino scan - //--------------------------------------------------------------------- - - /** - * Process a tile of input (domino scan) - */ - template - __device__ __forceinline__ void ConsumeTile( - SizeT num_items, ///< Total number of input items - int tile_idx, ///< Tile index - SizeT block_offset, ///< Tile offset - ScanTileDescriptorT *d_tile_status) ///< Global list of tile status - { - // Load items - T items[ITEMS_PER_THREAD]; - - if (FULL_TILE) - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); - else - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_items - block_offset); - - __syncthreads(); - - T block_aggregate; - if (tile_idx == 0) - { - ScanBlock(items, scan_op, 
identity, block_aggregate); - - // Update tile status if there are successor tiles - if (FULL_TILE && (threadIdx.x == 0)) - ScanTileDescriptorT::SetPrefix(d_tile_status, block_aggregate); - } - else - { - InterblockPrefixOp prefix_op(d_tile_status, temp_storage.prefix, scan_op, tile_idx); - ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); - } - - __syncthreads(); - - // Store items - if (FULL_TILE) - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); - else - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_items - block_offset); - } - - /** - * Dequeue and scan tiles of items as part of a domino scan - */ - __device__ __forceinline__ void ConsumeTiles( - int num_items, ///< Total number of input items - GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks - ScanTileDescriptorT *d_tile_status) ///< Global list of tile status - { -#if CUB_PTX_ARCH < 200 - - // No concurrent kernels allowed and blocks are launched in increasing order, so just assign one tile per block (up to 65K blocks) - int tile_idx = blockIdx.x; - SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; - - if (block_offset + TILE_ITEMS <= num_items) - ConsumeTile(num_items, tile_idx, block_offset, d_tile_status); - else if (block_offset < num_items) - ConsumeTile(num_items, tile_idx, block_offset, d_tile_status); - -#else - - // Get first tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - int tile_idx = temp_storage.tile_idx; - SizeT block_offset = SizeT(TILE_ITEMS) * tile_idx; - - while (block_offset + TILE_ITEMS <= num_items) - { - // Consume full tile - ConsumeTile(num_items, tile_idx, block_offset, d_tile_status); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; - block_offset = SizeT(TILE_ITEMS) * tile_idx; - } - - // Consume a partially-full tile - if (block_offset < num_items) - { - ConsumeTile(num_items, tile_idx, block_offset, d_tile_status); - } -#endif - - } - - - //--------------------------------------------------------------------- - // Even-share scan - //--------------------------------------------------------------------- - - /** - * Process a tile of input - */ - template < - bool FULL_TILE, - bool FIRST_TILE> - __device__ __forceinline__ void ConsumeTile( - SizeT block_offset, ///< Tile offset - RunningBlockPrefixOp &prefix_op, ///< Running prefix operator - int valid_items = TILE_ITEMS) ///< Number of valid items in the tile - { - // Load items - T items[ITEMS_PER_THREAD]; - - if (FULL_TILE) - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); - else - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items); - - __syncthreads(); - - // Block scan - T block_aggregate; - if (FIRST_TILE) - { - ScanBlock(items, scan_op, identity, block_aggregate); - prefix_op.running_total = block_aggregate; - } - else - { - ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); - } - - __syncthreads(); - - // Store items - if (FULL_TILE) - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); - else - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items); - } - - - /** - * Scan a consecutive share of input tiles - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) - SizeT block_oob) ///< [in] Threadblock end offset (exclusive) - { - RunningBlockPrefixOp 
prefix_op; - - if (block_offset + TILE_ITEMS <= block_oob) - { - // Consume first tile of input (full) - ConsumeTile(block_offset, prefix_op); - block_offset += TILE_ITEMS; - - // Consume subsequent full tiles of input - while (block_offset + TILE_ITEMS <= block_oob) - { - ConsumeTile(block_offset, prefix_op); - block_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (block_offset < block_oob) - { - int valid_items = block_oob - block_offset; - ConsumeTile(block_offset, prefix_op, valid_items); - } - } - else - { - // Consume the first tile of input (partially-full) - int valid_items = block_oob - block_offset; - ConsumeTile(block_offset, prefix_op, valid_items); - } - } - - - /** - * Scan a consecutive share of input tiles, seeded with the specified prefix value - */ - __device__ __forceinline__ void ConsumeTiles( - SizeT block_offset, ///< [in] Threadblock begin offset (inclusive) - SizeT block_oob, ///< [in] Threadblock end offset (exclusive) - T prefix) ///< [in] The prefix to apply to the scan segment - { - RunningBlockPrefixOp prefix_op; - prefix_op.running_total = prefix; - - // Consume full tiles of input - while (block_offset + TILE_ITEMS <= block_oob) - { - ConsumeTile(block_offset, prefix_op); - block_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (block_offset < block_oob) - { - int valid_items = block_oob - block_offset; - ConsumeTile(block_offset, prefix_op, valid_items); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/scan_tiles_types.cuh b/kokkos/kokkos/TPL/cub/device/block/scan_tiles_types.cuh deleted file mode 100644 index 2b933d0..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/scan_tiles_types.cuh +++ /dev/null @@ -1,318 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Utility types for device-wide scan - */ - -#pragma once - -#include - -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Enumerations of tile status - */ -enum ScanTileStatus -{ - SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) - SCAN_TILE_INVALID, // Not yet processed - SCAN_TILE_PARTIAL, // Tile aggregate is available - SCAN_TILE_PREFIX, // Inclusive tile prefix is available -}; - - -/** - * Data type of tile status descriptor. - * - * Specialized for scan status and value types that can be combined into the same - * machine word that can be read/written coherently in a single access. - */ -template < - typename T, - bool SINGLE_WORD = (PowerOfTwo::VALUE && (sizeof(T) <= 8))> -struct ScanTileDescriptor -{ - // Status word type - typedef typename If<(sizeof(T) == 8), - long long, - typename If<(sizeof(T) == 4), - int, - typename If<(sizeof(T) == 2), - short, - char>::Type>::Type>::Type StatusWord; - - // Vector word type - typedef typename If<(sizeof(T) == 8), - longlong2, - typename If<(sizeof(T) == 4), - int2, - typename If<(sizeof(T) == 2), - int, - short>::Type>::Type>::Type VectorWord; - - T value; - StatusWord status; - - static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix) - { - ScanTileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PREFIX; - tile_descriptor.value = prefix; - - VectorWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(reinterpret_cast(ptr), alias); - } - - static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial) - { - ScanTileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PARTIAL; - tile_descriptor.value = partial; - - VectorWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(reinterpret_cast(ptr), alias); - } - - static __device__ __forceinline__ void WaitForValid( - ScanTileDescriptor *ptr, - int &status, - T &value) - { - ScanTileDescriptor tile_descriptor; - while (true) - { - VectorWord alias = ThreadLoad(reinterpret_cast(ptr)); - - tile_descriptor = *reinterpret_cast(&alias); - if (tile_descriptor.status != SCAN_TILE_INVALID) break; - - __threadfence_block(); - } - - status = tile_descriptor.status; - value = tile_descriptor.value; - } - -}; - - -/** - * Data type of tile status descriptor. - * - * Specialized for scan status and value types that cannot fused into - * the same machine word. 
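For value types too wide to pack beside the status flag in one machine word, the descriptor specialization that follows has to rely on ordering instead: the value is stored first, a fence prevents reordering, and only then is the status word flipped, while the reader polls the status before touching the value. A host-side sketch of the same publish/poll protocol using C++ atomics; release/acquire ordering here plays the role of the device-side fences, and the names are illustrative.

#include <atomic>
#include <cstdio>
#include <thread>

// Illustrative only: publish a payload that cannot share a machine word with
// its status flag. The writer stores the value, then sets the status with
// release ordering; the reader polls the status with acquire ordering, so the
// value is guaranteed to be visible once the status says it is valid.
enum Status : int { STATUS_INVALID, STATUS_PREFIX };

struct Descriptor
{
    double           value;                    // too wide to fuse with the status word
    std::atomic<int> status{STATUS_INVALID};
};

int main()
{
    Descriptor tile;

    std::thread writer([&] {
        tile.value = 42.0;                                            // 1. store payload
        tile.status.store(STATUS_PREFIX, std::memory_order_release);  // 2. then flip status
    });

    std::thread reader([&] {
        while (tile.status.load(std::memory_order_acquire) != STATUS_PREFIX)
            ;                                                         // spin until valid
        std::printf("prefix = %f\n", tile.value);                     // safe to read now
    });

    writer.join();
    reader.join();
    return 0;
}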
- */ -template -struct ScanTileDescriptor -{ - T prefix_value; - T partial_value; - - /// Workaround for the fact that win32 doesn't guarantee 16B alignment 16B values of T - union - { - int status; - Uninitialized padding; - }; - - static __device__ __forceinline__ void SetPrefix(ScanTileDescriptor *ptr, T prefix) - { - ThreadStore(&ptr->prefix_value, prefix); - __threadfence_block(); -// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordeing - ThreadStore(&ptr->status, (int) SCAN_TILE_PREFIX); - - } - - static __device__ __forceinline__ void SetPartial(ScanTileDescriptor *ptr, T partial) - { - ThreadStore(&ptr->partial_value, partial); - __threadfence_block(); -// __threadfence(); // __threadfence_block seems sufficient on current architectures to prevent reordeing - ThreadStore(&ptr->status, (int) SCAN_TILE_PARTIAL); - } - - static __device__ __forceinline__ void WaitForValid( - ScanTileDescriptor *ptr, - int &status, - T &value) - { - while (true) - { - status = ThreadLoad(&ptr->status); - if (status != SCAN_TILE_INVALID) break; - - __threadfence_block(); - } - - value = (status == SCAN_TILE_PARTIAL) ? - ThreadLoad(&ptr->partial_value) : - ThreadLoad(&ptr->prefix_value); - } -}; - - -/** - * Stateful prefix functor that provides the the running prefix for - * the current tile by using the callback warp to wait on on - * aggregates/prefixes from predecessor tiles to become available - */ -template < - typename T, - typename ScanOp> -struct DeviceScanBlockPrefixOp -{ - // Parameterized warp reduce - typedef WarpReduce WarpReduceT; - - // Storage type - typedef typename WarpReduceT::TempStorage _TempStorage; - - // Alias wrapper allowing storage to be unioned - typedef Uninitialized<_TempStorage> TempStorage; - - // Tile status descriptor type - typedef ScanTileDescriptor ScanTileDescriptorT; - - // Fields - ScanTileDescriptorT *d_tile_status; ///< Pointer to array of tile status - _TempStorage &temp_storage; ///< Reference to a warp-reduction instance - ScanOp scan_op; ///< Binary scan operator - int tile_idx; ///< The current tile index - T inclusive_prefix; ///< Inclusive prefix for the tile - - // Constructor - __device__ __forceinline__ - DeviceScanBlockPrefixOp( - ScanTileDescriptorT *d_tile_status, - TempStorage &temp_storage, - ScanOp scan_op, - int tile_idx) : - d_tile_status(d_tile_status), - temp_storage(temp_storage.Alias()), - scan_op(scan_op), - tile_idx(tile_idx) {} - - - // Block until all predecessors within the specified window have non-invalid status - __device__ __forceinline__ - void ProcessWindow( - int predecessor_idx, - int &predecessor_status, - T &window_aggregate) - { - T value; - ScanTileDescriptorT::WaitForValid(d_tile_status + predecessor_idx, predecessor_status, value); - - // Perform a segmented reduction to get the prefix for the current window - int flag = (predecessor_status != SCAN_TILE_PARTIAL); - window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(value, flag, scan_op); - } - - - // Prefix functor (called by the first warp) - __device__ __forceinline__ - T operator()(T block_aggregate) - { - // Update our status with our tile-aggregate - if (threadIdx.x == 0) - { - ScanTileDescriptorT::SetPartial(d_tile_status + tile_idx, block_aggregate); - } - - // Wait for the window of predecessor tiles to become valid - int predecessor_idx = tile_idx - threadIdx.x - 1; - int predecessor_status; - T window_aggregate; - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - - // The 
exclusive tile prefix starts out as the current window aggregate - T exclusive_prefix = window_aggregate; - - // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WarpAll(predecessor_status != SCAN_TILE_PREFIX)) - { - predecessor_idx -= PtxArchProps::WARP_THREADS; - - // Update exclusive tile prefix with the window prefix - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); - } - - // Compute the inclusive tile prefix and update the status for this tile - if (threadIdx.x == 0) - { - inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); - ScanTileDescriptorT::SetPrefix( - d_tile_status + tile_idx, - inclusive_prefix); - } - - // Return exclusive_prefix - return exclusive_prefix; - } -}; - - -// Running scan prefix callback type for single-block scans. -// Maintains a running prefix that can be applied to consecutive -// scan operations. -template -struct RunningBlockPrefixOp -{ - // Running prefix - T running_total; - - // Callback operator. - __device__ T operator()(T block_aggregate) - { - T old_prefix = running_total; - running_total += block_aggregate; - return old_prefix; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh b/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh deleted file mode 100644 index 5896dbc..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_gatomic.cuh +++ /dev/null @@ -1,184 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
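The single-word ScanTileDescriptor above fuses the status flag and the tile value into one machine word so that a predecessor's state can be published and observed with a single coherent load or store. The snippet below is a minimal standalone sketch of that packing for a 32-bit value; the names (PackedTile, set_tile, wait_valid) and the use of 64-bit atomics are illustrative assumptions, not CUB's ThreadLoad/ThreadStore machinery.

// Minimal standalone sketch (not CUB's API): a 32-bit status word and a 32-bit
// float value fused into one 64-bit descriptor, so publishing and spin-waiting
// each touch a single machine word.

#include <cuda_runtime.h>

enum TileStatus { TILE_INVALID = 0, TILE_PARTIAL = 1, TILE_PREFIX = 2 };

struct PackedTile
{
    unsigned long long word;   // low 32 bits: status, high 32 bits: bit-cast float value
};

__device__ __forceinline__ void set_tile(PackedTile *tile, int status, float value)
{
    unsigned long long packed =
        (static_cast<unsigned long long>(__float_as_uint(value)) << 32) |
        static_cast<unsigned long long>(static_cast<unsigned int>(status));

    // A single 64-bit store publishes value and status together; no fence is needed
    // between them because they can never be observed separately.
    atomicExch(&tile->word, packed);
}

__device__ __forceinline__ void wait_valid(PackedTile *tile, int &status, float &value)
{
    unsigned long long packed;
    do
    {
        packed = atomicAdd(&tile->word, 0ULL);          // coherent read of the whole descriptor
        status = static_cast<int>(packed & 0xFFFFFFFFull);
    } while (status == TILE_INVALID);

    value = __uint_as_float(static_cast<unsigned int>(packed >> 32));
}

The two-word fallback specialization, by contrast, has to publish the value first, fence, and only then flip the status word, because its two halves could otherwise be observed out of order.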
- * - ******************************************************************************/ - -/** - * \file - * cub::BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. - */ - -#pragma once - -#include - -#include "../../../util_type.cuh" -#include "../../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * BlockHistogramTilesGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics - */ -template < - typename BlockHistogramTilesPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type for offsets -struct BlockHistogramTilesGlobalAtomic -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - // Shared memory type required by this thread block - typedef NullType TempStorage; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIteratorRA d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockHistogramTilesGlobalAtomic( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - d_in(d_in), - d_out_histograms(d_out_histograms) - {} - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - SizeT block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < 
ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - } - else - { - // Only a partially-full tile of samples to read and composite - int bounds = valid_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(d_out_histograms[CHANNEL] + item, 1); - } - } - } - - } - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - {} -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh b/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh deleted file mode 100644 index c55d789..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_satomic.cuh +++ /dev/null @@ -1,237 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
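BlockHistogramTilesGlobalAtomic composites every sample straight into the output histograms with global atomicAdd, which is why its AggregateOutput is empty and it needs no shared memory. Below is a reduced, single-channel sketch of the same strategy as a grid-stride kernel; the kernel name and launch shape are illustrative and do not reproduce the deleted code's per-tile unrolling.

// Reduced sketch of the global-atomic strategy for one channel: every sample is
// composited directly into the output histogram with a global atomicAdd, so no
// privatized copies or aggregation pass are required. Illustrative, not CUB's kernel.

__global__ void histo_global_atomic(const unsigned char *d_samples,
                                    unsigned int        *d_histogram,   // BINS counters, pre-zeroed
                                    int                  num_samples)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < num_samples;
         i += gridDim.x * blockDim.x)
    {
        atomicAdd(d_histogram + d_samples[i], 1u);
    }
}

The trade-off noted in the DeviceHistogram documentation applies here: low-diversity input funnels many atomics onto a few hot bins and serializes them, which is what the shared-atomic and sort-based variants are meant to mitigate.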
- * - ******************************************************************************/ - -/** - * \file - * cub::BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics - */ - -#pragma once - -#include - -#include "../../../util_type.cuh" -#include "../../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * BlockHistogramTilesSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics - */ -template < - typename BlockHistogramTilesPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that can be cast as an integer in the range [0..BINS-1] - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type for offsets -struct BlockHistogramTilesSharedAtomic -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIteratorRA d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockHistogramTilesSharedAtomic( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out_histograms(d_out_histograms) - { - // Initialize histogram bin counts to zeros - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; - } - // Finish up with guarded 
initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) - { - this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; - } - } - } - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - SizeT block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - - __threadfence_block(); - } - else - { - // Only a partially-full tile of samples to read and composite - int bounds = valid_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); - } - } - } - - } - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - { - // Barrier to ensure shared memory histograms are coherent - __syncthreads(); - - // Copy shared memory histograms to output - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_offset = (blockIdx.x * BINS); - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) - { - d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; - } - } - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh b/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh deleted file mode 100644 index 0f82130..0000000 --- a/kokkos/kokkos/TPL/cub/device/block/specializations/block_histo_tiles_sort.cuh +++ /dev/null @@ -1,364 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
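BlockHistogramTilesSharedAtomic privatizes one histogram per active channel in shared memory (with one word of padding to keep warps on different channels off the same bank) and copies each block's counts out in AggregateOutput. The sketch below condenses that pattern to a single channel and folds the copy-out into a final atomicAdd; the deleted code instead writes per-block histograms that AggregateHistoKernel reduces later. The kernel name and the 256-bin constant are illustrative assumptions.

// Condensed sketch of shared-memory privatization for one channel: each block builds
// its own histogram in shared memory with cheap shared atomics, then flushes its
// totals to the global histogram once.

#define SKETCH_BINS 256

__global__ void histo_shared_atomic(const unsigned char *d_samples,
                                    unsigned int        *d_histogram,   // SKETCH_BINS counters, pre-zeroed
                                    int                  num_samples)
{
    __shared__ unsigned int block_histo[SKETCH_BINS];

    // Zero the privatized histogram (BINS may exceed blockDim.x, hence the stride loop)
    for (int bin = threadIdx.x; bin < SKETCH_BINS; bin += blockDim.x)
        block_histo[bin] = 0;
    __syncthreads();

    // Composite samples into shared memory
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < num_samples;
         i += gridDim.x * blockDim.x)
    {
        atomicAdd(&block_histo[d_samples[i]], 1u);
    }
    __syncthreads();

    // One global atomic per (block, bin) instead of one per sample
    for (int bin = threadIdx.x; bin < SKETCH_BINS; bin += blockDim.x)
        atomicAdd(&d_histogram[bin], block_histo[bin]);
}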
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting - */ - -#pragma once - -#include - -#include "../../../block/block_radix_sort.cuh" -#include "../../../block/block_discontinuity.cuh" -#include "../../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * BlockHistogramTilesSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting - */ -template < - typename BlockHistogramTilesPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). 
Must have a value type that can be cast as an integer in the range [0..BINS-1] - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type for offsets -struct BlockHistogramTilesSort -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - - STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS, - }; - - // Parameterize BlockRadixSort type for our thread block - typedef BlockRadixSort BlockRadixSortT; - - // Parameterize BlockDiscontinuity type for our thread block - typedef BlockDiscontinuity BlockDiscontinuityT; - - /// Shared memory type required by this thread block - union _TempStorage - { - // Storage for sorting bin values - typename BlockRadixSortT::TempStorage sort; - - struct - { - // Storage for detecting discontinuities in the tile of sorted bin values - typename BlockDiscontinuityT::TempStorage flag; - - // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values - int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; - int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Discontinuity functor - struct DiscontinuityOp - { - // Reference to temp_storage - _TempStorage &temp_storage; - - // Constructor - __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : - temp_storage(temp_storage) - {} - - // Discontinuity predicate - __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index) - { - if (a != b) - { - // Note the begin/end offsets in shared storage - temp_storage.run_begin[b] = b_index; - temp_storage.run_end[a] = b_index; - - return true; - } - else - { - return false; - } - } - }; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Histogram counters striped across threads - HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIteratorRA d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockHistogramTilesSort( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorRA d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out_histograms(d_out_histograms) - { - // Initialize histogram counters striped across threads - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - #pragma unroll - for (int COUNTER = 0; COUNTER < 
STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - thread_counters[CHANNEL][COUNTER] = 0; - } - } - } - - - /** - * Composite a tile of input items - */ - __device__ __forceinline__ void Composite( - SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples - HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads - { - // Sort bytes in blocked arrangement - BlockRadixSortT(temp_storage.sort).Sort(items); - - __syncthreads(); - - // Initialize the shared memory's run_begin and run_end for each bin - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; - temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; - } - - __syncthreads(); - - // Note the begin/end run offsets of bin runs in the sorted tile - int flags[ITEMS_PER_THREAD]; // unused - DiscontinuityOp flag_op(temp_storage); - BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); - - // Update begin for first item - if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0; - - __syncthreads(); - - // Composite into histogram - // Initialize the shared memory's run_begin and run_end for each bin - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; - HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin]; - - thread_counters[COUNTER] += run_length; - } - } - - - /** - * Process one channel within a tile. - */ - template - __device__ __forceinline__ void ConsumeTileChannel( - int channel, - SizeT block_offset, - int valid_items) - { - // Load items in striped fashion - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD]; - - // Unguarded loads - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; - } - - // Composite our histogram data - Composite(items, thread_counters[channel]); - } - else - { - // Only a partially-full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD]; - - // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later - int bounds = (valid_items - (threadIdx.x * CHANNELS)); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? - d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : - 0; - } - - // Composite our histogram data - Composite(items, thread_counters[channel]); - - __syncthreads(); - - // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items - if (threadIdx.x == 0) - { - int extra = (TILE_ITEMS - valid_items) / CHANNELS; - thread_counters[channel][0] -= extra; - } - } - } - - - /** - * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. - */ - template - struct IterateChannels - { - /** - * Process one channel within a tile. 
- */ - static __device__ __forceinline__ void ConsumeTileChannel( - BlockHistogramTilesSort *cta, - SizeT block_offset, - int valid_items) - { - __syncthreads(); - - cta->ConsumeTileChannel(CHANNEL, block_offset, valid_items); - - IterateChannels::ConsumeTileChannel(cta, block_offset, valid_items); - } - }; - - - /** - * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. - */ - template - struct IterateChannels - { - static __device__ __forceinline__ void ConsumeTileChannel(BlockHistogramTilesSort *cta, SizeT block_offset, int valid_items) {} - }; - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - SizeT block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - // First channel - ConsumeTileChannel(0, block_offset, valid_items); - - // Iterate through remaining channels - IterateChannels::ConsumeTileChannel(this, block_offset, valid_items); - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - { - // Copy counters striped across threads into the histogram output - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_offset = (blockIdx.x * BINS); - - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; - - if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS)) - { - d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; - } - } - } - } -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/device/device_histogram.cuh b/kokkos/kokkos/TPL/cub/device/device_histogram.cuh deleted file mode 100644 index 6f5a74d..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_histogram.cuh +++ /dev/null @@ -1,1062 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
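BlockHistogramTilesSort avoids atomics altogether: each tile is radix-sorted within the block, BlockDiscontinuity flags where runs of equal bin values begin and end, and a bin's count is simply its run length; out-of-bounds samples in a partial tile are parked in bin 0 and subtracted back out afterwards. The function below is a host-side analogue of that counting-by-runs idea, written only to illustrate the technique; it is not part of the deleted kernels.

// Host-side analogue (not CUB code) of the sort-then-count-runs idea used by
// BlockHistogramTilesSort: after sorting a tile, each bin's count is the length
// of that bin's contiguous run, so no atomic collisions can occur.

#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<int> histogram_by_runs(std::vector<unsigned char> tile, int bins)
{
    std::sort(tile.begin(), tile.end());            // BlockRadixSort plays this role on the GPU

    std::vector<int> histo(bins, 0);
    std::size_t run_begin = 0;
    for (std::size_t i = 1; i <= tile.size(); ++i)
    {
        // A discontinuity (or the end of the tile) closes the current run
        if (i == tile.size() || tile[i] != tile[run_begin])
        {
            histo[tile[run_begin]] += static_cast<int>(i - run_begin);
            run_begin = i;
        }
    }
    return histo;
}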
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from samples data residing within global memory. - */ - -#pragma once - -#include -#include - -#include "block/block_histo_tiles.cuh" -#include "../grid/grid_even_share.cuh" -#include "../grid/grid_queue.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Initialization pass kernel entry point (multi-block). Prepares queue descriptors zeroes global counters. - */ -template < - int BINS, ///< Number of histogram bins per channel - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SizeT, ///< Integer type used for global array indexing - typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin -__launch_bounds__ (BINS, 1) -__global__ void InitHistoKernel( - GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS] - SizeT num_samples) ///< [in] Total number of samples \p d_samples for all channels -{ - d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; - if (threadIdx.x == 0) grid_queue.ResetDrain(num_samples); -} - - -/** - * Histogram pass kernel entry point (multi-block). Computes privatized histograms, one per thread block. - */ -template < - typename BlockHistogramTilesPolicy, ///< Tuning policy for cub::BlockHistogramTiles abstraction - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockHistogramTilesPolicy::BLOCK_THREADS), BlockHistogramTilesPolicy::SM_OCCUPANCY) -__global__ void MultiBlockHistogramKernel( - InputIteratorRA d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). 
- ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS] - SizeT num_samples, ///< [in] Total number of samples \p d_samples for all channels - GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks - GridQueue queue) ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks -{ - // Constants - enum - { - BLOCK_THREADS = BlockHistogramTilesPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockHistogramTilesPolicy::ITEMS_PER_THREAD, - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Thread block type for compositing input tiles - typedef BlockHistogramTiles BlockHistogramTilesT; - - // Shared memory for BlockHistogramTiles - __shared__ typename BlockHistogramTilesT::TempStorage temp_storage; - - // Consume input tiles - BlockHistogramTilesT(temp_storage, d_samples, d_out_histograms.array).ConsumeTiles( - num_samples, - even_share, - queue, - Int2Type()); -} - - -/** - * Block-aggregation pass kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass. - */ -template < - int BINS, ///< Number of histogram bins per channel - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin -__launch_bounds__ (BINS, 1) -__global__ void AggregateHistoKernel( - HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS] - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS] - int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms -{ - // Accumulate threadblock-histograms from the channel - HistoCounter bin_aggregate = 0; - - int block_offset = blockIdx.x * (num_threadblocks * BINS); - int block_oob = block_offset + (num_threadblocks * BINS); - -#if CUB_PTX_ARCH >= 200 - #pragma unroll 32 -#endif - while (block_offset < block_oob) - { - bin_aggregate += d_block_histograms[block_offset + threadIdx.x]; - block_offset += BINS; - } - - // Output - d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * DeviceHistogram - *****************************************************************************/ - -/** - * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from samples data residing within global memory. ![](histogram_logo.png) - * \ingroup DeviceModule - * - * \par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * \par Usage Considerations - * \cdp_class{DeviceHistogram} - * - * \par Performance - * - * \image html histo_perf.png - * - */ -struct DeviceHistogram -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockHistogramTilesPolicy. 
- struct KernelDispachParams - { - // Policy fields - int block_threads; - int items_per_thread; - BlockHistogramTilesAlgorithm block_algorithm; - GridMappingStrategy grid_mapping; - int subscription_factor; - - // Derived fields - int channel_tile_size; - - template - __host__ __device__ __forceinline__ - void Init(int subscription_factor = 1) - { - block_threads = BlockHistogramTilesPolicy::BLOCK_THREADS; - items_per_thread = BlockHistogramTilesPolicy::ITEMS_PER_THREAD; - block_algorithm = BlockHistogramTilesPolicy::GRID_ALGORITHM; - grid_mapping = BlockHistogramTilesPolicy::GRID_MAPPING; - this->subscription_factor = subscription_factor; - - channel_tile_size = block_threads * items_per_thread; - } - - __host__ __device__ __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - block_algorithm, - grid_mapping, - subscription_factor); - } - - }; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// Specializations of tuned policy types for different PTX architectures - template < - int CHANNELS, - int ACTIVE_CHANNELS, - BlockHistogramTilesAlgorithm GRID_ALGORITHM, - int ARCH> - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - typedef BlockHistogramTilesPolicy< - (GRID_ALGORITHM == GRID_HISTO_SORT) ? 128 : 256, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS), - GRID_ALGORITHM, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? 8 : 1> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 7 }; - }; - - /// SM30 tune - template - struct TunedPolicies - { - typedef BlockHistogramTilesPolicy< - 128, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS), - GRID_ALGORITHM, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE, - 1> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - /// SM20 tune - template - struct TunedPolicies - { - typedef BlockHistogramTilesPolicy< - 128, - (GRID_ALGORITHM == GRID_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS), - GRID_ALGORITHM, - GRID_MAPPING_DYNAMIC, - 1> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - /// SM10 tune - template - struct TunedPolicies - { - typedef BlockHistogramTilesPolicy< - 128, - 7, - GRID_HISTO_SORT, // (use sort regardless because atomics are perf-useless) - GRID_MAPPING_EVEN_SHARE, - 1> MultiBlockPolicy; - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - - /// Tuning policy for the PTX architecture that DeviceHistogram operations will get dispatched to - template < - int CHANNELS, - int ACTIVE_CHANNELS, - BlockHistogramTilesAlgorithm GRID_ALGORITHM> - struct PtxDefaultPolicies - { - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 300) ? - 300 : - (CUB_PTX_ARCH >= 200) ? 
- 200 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // Subscription factor for the current PTX compiler pass - static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR; - - // MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {}; - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - } - else if (ptx_version >= 300) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - } - else - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - } - } - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine for invoking device-wide, multi-channel, histogram - */ - template < - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InitHistoKernelPtr, ///< Function type of cub::InitHistoKernel - typename MultiBlockHistogramKernelPtr, ///< Function type of cub::MultiBlockHistogramKernel - typename AggregateHistoKernelPtr, ///< Function type of cub::AggregateHistoKernel - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integral type for counting sample occurrences per histogram bin - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitHistoKernel - MultiBlockHistogramKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockHistogramKernel - AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::AggregateHistoKernel - KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for - InputIteratorRA d_samples, ///< [in] Input samples to histogram - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. 
- SizeT num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - int multi_block_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / multi_block_dispatch_params.block_threads); - -#ifndef __CUDA_ARCH__ - // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - multi_block_sm_occupancy, - multi_block_kernel, - multi_block_dispatch_params.block_threads))) break; -#endif - - // Get device occupancy for multi_block_kernel - int multi_block_occupancy = multi_block_sm_occupancy * sm_count; - - // Even-share work distribution - GridEvenShare even_share; - - // Get tile size for multi_block_kernel - int multi_block_tile_size = multi_block_dispatch_params.channel_tile_size * CHANNELS; - - // Get grid size for multi_block_kernel - int multi_block_grid_size; - switch (multi_block_dispatch_params.grid_mapping) - { - case GRID_MAPPING_EVEN_SHARE: - - // Work is distributed evenly - even_share.GridInit( - num_samples, - multi_block_occupancy * multi_block_dispatch_params.subscription_factor, - multi_block_tile_size); - multi_block_grid_size = even_share.grid_size; - break; - - case GRID_MAPPING_DYNAMIC: - - // Work is distributed dynamically - int num_tiles = (num_samples + multi_block_tile_size - 1) / multi_block_tile_size; - multi_block_grid_size = (num_tiles < multi_block_occupancy) ? 
- num_tiles : // Not enough to fill the device with threadblocks - multi_block_occupancy; // Fill the device with threadblocks - break; - }; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - ACTIVE_CHANNELS * multi_block_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Privatized per-block reductions - HistoCounter *d_block_histograms = (HistoCounter*) allocations[0]; - - // Grid queue descriptor - GridQueue queue(allocations[1]); - - // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; - - // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_temp_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * multi_block_grid_size * BINS); - - // Log init_kernel configuration - if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream); - - // Invoke init_kernel to initialize counters and queue descriptor - init_kernel<<>>(queue, d_histo_wrapper, num_samples); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Whether we need privatized histograms (i.e., non-global atomics and multi-block) - bool privatized_temporaries = (multi_block_grid_size > 1) && (multi_block_dispatch_params.block_algorithm != GRID_HISTO_GLOBAL_ATOMIC); - - // Log multi_block_kernel configuration - if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_block_sm_occupancy); - - // Invoke multi_block_kernel - multi_block_kernel<<>>( - d_samples, - (privatized_temporaries) ? 
- d_temp_histo_wrapper : - d_histo_wrapper, - num_samples, - even_share, - queue); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Aggregate privatized block histograms if necessary - if (privatized_temporaries) - { - // Log aggregate_kernel configuration - if (stream_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n", - ACTIVE_CHANNELS, BINS, (long long) stream); - - // Invoke aggregate_kernel - aggregate_kernel<<>>( - d_block_histograms, - d_histo_wrapper, - multi_block_grid_size); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * \brief Computes a device-wide histogram - * - * \tparam GRID_ALGORITHM cub::BlockHistogramTilesAlgorithm enumerator specifying the underlying algorithm to use - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that is assignable to unsigned char - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - BlockHistogramTilesAlgorithm GRID_ALGORITHM, - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIteratorRA, ///< The input iterator type (may be a simple pointer type). Must have a value type that is assignable to unsigned char - typename HistoCounter> ///< Integral type for counting sample occurrences per histogram bin - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples to histogram - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
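The Dispatch routine above sizes and carves a single caller-provided d_temp_storage blob into the privatized histograms and the GridQueue descriptor via AliasTemporaries, and treats a NULL blob as a size-only query. The helper below is a hedged sketch of that convention for exactly two sub-allocations, with an assumed 256-byte alignment; it is not the real AliasTemporaries utility.

// Hedged sketch of the temporary-storage convention used by Dispatch: one device
// blob is partitioned into the sub-allocations the kernels need, and a NULL blob
// means "just report the required size".

#include <cstddef>
#include <cuda_runtime.h>

inline cudaError_t carve_temporaries(void         *d_temp_storage,
                                     size_t       &temp_storage_bytes,
                                     void         *(&allocations)[2],
                                     const size_t (&allocation_sizes)[2])
{
    const size_t ALIGN = 256;                       // illustrative alignment
    size_t offsets[2];
    size_t total = 0;
    for (int i = 0; i < 2; ++i)
    {
        offsets[i] = total;
        total += (allocation_sizes[i] + ALIGN - 1) / ALIGN * ALIGN;
    }

    if (d_temp_storage == NULL)                     // size-query call
    {
        temp_storage_bytes = total;
        return cudaSuccess;
    }
    if (temp_storage_bytes < total)                 // caller under-allocated
        return cudaErrorInvalidValue;

    for (int i = 0; i < 2; ++i)
        allocations[i] = static_cast<char*>(d_temp_storage) + offsets[i];
    return cudaSuccess;
}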
- { - // Type used for array indexing - typedef int SizeT; - - // Tuning polices for the PTX architecture that will get dispatched to - typedef PtxDefaultPolicies PtxDefaultPolicies; - typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams multi_block_dispatch_params; - - #ifdef __CUDA_ARCH__ - - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - multi_block_dispatch_params.Init(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); - - #else - - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params); - - #endif - - Dispatch( - d_temp_storage, - temp_storage_bytes, - InitHistoKernel, - MultiBlockHistogramKernel, - AggregateHistoKernel, - multi_block_dispatch_params, - d_samples, - d_histograms, - num_samples, - stream, - stream_synchronous); - - if (CubDebug(error)) break; - } - while (0); - - return error; - } - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - - /******************************************************************//** - * \name Single-channel samples - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide histogram. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the computation of a 256-bin histogram of - * single-channel unsigned char samples. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and 256-bin output histogram - * unsigned char *d_samples; - * unsigned int *d_histogram; - * int num_items = ... - * ... 
- * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histogram - * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t SingleChannelSorting( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide histogram. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the computation of a 256-bin histogram of - * single-channel unsigned char samples. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and 256-bin output histogram - * unsigned char *d_samples; - * unsigned int *d_histogram; - * int num_items = ... - * ... 
- * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::SingleChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histogram - * cub::DeviceHistogram::SingleChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t SingleChannelSharedAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide histogram. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded. - * - * Performance is not significantly impacted when computing histograms having large - * numbers of bins (e.g., thousands). - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the computation of a 256-bin histogram of - * single-channel unsigned char samples. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and 256-bin output histogram - * unsigned char *d_samples; - * unsigned int *d_histogram; - * int num_items = ... - * ... 
- * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::SingleChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histogram - * cub::DeviceHistogram::SingleChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t SingleChannelGlobalAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, &d_histogram, num_samples, stream, stream_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Interleaved multi-channel samples - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide histogram from multi-channel data. Uses fast block-sorting to compute the histogram. Delivers consistent throughput regardless of sample diversity, but occupancy may be limited by histogram bin count. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - * - * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the computation of three 256-bin histograms from - * interleaved quad-channel unsigned char samples (e.g., RGB histograms from RGBA samples). - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and - * // three 256-bin output histograms - * unsigned char *d_samples; - * unsigned int *d_histograms[3]; - * int num_items = ...
- * ... - * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiChannelSorting<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t MultiChannelSorting( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide histogram from multi-channel data. Uses shared-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded, and occupancy may be limited by histogram bin count. - * - * However, because histograms are privatized in shared memory, a large - * number of bins (e.g., thousands) may adversely affect occupancy and - * performance (or even the ability to launch). - * - * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. 
- * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the computation of three 256-bin histograms from - * interleaved quad-channel unsigned char samples (e.g., RGB histograms from RGBA samples). - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and - * // three 256-bin output histograms - * unsigned char *d_samples; - * unsigned int *d_histograms[3]; - * int num_items = ... - * ... - * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiChannelSharedAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t MultiChannelSharedAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide histogram from multi-channel data. Uses global-memory atomic read-modify-write operations to compute the histogram. Input samples having lower diversity can cause performance to be degraded. 
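The global-atomic strategy amounts to each thread issuing an atomic increment directly on its sample's output bin, whereas the shared-atomic variants above first accumulate into a privatized __shared__ histogram and flush it to global memory once per block. A minimal single-channel CUDA sketch of the global-atomic idea (illustrative only, not CUB's tuned kernel; the kernel name is made up):

    __global__ void GlobalAtomicHistogramSketch(
        const unsigned char *d_samples,     // one 8-bit sample per element
        unsigned int        *d_histogram,   // 256 counters, zero-initialized by the caller
        int                  num_samples)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < num_samples)
            atomicAdd(d_histogram + d_samples[i], 1u);   // read-modify-write straight to global memory
    }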
- * - * Performance is not significantly impacted when computing histograms having large - * numbers of bins (e.g., thousands). - * - * The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * Performance is often improved when referencing input samples through a texture-caching iterator, e.g., cub::TexIteratorRA or cub::TexTransformIteratorRA. - * - * \par - * The code snippet below illustrates the computation of three 256-bin histograms from - * interleaved quad-channel unsigned char samples (e.g., RGB histograms from RGBA samples). - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input samples and - * // three 256-bin output histograms - * unsigned char *d_samples; - * unsigned int *d_histograms[3]; - * int num_items = ... - * ... - * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexIteratorRA d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_items * sizeof(unsigned char)); - * - * // Determine temporary device storage requirements for histogram computation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Allocate temporary storage for histogram computation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiChannelGlobalAtomic<256>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_items); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * \endcode - * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) Must have a value type that can be cast as an integer in the range [0..BINS-1] - * \tparam HistoCounter [inferred] Integral type for counting sample occurrences per histogram bin - */ - template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIteratorRA, - typename HistoCounter> - __host__ __device__ __forceinline__ - static cudaError_t MultiChannelGlobalAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_samples, ///< [in] Input samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histogram counter arrays, each having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
May cause significant slowdown. Default is \p false. - { - return Dispatch( - d_temp_storage, temp_storage_bytes, d_samples, d_histograms, num_samples, stream, stream_synchronous); - } - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/device/device_radix_sort.cuh b/kokkos/kokkos/TPL/cub/device/device_radix_sort.cuh deleted file mode 100644 index 087d546..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_radix_sort.cuh +++ /dev/null @@ -1,890 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides operations for computing a device-wide, parallel reduction across data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "block/block_radix_sort_upsweep_tiles.cuh" -#include "block/block_radix_sort_downsweep_tiles.cuh" -#include "block/block_scan_tiles.cuh" -#include "../grid/grid_even_share.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block. 
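Before the individual kernel entry points, it may help to keep the overall shape of a least-significant-digit pass in mind. The sketch below is illustrative only; the dispatch logic further down also handles the alternate bit granularity, shared-memory bank configuration, and temp-storage aliasing:

    // Illustrative only: the shape of the per-digit pipeline run by the dispatch code below.
    inline void RadixSortPassStructureSketch(int begin_bit, int end_bit, int radix_bits)
    {
        for (int current_bit = begin_bit; current_bit < end_bit; current_bit += radix_bits)
        {
            // 1) Upsweep: each thread block histograms the current digit of its tiles and writes
            //    the counts to d_spine in a striped layout (all blocks' 0-counts, then all
            //    blocks' 1-counts, and so on).
            // 2) Spine scan: a single block runs an exclusive prefix sum over d_spine, turning
            //    the striped digit counts into global scatter offsets.
            // 3) Downsweep: each block re-reads its tiles and scatters keys (and values) to
            //    their digit bins at the scanned offsets, writing into the "pong" buffers.
            // The key/value DoubleBuffer selectors are then flipped for the next pass.
        }
    }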
- */ -template < - typename BlockRadixSortUpsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction - typename Key, ///< Key type - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockRadixSortUpsweepTilesPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortUpsweepKernel( - Key *d_keys, ///< [in] Input keys buffer - SizeT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - SizeT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity) - bool first_pass, ///< [in] Whether this is the first digit pass - GridEvenShare even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks -{ - - // Alternate policy for when fewer bits remain - typedef typename BlockRadixSortUpsweepTilesPolicy::AltPolicy AltPolicy; - - // Parameterize two versions of BlockRadixSortUpsweepTiles type for the current configuration - typedef BlockRadixSortUpsweepTiles BlockRadixSortUpsweepTilesT; // Primary - typedef BlockRadixSortUpsweepTiles AltBlockRadixSortUpsweepTilesT; // Alternate (smaller bit granularity) - - // Shared memory storage - __shared__ union - { - typename BlockRadixSortUpsweepTilesT::TempStorage pass_storage; - typename AltBlockRadixSortUpsweepTilesT::TempStorage alt_pass_storage; - } temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.BlockInit(); - - // Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit) - if (use_primary_bit_granularity) - { - // Primary granularity - SizeT bin_count; - BlockRadixSortUpsweepTilesT(temp_storage.pass_storage, d_keys, current_bit).ProcessTiles( - even_share.block_offset, - even_share.block_oob, - bin_count); - - // Write out digit counts (striped) - if (threadIdx.x < BlockRadixSortUpsweepTilesT::RADIX_DIGITS) - { - d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count; - } - } - else - { - // Alternate granularity - // Process input tiles (each of the first RADIX_DIGITS threads will compute a count for that digit) - SizeT bin_count; - AltBlockRadixSortUpsweepTilesT(temp_storage.alt_pass_storage, d_keys, current_bit).ProcessTiles( - even_share.block_offset, - even_share.block_oob, - bin_count); - - // Write out digit counts (striped) - if (threadIdx.x < AltBlockRadixSortUpsweepTilesT::RADIX_DIGITS) - { - d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count; - } - } -} - - -/** - * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms - */ -template < - typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortScanKernel( - SizeT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- int num_counts) ///< [in] Total number of bin-counts -{ - // Parameterize the BlockScanTiles type for the current configuration - typedef BlockScanTiles BlockScanTilesT; - - // Shared memory storage - __shared__ typename BlockScanTilesT::TempStorage temp_storage; - - // Block scan instance - BlockScanTilesT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), SizeT(0)) ; - - // Process full input tiles - int block_offset = 0; - RunningBlockPrefixOp prefix_op; - prefix_op.running_total = 0; - while (block_offset < num_counts) - { - block_scan.ConsumeTile(block_offset, prefix_op); - block_offset += BlockScanTilesT::TILE_ITEMS; - } -} - - -/** - * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. - */ -template < - typename BlockRadixSortDownsweepTilesPolicy, ///< Tuning policy for cub::BlockRadixSortUpsweepTiles abstraction - typename Key, ///< Key type - typename Value, ///< Value type - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockRadixSortDownsweepTilesPolicy::BLOCK_THREADS)) -__global__ void RadixSortDownsweepKernel( - Key *d_keys_in, ///< [in] Input keys ping buffer - Key *d_keys_out, ///< [in] Output keys pong buffer - Value *d_values_in, ///< [in] Input values ping buffer - Value *d_values_out, ///< [in] Output values pong buffer - SizeT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - SizeT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - bool use_primary_bit_granularity, ///< [in] Whether nor not to use the primary policy (or the embedded alternate policy for smaller bit granularity) - bool first_pass, ///< [in] Whether this is the first digit pass - bool last_pass, ///< [in] Whether this is the last digit pass - GridEvenShare even_share) ///< [in] Descriptor for how to map an even-share of tiles across thread blocks -{ - - // Alternate policy for when fewer bits remain - typedef typename BlockRadixSortDownsweepTilesPolicy::AltPolicy AltPolicy; - - // Parameterize two versions of BlockRadixSortDownsweepTiles type for the current configuration - typedef BlockRadixSortDownsweepTiles BlockRadixSortDownsweepTilesT; - typedef BlockRadixSortDownsweepTiles AltBlockRadixSortDownsweepTilesT; - - // Shared memory storage - __shared__ union - { - typename BlockRadixSortDownsweepTilesT::TempStorage pass_storage; - typename AltBlockRadixSortDownsweepTilesT::TempStorage alt_pass_storage; - - } temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.BlockInit(); - - if (use_primary_bit_granularity) - { - // Process input tiles - BlockRadixSortDownsweepTilesT(temp_storage.pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles( - even_share.block_offset, - even_share.block_oob); - } - else - { - // Process input tiles - AltBlockRadixSortDownsweepTilesT(temp_storage.alt_pass_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit).ProcessTiles( - even_share.block_offset, - even_share.block_oob); - } -} - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - - - -/****************************************************************************** - * DeviceRadixSort - *****************************************************************************/ - -/** - * \brief DeviceRadixSort provides 
operations for computing a device-wide, parallel radix sort across data items residing within global memory. ![](sorting_logo.png) - * \ingroup DeviceModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - * \par Usage Considerations - * \cdp_class{DeviceRadixSort} - * - * \par Performance - * - * \image html lsd_sort_perf.png - * - */ -struct DeviceRadixSort -{ - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties codified in block policy. - struct KernelDispachParams - { - int block_threads; - int items_per_thread; - cudaSharedMemConfig smem_config; - int radix_bits; - int alt_radix_bits; - int subscription_factor; - int tile_size; - - template - __host__ __device__ __forceinline__ - void InitUpsweepPolicy(int subscription_factor = 1) - { - block_threads = SortBlockPolicy::BLOCK_THREADS; - items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD; - radix_bits = SortBlockPolicy::RADIX_BITS; - alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS; - smem_config = cudaSharedMemBankSizeFourByte; - this->subscription_factor = subscription_factor; - tile_size = block_threads * items_per_thread; - } - - template - __host__ __device__ __forceinline__ - void InitScanPolicy() - { - block_threads = ScanBlockPolicy::BLOCK_THREADS; - items_per_thread = ScanBlockPolicy::ITEMS_PER_THREAD; - radix_bits = 0; - alt_radix_bits = 0; - smem_config = cudaSharedMemBankSizeFourByte; - subscription_factor = 0; - tile_size = block_threads * items_per_thread; - } - - template - __host__ __device__ __forceinline__ - void InitDownsweepPolicy(int subscription_factor = 1) - { - block_threads = SortBlockPolicy::BLOCK_THREADS; - items_per_thread = SortBlockPolicy::ITEMS_PER_THREAD; - radix_bits = SortBlockPolicy::RADIX_BITS; - alt_radix_bits = SortBlockPolicy::AltPolicy::RADIX_BITS; - smem_config = SortBlockPolicy::SMEM_CONFIG; - this->subscription_factor = subscription_factor; - tile_size = block_threads * items_per_thread; - } - }; - - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// Specializations of tuned policy types for different PTX architectures - template - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - 
}; - - // UpsweepPolicy - typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; -/* - // 4bit - typedef BlockRadixSortUpsweepTilesPolicy <128, 15, LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRadixSortUpsweepTilesPolicy <256, 13, LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; -*/ - // ScanPolicy - typedef BlockScanTilesPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // DownsweepPolicy - typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - -/* - // 4bit - typedef BlockRadixSortDownsweepTilesPolicy <128, 15, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRadixSortDownsweepTilesPolicy <256, 13, BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; -*/ - enum { SUBSCRIPTION_FACTOR = 7 }; - }; - - - /// SM20 tune - template - struct TunedPolicies - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - }; - - // UpsweepPolicy - typedef BlockRadixSortUpsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRadixSortUpsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // ScanPolicy - typedef BlockScanTilesPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // DownsweepPolicy - typedef BlockRadixSortDownsweepTilesPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRadixSortDownsweepTilesPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - - enum { SUBSCRIPTION_FACTOR = 3 }; - }; - - - /// SM10 tune - template - struct TunedPolicies - { - enum { - RADIX_BITS = 4, - }; - - // UpsweepPolicy - typedef BlockRadixSortUpsweepTilesPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy; - - // ScanPolicy - typedef BlockScanTilesPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // DownsweepPolicy - typedef BlockRadixSortDownsweepTilesPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, 
RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy; - - enum { SUBSCRIPTION_FACTOR = 3 }; - }; - - - - /****************************************************************************** - * Default policy initializer - ******************************************************************************/ - - /// Tuning policy for the PTX architecture that DeviceRadixSort operations will get dispatched to - template - struct PtxDefaultPolicies - { - - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 200) ? - 200 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // UpsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct UpsweepPolicy : PtxTunedPolicies::UpsweepPolicy {}; - - // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct ScanPolicy : PtxTunedPolicies::ScanPolicy {}; - - // DownsweepPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct DownsweepPolicy : PtxTunedPolicies::DownsweepPolicy {}; - - // Subscription factor for the current PTX compiler pass - enum { SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR }; - - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams( - int ptx_version, - KernelDispachParams &upsweep_dispatch_params, - KernelDispachParams &scan_dispatch_params, - KernelDispachParams &downsweep_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - upsweep_dispatch_params.InitUpsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - scan_dispatch_params.InitScanPolicy(); - downsweep_dispatch_params.InitDownsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - upsweep_dispatch_params.InitUpsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - scan_dispatch_params.InitScanPolicy(); - downsweep_dispatch_params.InitDownsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - } - else - { - typedef TunedPolicies TunedPolicies; - upsweep_dispatch_params.InitUpsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - scan_dispatch_params.InitScanPolicy(); - downsweep_dispatch_params.InitDownsweepPolicy(TunedPolicies::SUBSCRIPTION_FACTOR); - } - } - }; - - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations. - */ - template < - typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel - typename SpineKernelPtr, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel - typename Key, ///< Key type - typename Value, ///< Value type - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
- UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - SpineKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - KernelDispachParams &upsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for - KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - KernelDispachParams &downsweep_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - SizeT num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get a rough estimate of downsweep_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - int downsweep_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / downsweep_dispatch_params.block_threads); - int upsweep_sm_occupancy = downsweep_sm_occupancy; - -#ifndef __CUDA_ARCH__ - // We're on the host, so come up with more accurate estimates of SM occupancy from actual device properties - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - downsweep_sm_occupancy, - downsweep_kernel, - downsweep_dispatch_params.block_threads))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - upsweep_sm_occupancy, - upsweep_kernel, - upsweep_dispatch_params.block_threads))) break; -#endif - // Get device occupancies - int downsweep_occupancy = downsweep_sm_occupancy * sm_count; - - // Get even-share work distribution descriptor - GridEvenShare even_share; - int max_downsweep_grid_size = downsweep_occupancy * downsweep_dispatch_params.subscription_factor; - int downsweep_grid_size; - even_share.GridInit(num_items, max_downsweep_grid_size, downsweep_dispatch_params.tile_size); - downsweep_grid_size = even_share.grid_size; - - // Get number of spine elements (round up to nearest spine scan kernel tile size) - int bins = 1 << 
downsweep_dispatch_params.radix_bits; - int spine_size = downsweep_grid_size * bins; - int spine_tiles = (spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; - spine_size = spine_tiles * scan_dispatch_params.tile_size; - - int alt_bins = 1 << downsweep_dispatch_params.alt_radix_bits; - int alt_spine_size = downsweep_grid_size * alt_bins; - int alt_spine_tiles = (alt_spine_size + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; - alt_spine_size = alt_spine_tiles * scan_dispatch_params.tile_size; - - // Temporary storage allocation requirements - void* allocations[1]; - size_t allocation_sizes[1] = - { - spine_size * sizeof(SizeT), // bytes needed for privatized block digit histograms - }; - - // Alias temporaries (or set the necessary size of the storage allocation) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Privatized per-block digit histograms - SizeT *d_spine = (SizeT*) allocations[0]; - -#ifndef __CUDA_ARCH__ - // Get current smem bank configuration - cudaSharedMemConfig original_smem_config; - if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; - cudaSharedMemConfig current_smem_config = original_smem_config; -#endif - // Iterate over digit places - int current_bit = begin_bit; - while (current_bit < end_bit) - { - // Use primary bit granularity if bits remaining is a whole multiple of bit primary granularity - int bits_remaining = end_bit - current_bit; - bool use_primary_bit_granularity = (bits_remaining % downsweep_dispatch_params.radix_bits == 0); - int radix_bits = (use_primary_bit_granularity) ? - downsweep_dispatch_params.radix_bits : - downsweep_dispatch_params.alt_radix_bits; - -#ifndef __CUDA_ARCH__ - // Update smem config if necessary - if (current_smem_config != upsweep_dispatch_params.smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_dispatch_params.smem_config))) break; - current_smem_config = upsweep_dispatch_params.smem_config; - } -#endif - - // Log upsweep_kernel configuration - if (stream_synchronous) - CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n", - downsweep_grid_size, upsweep_dispatch_params.block_threads, (long long) stream, upsweep_dispatch_params.smem_config, upsweep_dispatch_params.items_per_thread, upsweep_sm_occupancy, d_keys.selector, current_bit, radix_bits); - - // Invoke upsweep_kernel with same grid size as downsweep_kernel - upsweep_kernel<<>>( - d_keys.d_buffers[d_keys.selector], - d_spine, - num_items, - current_bit, - use_primary_bit_granularity, - (current_bit == begin_bit), - even_share); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log scan_kernel configuration - if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", - 1, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread); - - // Invoke scan_kernel - scan_kernel<<<1, scan_dispatch_params.block_threads, 0, stream>>>( - d_spine, - (use_primary_bit_granularity) ? 
spine_size : alt_spine_size); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - -#ifndef __CUDA_ARCH__ - // Update smem config if necessary - if (current_smem_config != downsweep_dispatch_params.smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_dispatch_params.smem_config))) break; - current_smem_config = downsweep_dispatch_params.smem_config; - } -#endif - - // Log downsweep_kernel configuration - if (stream_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n", - downsweep_grid_size, downsweep_dispatch_params.block_threads, (long long) stream, downsweep_dispatch_params.smem_config, downsweep_dispatch_params.items_per_thread, downsweep_sm_occupancy); - - // Invoke downsweep_kernel - downsweep_kernel<<>>( - d_keys.d_buffers[d_keys.selector], - d_keys.d_buffers[d_keys.selector ^ 1], - d_values.d_buffers[d_values.selector], - d_values.d_buffers[d_values.selector ^ 1], - d_spine, - num_items, - current_bit, - use_primary_bit_granularity, - (current_bit == begin_bit), - (current_bit + downsweep_dispatch_params.radix_bits >= end_bit), - even_share); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Invert selectors - d_keys.selector ^= 1; - d_values.selector ^= 1; - - // Update current bit position - current_bit += radix_bits; - } - -#ifndef __CUDA_ARCH__ - // Reset smem config if necessary - if (current_smem_config != original_smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; - } -#endif - - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - /****************************************************************************** - * Interface - ******************************************************************************/ - - - /** - * \brief Sorts key-value pairs. - * - * \par - * The sorting operation requires a pair of key buffers and a pair of value - * buffers. Each pair is wrapped in a DoubleBuffer structure whose member - * DoubleBuffer::Current() references the active buffer. The currently-active - * buffer may be changed by the sorting operation. - * - * \devicestorage - * - * \cdp - * - * \par - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers for - * // sorting data (keys, values, and equivalently-sized alternate buffers) - * int num_items = ... 
- * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements for sorting operation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage for sorting operation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Sorted keys and values are referenced by d_keys.Current() and d_values.Current() - * - * \endcode - * - * \tparam Key [inferred] Key type - * \tparam Value [inferred] Value type - */ - template < - typename Key, - typename Value> - __host__ __device__ __forceinline__ - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - DoubleBuffer &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
- { - // Type used for array indexing - typedef int SizeT; - - // Tuning polices - typedef PtxDefaultPolicies PtxDefaultPolicies; // Wrapper of default kernel policies - typedef typename PtxDefaultPolicies::UpsweepPolicy UpsweepPolicy; // Upsweep kernel policy - typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy - typedef typename PtxDefaultPolicies::DownsweepPolicy DownsweepPolicy; // Downsweep kernel policy - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams upsweep_dispatch_params; - KernelDispachParams scan_dispatch_params; - KernelDispachParams downsweep_dispatch_params; - -#ifdef __CUDA_ARCH__ - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - upsweep_dispatch_params.InitUpsweepPolicy(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); - scan_dispatch_params.InitScanPolicy(); - downsweep_dispatch_params.InitDownsweepPolicy(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); -#else - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams( - ptx_version, - upsweep_dispatch_params, - scan_dispatch_params, - downsweep_dispatch_params); -#endif - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - RadixSortUpsweepKernel, - RadixSortScanKernel, - RadixSortDownsweepKernel, - upsweep_dispatch_params, - scan_dispatch_params, - downsweep_dispatch_params, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - stream_synchronous))) break; - } - while (0); - - return error; - } - - - /** - * \brief Sorts keys - * - * \par - * The sorting operation requires a pair of key buffers. The pair is - * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() - * references the active buffer. The currently-active buffer may be changed - * by the sorting operation. - * - * \devicestorage - * - * \cdp - * - * \par - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers for - * // sorting data (keys and equivalently-sized alternate buffer) - * int num_items = ... - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements for sorting operation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage for sorting operation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Sorted keys are referenced by d_keys.Current() - * - * \endcode - * - * \tparam Key [inferred] Key type - */ - template - __host__ __device__ __forceinline__ - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
- DoubleBuffer &d_keys, ///< [in,out] Double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { - DoubleBuffer d_values; - return SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, stream_synchronous); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/device/device_reduce.cuh b/kokkos/kokkos/TPL/cub/device/device_reduce.cuh deleted file mode 100644 index 069af8c..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_reduce.cuh +++ /dev/null @@ -1,775 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. 
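As with the other device-wide primitives in this library, callers query the temporary-storage requirement with a NULL allocation and then call again to do the work. A sketch of that convention applied to a sum reduction, assuming the umbrella header is <cub/cub.cuh> and that DeviceReduce::Reduce takes the argument order used throughout these examples (buffer names are placeholders):

    #include <cub/cub.cuh>   // assumed umbrella include; adjust to the actual header path

    cudaError_t ReduceSumSketch(int *d_in, int *d_out, int num_items)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // First call: d_temp_storage is NULL, so only the required allocation size is reported.
        cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum());
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Second call: the reduction actually runs; the single aggregate lands in d_out[0].
        cudaError_t error = cub::DeviceReduce::Reduce(
            d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum());

        cudaFree(d_temp_storage);
        return error;
    }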
- */ - -#pragma once - -#include -#include - -#include "block/block_reduce_tiles.cuh" -#include "../thread/thread_operators.cuh" -#include "../grid/grid_even_share.cuh" -#include "../grid/grid_queue.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - - - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Reduction pass kernel entry point (multi-block). Computes privatized reductions, one per thread block. - */ -template < - typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename SizeT, ///< Integer type used for global array indexing - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1) -__global__ void ReducePrivatizedKernel( - InputIteratorRA d_in, ///< [in] Input data to reduce - OutputIteratorRA d_out, ///< [out] Output location for result - SizeT num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Descriptor for how to map an even-share of tiles across thread blocks - GridQueue queue, ///< [in] Descriptor for performing dynamic mapping of tile data to thread blocks - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - // Data type - typedef typename std::iterator_traits::value_type T; - - // Thread block type for reducing input tiles - typedef BlockReduceTiles BlockReduceTilesT; - - // Block-wide aggregate - T block_aggregate; - - // Shared memory storage - __shared__ typename BlockReduceTilesT::TempStorage temp_storage; - - // Consume input tiles - BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles( - num_items, - even_share, - queue, - block_aggregate, - Int2Type()); - - // Output result - if (threadIdx.x == 0) - { - d_out[blockIdx.x] = block_aggregate; - } -} - - -/** - * Reduction pass kernel entry point (single-block). Aggregates privatized threadblock reductions from a previous multi-block reduction pass. 
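The two kernels here split the reduction into a multi-block pass that leaves one partial aggregate per thread block, followed by a single-block pass over those partials. A stripped-down CUDA sketch of that same two-pass idea (not CUB's tuned implementation; it assumes a power-of-two block size of 256):

    __global__ void PartialSumSketchKernel(const int *d_in, int *d_partials, int num_items)
    {
        __shared__ int smem[256];                       // assumes blockDim.x == 256
        int tid = threadIdx.x;
        int sum = 0;
        for (int i = blockIdx.x * blockDim.x + tid; i < num_items; i += gridDim.x * blockDim.x)
            sum += d_in[i];                             // grid-stride accumulation
        smem[tid] = sum;
        __syncthreads();
        for (int offset = blockDim.x / 2; offset > 0; offset /= 2)
        {
            if (tid < offset)
                smem[tid] += smem[tid + offset];        // shared-memory tree reduction
            __syncthreads();
        }
        if (tid == 0)
            d_partials[blockIdx.x] = smem[0];           // one aggregate per thread block
    }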
- */ -template < - typename BlockReduceTilesPolicy, ///< Tuning policy for cub::BlockReduceTiles abstraction - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename SizeT, ///< Integer type used for global array indexing - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(BlockReduceTilesPolicy::BLOCK_THREADS), 1) -__global__ void ReduceSingleKernel( - InputIteratorRA d_in, ///< [in] Input data to reduce - OutputIteratorRA d_out, ///< [out] Output location for result - SizeT num_items, ///< [in] Total number of input data items - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - // Data type - typedef typename std::iterator_traits::value_type T; - - // Thread block type for reducing input tiles - typedef BlockReduceTiles BlockReduceTilesT; - - // Block-wide aggregate - T block_aggregate; - - // Shared memory storage - __shared__ typename BlockReduceTilesT::TempStorage temp_storage; - - // Consume input tiles - BlockReduceTilesT(temp_storage, d_in, reduction_op).ConsumeTiles( - SizeT(0), - SizeT(num_items), - block_aggregate); - - // Output result - if (threadIdx.x == 0) - { - d_out[blockIdx.x] = block_aggregate; - } -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * DeviceReduce - *****************************************************************************/ - -/** - * \brief DeviceReduce provides operations for computing a device-wide, parallel reduction across data items residing within global memory. ![](reduce_logo.png) - * \ingroup DeviceModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - * \par Usage Considerations - * \cdp_class{DeviceReduce} - * - * \par Performance - * - * \image html reduction_perf.png - * - */ -struct DeviceReduce -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties codified in block policy. 
- struct KernelDispachParams - { - int block_threads; - int items_per_thread; - int vector_load_length; - BlockReduceAlgorithm block_algorithm; - PtxLoadModifier load_modifier; - GridMappingStrategy grid_mapping; - int subscription_factor; - int tile_size; - - template - __host__ __device__ __forceinline__ - void Init(int subscription_factor = 1) - { - block_threads = BlockPolicy::BLOCK_THREADS; - items_per_thread = BlockPolicy::ITEMS_PER_THREAD; - vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH; - block_algorithm = BlockPolicy::BLOCK_ALGORITHM; - load_modifier = BlockPolicy::LOAD_MODIFIER; - grid_mapping = BlockPolicy::GRID_MAPPING; - this->subscription_factor = subscription_factor; - tile_size = block_threads * items_per_thread; - } - - __host__ __device__ __forceinline__ - void Print() - { - printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping, %d subscription", - block_threads, - items_per_thread, - vector_load_length, - block_algorithm, - load_modifier, - grid_mapping, - subscription_factor); - } - - }; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// Specializations of tuned policy types for different PTX architectures - template < - typename T, - typename SizeT, - int ARCH> - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - // PrivatizedPolicy (1B): GTX Titan: 206.0 GB/s @ 192M 1B items - typedef BlockReduceTilesPolicy<128, 12, 1, BLOCK_REDUCE_RAKING, LOAD_LDG, GRID_MAPPING_DYNAMIC> PrivatizedPolicy1B; - - // PrivatizedPolicy (4B): GTX Titan: 254.2 GB/s @ 48M 4B items - typedef BlockReduceTilesPolicy<512, 20, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy4B; - - // PrivatizedPolicy - typedef typename If<(sizeof(T) < 4), - PrivatizedPolicy1B, - PrivatizedPolicy4B>::Type PrivatizedPolicy; - - // SinglePolicy - typedef BlockReduceTilesPolicy<256, 8, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; - - enum { SUBSCRIPTION_FACTOR = 7 }; - - }; - - /// SM30 tune - template - struct TunedPolicies - { - // PrivatizedPolicy: GTX670: 154.0 @ 48M 32-bit T - typedef BlockReduceTilesPolicy<256, 2, 1, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; - - // SinglePolicy - typedef BlockReduceTilesPolicy<256, 24, 4, BLOCK_REDUCE_WARP_REDUCTIONS, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; - - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - /// SM20 tune - template - struct TunedPolicies - { - // PrivatizedPolicy (1B): GTX 580: 158.1 GB/s @ 192M 1B items - typedef BlockReduceTilesPolicy<192, 24, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy1B; - - // PrivatizedPolicy (4B): GTX 580: 178.9 GB/s @ 48M 4B items - typedef BlockReduceTilesPolicy<128, 8, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_DYNAMIC> PrivatizedPolicy4B; - - // PrivatizedPolicy - typedef typename If<(sizeof(T) < 4), - PrivatizedPolicy1B, - PrivatizedPolicy4B>::Type PrivatizedPolicy; - - // SinglePolicy - typedef BlockReduceTilesPolicy<192, 7, 1, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; - - enum { SUBSCRIPTION_FACTOR = 2 }; - }; - - /// SM13 tune - template - struct TunedPolicies - { - // PrivatizedPolicy - typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; - - // SinglePolicy - 
typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; - - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - /// SM10 tune - template - struct TunedPolicies - { - // PrivatizedPolicy - typedef BlockReduceTilesPolicy<128, 8, 2, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> PrivatizedPolicy; - - // SinglePolicy - typedef BlockReduceTilesPolicy<32, 4, 4, BLOCK_REDUCE_RAKING, LOAD_DEFAULT, GRID_MAPPING_EVEN_SHARE> SinglePolicy; - - enum { SUBSCRIPTION_FACTOR = 1 }; - }; - - - - /****************************************************************************** - * Default policy initializer - ******************************************************************************/ - - /// Tuning policy for the PTX architecture that DeviceReduce operations will get dispatched to - template - struct PtxDefaultPolicies - { - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 300) ? - 300 : - (CUB_PTX_ARCH >= 200) ? - 200 : - (CUB_PTX_ARCH >= 130) ? - 130 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // Subscription factor for the current PTX compiler pass - static const int SUBSCRIPTION_FACTOR = PtxTunedPolicies::SUBSCRIPTION_FACTOR; - - // PrivatizedPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct PrivatizedPolicy : PtxTunedPolicies::PrivatizedPolicy {}; - - // SinglePolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct SinglePolicy : PtxTunedPolicies::SinglePolicy {}; - - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams( - int ptx_version, - KernelDispachParams &privatized_dispatch_params, - KernelDispachParams &single_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - privatized_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); - } - else if (ptx_version >= 300) - { - typedef TunedPolicies TunedPolicies; - privatized_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - privatized_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); - } - else if (ptx_version >= 130) - { - typedef TunedPolicies TunedPolicies; - privatized_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); - } - else - { - typedef TunedPolicies TunedPolicies; - privatized_dispatch_params.Init(TunedPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); - } - } - }; - - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide reduction using a two-stages of kernel invocations. 
- */ - template < - typename ReducePrivatizedKernelPtr, ///< Function type of cub::ReducePrivatizedKernel - typename ReduceSingleKernelPtr, ///< Function type of cub::ReduceSingleKernel - typename ResetDrainKernelPtr, ///< Function type of cub::ResetDrainKernel - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename SizeT, ///< Integer type used for global array indexing - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - ReducePrivatizedKernelPtr privatized_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReducePrivatizedKernel - ReduceSingleKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceSingleKernel - ResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::ResetDrainKernel - KernelDispachParams &privatized_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p privatized_kernel_ptr was compiled for - KernelDispachParams &single_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for - InputIteratorRA d_in, ///< [in] Input data to reduce - OutputIteratorRA d_out, ///< [out] Output location for result - SizeT num_items, ///< [in] Number of items to reduce - ReductionOp reduction_op, ///< [in] Binary reduction operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
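// A rough sketch of the grid-sizing arithmetic performed by the dispatch body below for the
// privatized (multi-block) pass. The helper name and parameter list are hypothetical; CUB's
// GridEvenShare computes the exact even-share partitioning from device properties.

inline int PrivatizedGridSize(
    int  num_items,            // total number of input items
    int  tile_size,            // block_threads * items_per_thread
    int  device_occupancy,     // resident thread blocks per device (SM occupancy * SM count)
    int  subscription_factor,  // oversubscription multiplier used by even-share mapping
    bool dynamic_mapping)      // true: GRID_MAPPING_DYNAMIC, false: GRID_MAPPING_EVEN_SHARE
{
    int num_tiles = (num_items + tile_size - 1) / tile_size;

    if (dynamic_mapping)
    {
        // Launch just enough blocks to fill the device; blocks then drain tiles from a queue
        return (num_tiles < device_occupancy) ? num_tiles : device_occupancy;
    }

    // Even-share mapping: oversubscribe the device, then statically split tiles among blocks
    int max_blocks = device_occupancy * subscription_factor;
    return (num_tiles < max_blocks) ? num_tiles : max_blocks;
}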
- { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - cudaError error = cudaSuccess; - do - { - if ((privatized_kernel == NULL) || (num_items <= (single_dispatch_params.tile_size))) - { - // Dispatch a single-block reduction kernel - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Log single_kernel configuration - if (stream_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n", - single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread); - - // Invoke single_kernel - single_kernel<<<1, single_dispatch_params.block_threads>>>( - d_in, - d_out, - num_items, - reduction_op); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - } - else - { - // Dispatch two kernels: a multi-block kernel to compute - // privatized per-block reductions, and then a single-block - // to reduce those - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get a rough estimate of privatized_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - int privatized_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / privatized_dispatch_params.block_threads); - -#ifndef __CUDA_ARCH__ - // We're on the host, so come up with a more accurate estimate of privatized_kernel SM occupancy from actual device properties - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - privatized_sm_occupancy, - privatized_kernel, - privatized_dispatch_params.block_threads))) break; -#endif - - // Get device occupancy for privatized_kernel - int privatized_occupancy = privatized_sm_occupancy * sm_count; - - // Even-share work distribution - GridEvenShare even_share; - - // Get grid size for privatized_kernel - int privatized_grid_size; - switch (privatized_dispatch_params.grid_mapping) - { - case GRID_MAPPING_EVEN_SHARE: - - // Work is distributed evenly - even_share.GridInit( - num_items, - privatized_occupancy * privatized_dispatch_params.subscription_factor, - privatized_dispatch_params.tile_size); - privatized_grid_size = even_share.grid_size; - break; - - case GRID_MAPPING_DYNAMIC: - - // Work is distributed dynamically - int num_tiles = (num_items + privatized_dispatch_params.tile_size - 1) / privatized_dispatch_params.tile_size; - privatized_grid_size = (num_tiles < privatized_occupancy) ? 
- num_tiles : // Not enough to fill the device with threadblocks - privatized_occupancy; // Fill the device with threadblocks - break; - }; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - privatized_grid_size * sizeof(T), // bytes needed for privatized block reductions - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias temporaries (or set the necessary size of the storage allocation) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Privatized per-block reductions - T *d_block_reductions = (T*) allocations[0]; - - // Grid queue descriptor - GridQueue queue(allocations[1]); - - // Prepare the dynamic queue descriptor if necessary - if (privatized_dispatch_params.grid_mapping == GRID_MAPPING_DYNAMIC) - { - // Prepare queue using a kernel so we know it gets prepared once per operation - if (stream_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream); - - // Invoke prepare_drain_kernel - prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - - // Log privatized_kernel configuration - if (stream_synchronous) CubLog("Invoking privatized_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - privatized_grid_size, privatized_dispatch_params.block_threads, (long long) stream, privatized_dispatch_params.items_per_thread, privatized_sm_occupancy); - - // Invoke privatized_kernel - privatized_kernel<<>>( - d_in, - d_block_reductions, - num_items, - even_share, - queue, - reduction_op); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log single_kernel configuration - if (stream_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", - 1, single_dispatch_params.block_threads, (long long) stream, single_dispatch_params.items_per_thread); - - // Invoke single_kernel - single_kernel<<<1, single_dispatch_params.block_threads, 0, stream>>>( - d_block_reductions, - d_out, - privatized_grid_size, - reduction_op); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /** - * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor. - * - * \par - * Does not support non-commutative reduction operators. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the max reduction of a device vector of \p int items. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input and output - * int *d_reduce_input, *d_aggregate; - * int num_items = ... - * ... 
- * - * // Determine temporary device storage requirements for reduction - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max()); - * - * // Allocate temporary storage for reduction - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction (max) - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items, cub::Max()); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA, - typename ReductionOp> - __host__ __device__ __forceinline__ - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Input data to reduce - OutputIteratorRA d_out, ///< [out] Output location for result - int num_items, ///< [in] Number of items to reduce - ReductionOp reduction_op, ///< [in] Binary reduction operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { - // Type used for array indexing - typedef int SizeT; - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Tuning polices - typedef PtxDefaultPolicies PtxDefaultPolicies; // Wrapper of default kernel policies - typedef typename PtxDefaultPolicies::PrivatizedPolicy PrivatizedPolicy; // Multi-block kernel policy - typedef typename PtxDefaultPolicies::SinglePolicy SinglePolicy; // Single-block kernel policy - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams privatized_dispatch_params; - KernelDispachParams single_dispatch_params; - -#ifdef __CUDA_ARCH__ - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - privatized_dispatch_params.Init(PtxDefaultPolicies::SUBSCRIPTION_FACTOR); - single_dispatch_params.Init(); -#else - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams(ptx_version, privatized_dispatch_params, single_dispatch_params); -#endif - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - ReducePrivatizedKernel, - ReduceSingleKernel, - ResetDrainKernel, - privatized_dispatch_params, - single_dispatch_params, - d_in, - d_out, - num_items, - reduction_op, - stream, - stream_synchronous))) break; - } - while (0); - - return error; - } - - - /** - * \brief Computes a device-wide sum using the addition ('+') operator. - * - * \par - * Does not support non-commutative reduction operators. 
- * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the sum reduction of a device vector of \p int items. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input and output - * int *d_reduce_input, *d_aggregate; - * int num_items = ... - * ... - * - * // Determine temporary device storage requirements for summation - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items); - * - * // Allocate temporary storage for summation - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction summation - * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_reduce_input, d_aggregate, num_items); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA> - __host__ __device__ __forceinline__ - static cudaError_t Sum( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Input data to reduce - OutputIteratorRA d_out, ///< [out] Output location for result - int num_items, ///< [in] Number of items to reduce - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { - return Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, cub::Sum(), stream, stream_synchronous); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/device/device_reduce_by_key.cuh b/kokkos/kokkos/TPL/cub/device/device_reduce_by_key.cuh deleted file mode 100644 index f05f751..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_reduce_by_key.cuh +++ /dev/null @@ -1,633 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "block/block_reduce_by_key_tiles.cuh" -#include "device_scan.cuh" -#include "../thread/thread_operators.cuh" -#include "../grid/grid_queue.cuh" -#include "../util_iterator.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Reduce-by-key kernel entry point (multi-block) - */ -template < - typename BlockReduceByKeyilesPolicy, ///< Tuning policy for cub::BlockReduceByKeyiles abstraction - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename T, ///< The scan data type - typename ReductionOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockSweepScanPolicy::BLOCK_THREADS)) -__global__ void MultiBlockScanKernel( - InputIteratorRA d_in, ///< Input data - OutputIteratorRA d_out, ///< Output data - ScanTileDescriptor *d_tile_status, ///< Global list of tile status - ReductionOp reduction_op, ///< Binary scan operator - Identity identity, ///< Identity element - SizeT num_items, ///< Total number of scan items for the entire problem - GridQueue queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks -{ - enum - { - TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, - }; - - // Thread block type for scanning input tiles - typedef BlockSweepScan< - BlockSweepScanPolicy, - InputIteratorRA, - OutputIteratorRA, - ReductionOp, - Identity, - SizeT> BlockSweepScanT; - - // Shared memory for BlockSweepScan - __shared__ typename BlockSweepScanT::TempStorage temp_storage; - - // Process tiles - BlockSweepScanT(temp_storage, d_in, d_out, reduction_op, identity).ConsumeTiles( - num_items, - queue, - d_tile_status + TILE_STATUS_PADDING); -} - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * 
DeviceReduceByKey - *****************************************************************************/ - -/** - * \addtogroup DeviceModule - * @{ - */ - -/** - * \brief DeviceReduceByKey provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](scan_logo.png) - */ -struct DeviceReduceByKey -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockSweepScanPolicy. - struct KernelDispachParams - { - // Policy fields - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - BlockStoreAlgorithm store_policy; - BlockScanAlgorithm scan_algorithm; - - // Other misc - int tile_size; - - template - __host__ __device__ __forceinline__ - void Init() - { - block_threads = BlockSweepScanPolicy::BLOCK_THREADS; - items_per_thread = BlockSweepScanPolicy::ITEMS_PER_THREAD; - load_policy = BlockSweepScanPolicy::LOAD_ALGORITHM; - store_policy = BlockSweepScanPolicy::STORE_ALGORITHM; - scan_algorithm = BlockSweepScanPolicy::SCAN_ALGORITHM; - - tile_size = block_threads * items_per_thread; - } - - __host__ __device__ __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - store_policy, - scan_algorithm); - } - - }; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - - /// Specializations of tuned policy types for different PTX architectures - template < - typename T, - typename SizeT, - int ARCH> - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - typedef BlockSweepScanPolicy<128, 16, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; - }; - - /// SM30 tune - template - struct TunedPolicies - { - typedef BlockSweepScanPolicy<256, 9, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; - }; - - /// SM20 tune - template - struct TunedPolicies - { - typedef BlockSweepScanPolicy<128, 15, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> MultiBlockPolicy; - }; - - /// SM10 tune - template - struct TunedPolicies - { - typedef BlockSweepScanPolicy<128, 7, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> MultiBlockPolicy; - }; - - - /// Tuning policy for the PTX architecture that DeviceReduceByKey operations will get dispatched to - template - struct PtxDefaultPolicies - { - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 300) ? - 300 : - (CUB_PTX_ARCH >= 200) ? 
- 200 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // MultiBlockPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct MultiBlockPolicy : PtxTunedPolicies::MultiBlockPolicy {}; - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams(int ptx_version, KernelDispachParams &multi_block_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(); - } - else if (ptx_version >= 300) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(); - } - else - { - typedef TunedPolicies TunedPolicies; - multi_block_dispatch_params.Init(); - } - } - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine - */ - template < - typename InitScanKernelPtr, ///< Function type of cub::InitScanKernel - typename MultiBlockScanKernelPtr, ///< Function type of cub::MultiBlockScanKernel - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename ReductionOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InitScanKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::InitScanKernel - MultiBlockScanKernelPtr multi_block_kernel, ///< [in] Kernel function pointer to parameterization of cub::MultiBlockScanKernel - KernelDispachParams &multi_block_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p multi_block_kernel was compiled for - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ReductionOp reduction_op, ///< [in] Binary scan operator - Identity identity, ///< [in] Identity element - SizeT num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
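// A minimal sketch of the temporary-storage protocol that the dispatch body below relies on:
// a NULL d_temp_storage means "report the required size and do no work"; otherwise the single
// user allocation is carved into the individual sub-allocations. CarveTemporaries is a
// hypothetical stand-in for cub::AliasTemporaries, with a simple 256-byte alignment assumed.

#include <cuda_runtime.h>
#include <cstddef>

inline cudaError_t CarveTemporaries(
    void    *d_temp_storage,        // NULL, or a single device allocation
    size_t  &temp_storage_bytes,    // in/out: size of that allocation
    void   *(&allocations)[2],      // out: base pointer of each sub-allocation
    size_t  (&allocation_sizes)[2]) // in: bytes needed by each sub-allocation
{
    const size_t ALIGN = 256;
    size_t offsets[2];
    size_t total = 0;

    for (int i = 0; i < 2; ++i)
    {
        offsets[i] = total;
        total += (allocation_sizes[i] + ALIGN - 1) / ALIGN * ALIGN;
    }

    if (d_temp_storage == NULL)
    {
        // Size-query pass: report the requirement, perform no work
        temp_storage_bytes = total;
        return cudaSuccess;
    }

    if (temp_storage_bytes < total)
        return cudaErrorInvalidValue;

    for (int i = 0; i < 2; ++i)
        allocations[i] = static_cast<char*>(d_temp_storage) + offsets[i];

    return cudaSuccess;
}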
- { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - - enum - { - TILE_STATUS_PADDING = 32, - }; - - // Data type - typedef typename std::iterator_traits::value_type T; - - cudaError error = cudaSuccess; - do - { - // Number of input tiles - int num_tiles = (num_items + multi_block_dispatch_params.tile_size - 1) / multi_block_dispatch_params.tile_size; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptor), // bytes needed for tile status descriptors - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias temporaries (or set the necessary size of the storage allocation) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Global list of tile status - ScanTileDescriptor *d_tile_status = (ScanTileDescriptor*) allocations[0]; - - // Grid queue descriptor - GridQueue queue(allocations[1]); - - // Get GPU id - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Log init_kernel configuration - int init_kernel_threads = 128; - int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads; - if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - d_tile_status, - num_tiles); - - // Sync the stream if specified -#ifndef __CUDA_ARCH__ - if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; -#else - if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; -#endif - - // Get a rough estimate of multi_block_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - int multi_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / multi_block_dispatch_params.block_threads); - -#ifndef __CUDA_ARCH__ - - // We're on the host, so come up with a more accurate estimate of multi_block_kernel SM occupancy from actual device properties - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - multi_sm_occupancy, - multi_block_kernel, - multi_block_dispatch_params.block_threads))) break; - -#endif - // Get device occupancy for multi_block_kernel - int multi_block_occupancy = multi_sm_occupancy * sm_count; - - // Get grid size for multi_block_kernel - int multi_block_grid_size = (num_tiles < multi_block_occupancy) ? 
- num_tiles : // Not enough to fill the device with threadblocks - multi_block_occupancy; // Fill the device with threadblocks - - // Log multi_block_kernel configuration - if (stream_synchronous) CubLog("Invoking multi_block_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - multi_block_grid_size, multi_block_dispatch_params.block_threads, (long long) stream, multi_block_dispatch_params.items_per_thread, multi_sm_occupancy); - - // Invoke multi_block_kernel - multi_block_kernel<<>>( - d_in, - d_out, - d_tile_status, - reduction_op, - identity, - num_items, - queue); - - // Sync the stream if specified -#ifndef __CUDA_ARCH__ - if (stream_synchronous && CubDebug(error = cudaStreamSynchronize(stream))) break; -#else - if (stream_synchronous && CubDebug(error = cudaDeviceSynchronize())) break; -#endif - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - - /** - * Internal scan dispatch routine for using default tuning policies - */ - template < - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename ReductionOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ReductionOp reduction_op, ///< [in] Binary scan operator - Identity identity, ///< [in] Identity element - SizeT num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
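// A simplified sketch of the dynamic tile-draining idea behind the GridQueue descriptor used
// by the multi-block launch above: a fixed-size grid of blocks repeatedly claims tile indices
// from a global atomic counter until every tile has been consumed. The kernel below is a
// hypothetical stand-in (not cub::GridQueue); d_tile_counter is assumed to be zero-initialized
// before launch (e.g., with cudaMemset or a tiny init kernel).

__global__ void DrainTilesKernel(int *d_tile_counter, int num_tiles)
{
    __shared__ volatile int tile_idx;

    while (true)
    {
        // Thread 0 claims the next unprocessed tile on behalf of the whole block
        if (threadIdx.x == 0)
            tile_idx = atomicAdd(d_tile_counter, 1);
        __syncthreads();

        if (tile_idx >= num_tiles)
            break;

        // ... process the tile covering items [tile_idx * tile_size, (tile_idx + 1) * tile_size) ...

        __syncthreads();   // ensure all threads are done before claiming the next tile
    }
}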
- { - // Data type - typedef typename std::iterator_traits::value_type T; - - // Tuning polices for the PTX architecture that will get dispatched to - typedef PtxDefaultPolicies PtxDefaultPolicies; - typedef typename PtxDefaultPolicies::MultiBlockPolicy MultiBlockPolicy; - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams multi_block_dispatch_params; - -#ifdef __CUDA_ARCH__ - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - multi_block_dispatch_params.Init(); -#else - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams(ptx_version, multi_block_dispatch_params); -#endif - - Dispatch( - d_temp_storage, - temp_storage_bytes, - InitScanKernel, - MultiBlockScanKernel, - multi_block_dispatch_params, - d_in, - d_out, - reduction_op, - identity, - num_items, - stream, - stream_synchronous); - - if (CubDebug(error)) break; - } - while (0); - - return error; - } - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - - /******************************************************************//** - * Interface - *********************************************************************/ - - - /** - * \brief Computes device-wide reductions of consecutive values whose corresponding keys are equal. - * - * The resulting output lists of value-aggregates and their corresponding keys are compacted. - * - * \devicestorage - * - * \tparam KeyInputIteratorRA [inferred] Random-access input iterator type for keys input (may be a simple pointer type) - * \tparam KeyOutputIteratorRA [inferred] Random-access output iterator type for keys output (may be a simple pointer type) - * \tparam ValueInputIteratorRA [inferred] Random-access input iterator type for values input (may be a simple pointer type) - * \tparam ValueOutputIteratorRA [inferred] Random-access output iterator type for values output (may be a simple pointer type) - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b), where \p T is the value type of \p ValueInputIteratorRA - */ - template < - typename KeyInputIteratorRA, - typename KeyOutputIteratorRA, - typename ValueInputIteratorRA, - typename ValueOutputIteratorRA, - typename ReductionOp> - __host__ __device__ __forceinline__ - static cudaError_t ReduceValues( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - KeyInputIteratorRA d_keys_in, ///< [in] Key input data - KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted) - ValueInputIteratorRA d_values_in, ///< [in] Value input data - ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted) - int num_items, ///< [in] Total number of input pairs - ReductionOp reduction_op, ///< [in] Binary value reduction operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
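// A sequential host-side reference for the semantics of the ReduceValues interface above:
// consecutive values whose keys compare equal are folded together with reduction_op, and the
// compacted (key, aggregate) pairs are written out. This is only a specification sketch under
// that reading of the documentation; it is not how the device implementation operates.

template <typename Key, typename Value, typename ReductionOp>
int ReduceValuesReference(
    const Key   *keys_in,
    Key         *keys_out,
    const Value *values_in,
    Value       *values_out,
    int          num_items,
    ReductionOp  reduction_op)
{
    if (num_items == 0) return 0;

    int   num_segments = 0;
    Key   run_key      = keys_in[0];
    Value run_value    = values_in[0];

    for (int i = 1; i < num_items; ++i)
    {
        if (keys_in[i] == run_key)
        {
            // Same segment: fold the value into the running aggregate
            run_value = reduction_op(run_value, values_in[i]);
        }
        else
        {
            // Segment boundary: emit the compacted (key, aggregate) pair
            keys_out[num_segments]   = run_key;
            values_out[num_segments] = run_value;
            ++num_segments;
            run_key   = keys_in[i];
            run_value = values_in[i];
        }
    }

    // Emit the final segment
    keys_out[num_segments]   = run_key;
    values_out[num_segments] = run_value;
    return num_segments + 1;
}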
- { - return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous); - } - - - /** - * \brief Computes device-wide sums of consecutive values whose corresponding keys are equal. - * - * The resulting output lists of value-aggregates and their corresponding keys are compacted. - * - * \devicestorage - * - * \tparam KeyInputIteratorRA [inferred] Random-access input iterator type for keys input (may be a simple pointer type) - * \tparam KeyOutputIteratorRA [inferred] Random-access output iterator type for keys output (may be a simple pointer type) - * \tparam ValueInputIteratorRA [inferred] Random-access input iterator type for values input (may be a simple pointer type) - * \tparam ValueOutputIteratorRA [inferred] Random-access output iterator type for values output (may be a simple pointer type) - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b), where \p T is the value type of \p ValueInputIteratorRA - */ - template < - typename KeyInputIteratorRA, - typename KeyOutputIteratorRA, - typename ValueInputIteratorRA, - typename ValueOutputIteratorRA> - __host__ __device__ __forceinline__ - static cudaError_t SumValues( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - KeyInputIteratorRA d_keys_in, ///< [in] Key input data - KeyOutputIteratorRA d_keys_out, ///< [in] Key output data (compacted) - ValueInputIteratorRA d_values_in, ///< [in] Value input data - ValueOutputIteratorRA d_values_out, ///< [in] Value output data (compacted) - int num_items, ///< [in] Total number of input pairs - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return ReduceValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, cub::Sum(), num_items, stream, stream_synchronous); - } - - - /** - * \brief Computes the "run-length" of each group of consecutive, equal-valued keys. - * - * The resulting output lists of run-length counts and their corresponding keys are compacted. - * - * \devicestorage - * - * \tparam KeyInputIteratorRA [inferred] Random-access input iterator type for keys input (may be a simple pointer type) - * \tparam KeyOutputIteratorRA [inferred] Random-access output iterator type for keys output (may be a simple pointer type) - * \tparam CountOutputIteratorRA [inferred] Random-access output iterator type for output of key-counts whose value type must be convertible to an integer type (may be a simple pointer type) - */ - template < - typename KeyInputIteratorRA, - typename KeyOutputIteratorRA, - typename CountOutputIteratorRA> - __host__ __device__ __forceinline__ - static cudaError_t RunLengths( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
- KeyInputIteratorRA d_keys_in, ///< [in] Key input data - KeyOutputIteratorRA d_keys_out, ///< [in] Key output data (compacted) - CountOutputIteratorRA d_counts_out, ///< [in] Run-length counts output data (compacted) - int num_items, ///< [in] Total number of keys - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef typename std::iterator_traits::value_type CountT; - return SumValues(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, ConstantIteratorRA(1), d_counts_out, num_items, stream, stream_synchronous); - } - - - /** - * \brief Removes duplicates within each group of consecutive, equal-valued keys. Only the first key from each group (and corresponding value) is kept. - * - * The resulting keys are compacted. - * - * \devicestorage - * - * \tparam KeyInputIteratorRA [inferred] Random-access input iterator type for keys input (may be a simple pointer type) - * \tparam KeyOutputIteratorRA [inferred] Random-access output iterator type for keys output (may be a simple pointer type) - * \tparam ValueInputIteratorRA [inferred] Random-access input iterator type for values input (may be a simple pointer type) - * \tparam ValueOutputIteratorRA [inferred] Random-access output iterator type for values output (may be a simple pointer type) - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b), where \p T is the value type of \p ValueInputIteratorRA - */ - template < - typename KeyInputIteratorRA, - typename KeyOutputIteratorRA, - typename ValueInputIteratorRA, - typename ValueOutputIteratorRA, - typename ReductionOp> - __host__ __device__ __forceinline__ - static cudaError_t Unique( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - KeyInputIteratorRA d_keys_in, ///< [in] Key input data - KeyOutputIteratorRA d_keys_out, ///< [out] Key output data (compacted) - ValueInputIteratorRA d_values_in, ///< [in] Value input data - ValueOutputIteratorRA d_values_out, ///< [out] Value output data (compacted) - int num_items, ///< [in] Total number of input pairs - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, num_items, stream, stream_synchronous); - } - - - -}; - - -/** @} */ // DeviceModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/device/device_reorder.cuh b/kokkos/kokkos/TPL/cub/device/device_reorder.cuh deleted file mode 100644 index cba3bb4..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_reorder.cuh +++ /dev/null @@ -1,550 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "device_scan.cuh" -#include "block/block_partition_tiles.cuh" -#include "../grid/grid_queue.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_vector.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Partition kernel entry point (multi-block) - */ -template < - typename BlockPartitionTilesPolicy, ///< Tuning policy for cub::BlockPartitionTiles abstraction - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) - typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member bool operator()(const T &val) - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockPartitionTilesPolicy::BLOCK_THREADS)) -__global__ void PartitionKernel( - InputIteratorRA d_in, ///< Input data - OutputIteratorRA d_out, ///< Output data - LengthOutputIterator d_partition_length, ///< Number of items in the first partition - ScanTileDescriptor > *d_tile_status, ///< Global list of tile status - PredicateOp pred_op, ///< Unary predicate operator indicating membership in the first partition - SizeT num_items, ///< 
Total number of input items for the entire problem - int num_tiles, ///< Totla number of intut tiles for the entire problem - GridQueue queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks -{ - enum - { - TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, - }; - - typedef PartitionScanTuple PartitionScanTuple; - - // Thread block type for scanning input tiles - typedef BlockPartitionTiles< - BlockPartitionTilesPolicy, - InputIteratorRA, - OutputIteratorRA, - PredicateOp, - SizeT> BlockPartitionTilesT; - - // Shared memory for BlockPartitionTiles - __shared__ typename BlockPartitionTilesT::TempStorage temp_storage; - - // Process tiles - PartitionScanTuple partition_ends; // Ending offsets for partitions (one-after) - bool is_last_tile; // Whether or not this block handled the last tile (i.e., partition_ends is valid for the entire input) - BlockPartitionTilesT(temp_storage, d_in, d_out, d_tile_status + TILE_STATUS_PADDING, pred_op, num_items).ConsumeTiles( - queue, - num_tiles, - partition_ends, - is_last_tile); - - // Record the length of the first partition - if (is_last_tile && (threadIdx.x == 0)) - { - *d_partition_length = partition_ends.x; - } -} - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * DeviceReorder - *****************************************************************************/ - -/** - * \addtogroup DeviceModule - * @{ - */ - -/** - * \brief DeviceReorder provides device-wide operations for partitioning and filtering lists of items residing within global memory - */ -struct DeviceReorder -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockPartitionTilesPolicy. 
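// A sequential host-side reference for the two-way partitioning computed by PartitionKernel
// above: items for which pred_op returns true are packed at the front of the output, the
// remaining items follow, and the length of the first partition is returned. Packing the
// second partition from the back (and hence reversing its order) is an assumption of this
// sketch; the device implementation may order the second partition differently.

template <typename T, typename PredicateOp>
int PartitionReference(
    const T     *in,
    T           *out,
    int          num_items,
    PredicateOp  pred_op)
{
    int first_end  = 0;            // next free slot in the first partition
    int second_end = num_items;    // one past the last filled slot of the second partition

    for (int i = 0; i < num_items; ++i)
    {
        if (pred_op(in[i]))
            out[first_end++] = in[i];     // selected items are packed at the front
        else
            out[--second_end] = in[i];    // rejected items fill in from the back
    }

    return first_end;   // length of the first partition (the pivot offset)
}

// Example functor with the PredicateOp interface expected above,
// i.e., bool operator()(const T &val) indicating membership in the first partition:
struct LessThanPivot
{
    int pivot;
    __host__ __device__ bool operator()(const int &val) const { return val < pivot; }
};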
- struct KernelDispachParams - { - int block_threads; - int items_per_thread; - BlockScanAlgorithm scan_algorithm; - int tile_size; - - template - __host__ __device__ __forceinline__ - void Init() - { - block_threads = BlockPartitionTilesPolicy::BLOCK_THREADS; - items_per_thread = BlockPartitionTilesPolicy::ITEMS_PER_THREAD; - scan_algorithm = BlockPartitionTilesPolicy::SCAN_ALGORITHM; - tile_size = block_threads * items_per_thread; - } - }; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - - /// Specializations of tuned policy types for different PTX architectures - template < - int PARTITIONS, - typename T, - typename SizeT, - int ARCH> - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 16, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockPartitionTilesPolicy PartitionPolicy; - }; - - /// SM30 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockPartitionTilesPolicy PartitionPolicy; - }; - - /// SM20 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockPartitionTilesPolicy PartitionPolicy; - }; - - /// SM10 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - typedef BlockPartitionTilesPolicy PartitionPolicy; - }; - - - /// Tuning policy for the PTX architecture that DevicePartition operations will get dispatched to - template - struct PtxDefaultPolicies - { - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 300) ? - 300 : - (CUB_PTX_ARCH >= 200) ? 
- 200 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // PartitionPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct PartitionPolicy : PtxTunedPolicies::PartitionPolicy {}; - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else if (ptx_version >= 300) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - } - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine - */ - template < - typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel - typename PartitionKernelPtr, ///< Function type of cub::PartitionKernel - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) - typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member bool operator()(const T &val) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - int ptx_version, ///< [in] PTX version - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionInitKernel - PartitionKernelPtr partition_kernel, ///< [in] Kernel function pointer to parameterization of cub::PartitionKernel - KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p partition_kernel was compiled for - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded - PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition - SizeT num_items, ///< [in] Total number of items to partition - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
- { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - enum - { - TILE_STATUS_PADDING = 32, - }; - - // Data type - typedef typename std::iterator_traits::value_type T; - - // Scan tuple type and tile status descriptor type - typedef typename VectorHelper::Type ScanTuple; - typedef ScanTileDescriptor ScanTileDescriptorT; - - cudaError error = cudaSuccess; - do - { - // Number of input tiles - int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias temporaries (or set the necessary size of the storage allocation) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Global list of tile status - ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0]; - - // Grid queue descriptor - GridQueue queue(allocations[1]); - - // Log init_kernel configuration - int init_kernel_threads = 128; - int init_grid_size = (num_tiles + init_kernel_threads - 1) / init_kernel_threads; - if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, init_kernel_threads, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - d_tile_status, - num_tiles); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get grid size for multi-block kernel - int scan_grid_size; - int multi_sm_occupancy = -1; - if (ptx_version < 200) - { - // We don't have atomics (or don't have fast ones), so just assign one - // block per tile (limited to 65K tiles) - scan_grid_size = num_tiles; - } - else - { - // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor. - // Get GPU id - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get a rough estimate of partition_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - multi_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / scan_dispatch_params.block_threads); - -#ifndef __CUDA_ARCH__ - // We're on the host, so come up with a - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - multi_sm_occupancy, - partition_kernel, - scan_dispatch_params.block_threads))) break; -#endif - // Get device occupancy for partition_kernel - int scan_occupancy = multi_sm_occupancy * sm_count; - - // Get grid size for partition_kernel - scan_grid_size = (num_tiles < scan_occupancy) ? 
- num_tiles : // Not enough to fill the device with threadblocks - scan_occupancy; // Fill the device with threadblocks - } - - // Log partition_kernel configuration - if (stream_synchronous) CubLog("Invoking partition_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy); - - // Invoke partition_kernel - partition_kernel<<>>( - d_in, - d_out, - d_partition_length, - d_tile_status, - pred_op, - num_items, - num_tiles, - queue); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - - /** - * Internal partition dispatch routine for using default tuning policies - */ - template < - typename PARTITIONS, ///< Number of partitions we are keeping - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename LengthOutputIterator, ///< Output iterator type for recording the length of the first partition (may be a simple pointer type) - typename PredicateOp, ///< Unary predicate operator indicating membership in the first partition type having member bool operator()(const T &val) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to input items - OutputIteratorRA d_out, ///< [in] Iterator pointing to output items - LengthOutputIterator d_partition_length, ///< [out] Output iterator referencing the location where the pivot offset (i.e., the length of the first partition) is to be recorded - PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition - SizeT num_items, ///< [in] Total number of items to partition - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
- { - // Data type - typedef typename std::iterator_traits::value_type T; - - // Tuning polices - typedef PtxDefaultPolicies PtxDefaultPolicies; // Wrapper of default kernel policies - typedef typename PtxDefaultPolicies::PartitionPolicy PartitionPolicy; // Partition kernel policy - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams scan_dispatch_params; - - int ptx_version; -#ifdef __CUDA_ARCH__ - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - scan_dispatch_params.Init(); - ptx_version = CUB_PTX_ARCH; -#else - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params); -#endif - - Dispatch( - ptx_version, - d_temp_storage, - temp_storage_bytes, - ScanInitKernel, - PartitionKernel, - scan_dispatch_params, - d_in, - d_out, - d_partition_length, - pred_op, - num_items, - stream, - stream_synchronous); - - if (CubDebug(error)) break; - } - while (0); - - return error; - } - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - - /** - * \brief Splits a list of input items into two partitions within the given output list using the specified predicate. The relative ordering of inputs is not necessarily preserved. - * - * An item \p val is placed in the first partition if pred_op(val) == true, otherwise - * it is placed in the second partition. The offset of the partitioning pivot (equivalent to - * the total length of the first partition as well as the starting offset of the second), is - * recorded to \p d_partition_length. - * - * The length of the output referenced by \p d_out is assumed to be the same as that of \p d_in. - * - * \devicestorage - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - * \tparam LengthOutputIterator [inferred] Random-access iterator type for output (may be a simple pointer type) - * \tparam PredicateOp [inferred] Unary predicate operator indicating membership in the first partition type having member bool operator()(const T &val) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA, - typename LengthOutputIterator, - typename PredicateOp> - __host__ __device__ __forceinline__ - static cudaError_t Partition( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to input items - OutputIteratorRA d_out, ///< [in] Iterator pointing to output items - LengthOutputIterator d_pivot_offset, ///< [out] Output iterator referencing the location where the pivot offset is to be recorded - PredicateOp pred_op, ///< [in] Unary predicate operator indicating membership in the first partition - int num_items, ///< [in] Total number of items to partition - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
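-     // Illustrative two-phase call sequence for the temp-storage convention above (a sketch
-     // only: d_in, d_out, d_pivot_offset, pred, and num_items are assumed to be prepared by
-     // the caller, and error checking is omitted):
-     //
-     //   void   *d_temp_storage     = NULL;
-     //   size_t  temp_storage_bytes = 0;
-     //   // First call with d_temp_storage == NULL only reports the required allocation size
-     //   cub::DevicePartition::Partition(d_temp_storage, temp_storage_bytes, d_in, d_out, d_pivot_offset, pred, num_items);
-     //   cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     //   // Second call performs the partition and records the pivot offset
-     //   cub::DevicePartition::Partition(d_temp_storage, temp_storage_bytes, d_in, d_out, d_pivot_offset, pred, num_items);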
- { - typedef typename std::iterator_traits::value_type T; - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), T(), num_items, stream, stream_synchronous); - } - - -}; - - -/** @} */ // DeviceModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/device/device_scan.cuh b/kokkos/kokkos/TPL/cub/device/device_scan.cuh deleted file mode 100644 index c0640c8..0000000 --- a/kokkos/kokkos/TPL/cub/device/device_scan.cuh +++ /dev/null @@ -1,812 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. 
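- *
- * For example, an exclusive prefix sum over the input sequence [8, 6, 7, 5, 3]
- * produces [0, 8, 14, 21, 26], whereas an inclusive prefix sum produces
- * [8, 14, 21, 26, 29].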
- */ - -#pragma once - -#include -#include - -#include "block/block_scan_tiles.cuh" -#include "../thread/thread_operators.cuh" -#include "../grid/grid_queue.cuh" -#include "../util_debug.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename T, ///< Scan value type - typename SizeT> ///< Integer type used for global array indexing -__global__ void ScanInitKernel( - GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks - ScanTileDescriptor *d_tile_status, ///< [out] Tile status words - int num_tiles) ///< [in] Number of tiles -{ - typedef ScanTileDescriptor ScanTileDescriptorT; - - enum - { - TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, - }; - - // Reset queue descriptor - if ((blockIdx.x == 0) && (threadIdx.x == 0)) grid_queue.ResetDrain(num_tiles); - - // Initialize tile status - int tile_offset = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_offset < num_tiles) - { - // Not-yet-set - d_tile_status[TILE_STATUS_PADDING + tile_offset].status = SCAN_TILE_INVALID; - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - d_tile_status[threadIdx.x].status = SCAN_TILE_OOB; - } -} - - -/** - * Scan kernel entry point (multi-block) - */ -template < - typename BlockScanTilesPolicy, ///< Tuning policy for cub::BlockScanTiles abstraction - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename T, ///< The scan data type - typename ScanOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing -__launch_bounds__ (int(BlockScanTilesPolicy::BLOCK_THREADS)) -__global__ void ScanKernel( - InputIteratorRA d_in, ///< Input data - OutputIteratorRA d_out, ///< Output data - ScanTileDescriptor *d_tile_status, ///< Global list of tile status - ScanOp scan_op, ///< Binary scan operator - Identity identity, ///< Identity element - SizeT num_items, ///< Total number of scan items for the entire problem - GridQueue queue) ///< Descriptor for performing dynamic mapping of tile data to thread blocks -{ - enum - { - TILE_STATUS_PADDING = PtxArchProps::WARP_THREADS, - }; - - // Thread block type for scanning input tiles - typedef BlockScanTiles< - BlockScanTilesPolicy, - InputIteratorRA, - OutputIteratorRA, - ScanOp, - Identity, - SizeT> BlockScanTilesT; - - // Shared memory for BlockScanTiles - __shared__ typename BlockScanTilesT::TempStorage temp_storage; - - // Process tiles - BlockScanTilesT(temp_storage, d_in, d_out, scan_op, identity).ConsumeTiles( - num_items, - queue, - d_tile_status + TILE_STATUS_PADDING); -} - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * DeviceScan - *****************************************************************************/ - -/** - * 
\brief DeviceScan provides operations for computing a device-wide, parallel prefix scan across data items residing within global memory. ![](device_scan.png) - * \ingroup DeviceModule - * - * \par Overview - * Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - * \par Usage Considerations - * \cdp_class{DeviceScan} - * - * \par Performance - * - * \image html scan_perf.png - * - */ -struct DeviceScan -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - /// Generic structure for encapsulating dispatch properties. Mirrors the constants within BlockScanTilesPolicy. - struct KernelDispachParams - { - // Policy fields - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - BlockStoreAlgorithm store_policy; - BlockScanAlgorithm scan_algorithm; - - // Other misc - int tile_size; - - template - __host__ __device__ __forceinline__ - void Init() - { - block_threads = BlockScanTilesPolicy::BLOCK_THREADS; - items_per_thread = BlockScanTilesPolicy::ITEMS_PER_THREAD; - load_policy = BlockScanTilesPolicy::LOAD_ALGORITHM; - store_policy = BlockScanTilesPolicy::STORE_ALGORITHM; - scan_algorithm = BlockScanTilesPolicy::SCAN_ALGORITHM; - - tile_size = block_threads * items_per_thread; - } - - __host__ __device__ __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - store_policy, - scan_algorithm); - } - - }; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - - /// Specializations of tuned policy types for different PTX architectures - template < - typename T, - typename SizeT, - int ARCH> - struct TunedPolicies; - - /// SM35 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 16, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // ScanPolicy: GTX Titan: 29.1B items/s (232.4 GB/s) @ 48M 32-bit T - typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_DIRECT, false, LOAD_LDG, BLOCK_STORE_WARP_TRANSPOSE, true, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - }; - - /// SM30 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockScanTilesPolicy<256, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - }; - - /// SM20 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // ScanPolicy: GTX 580: 
20.3B items/s (162.3 GB/s) @ 48M 32-bit T - typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - }; - - /// SM10 tune - template - struct TunedPolicies - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - typedef BlockScanTilesPolicy<128, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE, false, LOAD_DEFAULT, BLOCK_STORE_TRANSPOSE, false, BLOCK_SCAN_RAKING> ScanPolicy; - }; - - - /// Tuning policy for the PTX architecture that DeviceScan operations will get dispatched to - template - struct PtxDefaultPolicies - { - static const int PTX_TUNE_ARCH = (CUB_PTX_ARCH >= 350) ? - 350 : - (CUB_PTX_ARCH >= 300) ? - 300 : - (CUB_PTX_ARCH >= 200) ? - 200 : - 100; - - // Tuned policy set for the current PTX compiler pass - typedef TunedPolicies PtxTunedPolicies; - - // ScanPolicy that opaquely derives from the specialization corresponding to the current PTX compiler pass - struct ScanPolicy : PtxTunedPolicies::ScanPolicy {}; - - /** - * Initialize dispatch params with the policies corresponding to the PTX assembly we will use - */ - static void InitDispatchParams(int ptx_version, KernelDispachParams &scan_dispatch_params) - { - if (ptx_version >= 350) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else if (ptx_version >= 300) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else if (ptx_version >= 200) - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - else - { - typedef TunedPolicies TunedPolicies; - scan_dispatch_params.Init(); - } - } - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal dispatch routine - */ - template < - typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel - typename ScanKernelPtr, ///< Function type of cub::ScanKernel - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename ScanOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - int ptx_version, ///< [in] PTX version - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. 
- ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel - ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanKernel - KernelDispachParams &scan_dispatch_params, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ScanOp scan_op, ///< [in] Binary scan operator - Identity identity, ///< [in] Identity element - SizeT num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - enum - { - TILE_STATUS_PADDING = 32, - INIT_KERNEL_THREADS = 128 - }; - - // Data type - typedef typename std::iterator_traits::value_type T; - - // Tile status descriptor type - typedef ScanTileDescriptor ScanTileDescriptorT; - - cudaError error = cudaSuccess; - do - { - // Number of input tiles - int num_tiles = (num_items + scan_dispatch_params.tile_size - 1) / scan_dispatch_params.tile_size; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - (num_tiles + TILE_STATUS_PADDING) * sizeof(ScanTileDescriptorT), // bytes needed for tile status descriptors - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias temporaries (or set the necessary size of the storage allocation) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Global list of tile status - ScanTileDescriptorT *d_tile_status = (ScanTileDescriptorT*) allocations[0]; - - // Grid queue descriptor - GridQueue queue(allocations[1]); - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (stream_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - d_tile_status, - num_tiles); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get grid size for multi-block kernel - int scan_grid_size; - int multi_sm_occupancy = -1; - if (ptx_version < 200) - { - // We don't have atomics (or don't have fast ones), so just assign one - // block per tile (limited to 65K tiles) - scan_grid_size = num_tiles; - } - else - { - // We have atomics and can thus reuse blocks across multiple tiles using a queue descriptor. 
- // Get GPU id - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get a rough estimate of scan_kernel SM occupancy based upon the maximum SM occupancy of the targeted PTX architecture - multi_sm_occupancy = CUB_MIN( - ArchProps::MAX_SM_THREADBLOCKS, - ArchProps::MAX_SM_THREADS / scan_dispatch_params.block_threads); - -#ifndef __CUDA_ARCH__ - // We're on the host, so come up with a - Device device_props; - if (CubDebug(error = device_props.Init(device_ordinal))) break; - - if (CubDebug(error = device_props.MaxSmOccupancy( - multi_sm_occupancy, - scan_kernel, - scan_dispatch_params.block_threads))) break; -#endif - // Get device occupancy for scan_kernel - int scan_occupancy = multi_sm_occupancy * sm_count; - - // Get grid size for scan_kernel - scan_grid_size = (num_tiles < scan_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - scan_occupancy; // Fill the device with threadblocks - } - - // Log scan_kernel configuration - if (stream_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size, scan_dispatch_params.block_threads, (long long) stream, scan_dispatch_params.items_per_thread, multi_sm_occupancy); - - // Invoke scan_kernel - scan_kernel<<>>( - d_in, - d_out, - d_tile_status, - scan_op, - identity, - num_items, - queue); - - // Sync the stream if specified - if (stream_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - - /** - * Internal scan dispatch routine for using default tuning policies - */ - template < - typename InputIteratorRA, ///< Random-access iterator type for input (may be a simple pointer type) - typename OutputIteratorRA, ///< Random-access iterator type for output (may be a simple pointer type) - typename ScanOp, ///< Binary scan operator type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename SizeT> ///< Integer type used for global array indexing - __host__ __device__ __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ScanOp scan_op, ///< [in] Binary scan operator - Identity identity, ///< [in] Identity element - SizeT num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. 
- { - // Data type - typedef typename std::iterator_traits::value_type T; - - // Tuning polices - typedef PtxDefaultPolicies PtxDefaultPolicies; // Wrapper of default kernel policies - typedef typename PtxDefaultPolicies::ScanPolicy ScanPolicy; // Scan kernel policy - - cudaError error = cudaSuccess; - do - { - // Declare dispatch parameters - KernelDispachParams scan_dispatch_params; - - int ptx_version; -#ifdef __CUDA_ARCH__ - // We're on the device, so initialize the dispatch parameters with the PtxDefaultPolicies directly - scan_dispatch_params.Init(); - ptx_version = CUB_PTX_ARCH; -#else - // We're on the host, so lookup and initialize the dispatch parameters with the policies that match the device's PTX version - if (CubDebug(error = PtxVersion(ptx_version))) break; - PtxDefaultPolicies::InitDispatchParams(ptx_version, scan_dispatch_params); -#endif - - Dispatch( - ptx_version, - d_temp_storage, - temp_storage_bytes, - ScanInitKernel, - ScanKernel, - scan_dispatch_params, - d_in, - d_out, - scan_op, - identity, - num_items, - stream, - stream_synchronous); - - if (CubDebug(error)) break; - } - while (0); - - return error; - } - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * \brief Computes a device-wide exclusive prefix sum. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the exclusive prefix sum of a device vector of \p int items. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input and output - * int *d_scan_input, *d_scan_output; - * int num_items = ... - * - * ... - * - * // Determine temporary device storage requirements for exclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); - * - * // Allocate temporary storage for exclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix sum - * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA> - __host__ __device__ __forceinline__ - static cudaError_t ExclusiveSum( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - int num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - typedef typename std::iterator_traits::value_type T; - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), T(), num_items, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * Supports non-commutative scan operators. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the exclusive prefix scan of a device vector of \p int items. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input and output - * int *d_scan_input, *d_scan_output; - * int num_items = ... - * - * ... - * - * // Determine temporary device storage requirements for exclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) MIN_INT, num_items); - * - * // Allocate temporary storage for exclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run exclusive prefix scan (max) - * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), (int) MIN_INT, num_items); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam Identity [inferred] Type of the \p identity value used Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA, - typename ScanOp, - typename Identity> - __host__ __device__ __forceinline__ - static cudaError_t ExclusiveScan( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ScanOp scan_op, ///< [in] Binary scan operator - Identity identity, ///< [in] Identity element - int num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, identity, num_items, stream, stream_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes a device-wide inclusive prefix sum. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the inclusive prefix sum of a device vector of \p int items. - * \par - * \code - * #include - * ... 
- * - * // Declare and initialize device pointers for input and output - * int *d_scan_input, *d_scan_output; - * int num_items = ... - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, num_items); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA> - __host__ __device__ __forceinline__ - static cudaError_t InclusiveSum( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - int num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, Sum(), NullType(), num_items, stream, stream_synchronous); - } - - - /** - * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * Supports non-commutative scan operators. - * - * \devicestorage - * - * \cdp - * - * \iterator - * - * \par - * The code snippet below illustrates the inclusive prefix scan of a device vector of \p int items. - * \par - * \code - * #include - * ... - * - * // Declare and initialize device pointers for input and output - * int *d_scan_input, *d_scan_output; - * int num_items = ... - * ... 
- * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix scan (max) - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_scan_input, d_scan_output, cub::Max(), num_items); - * - * \endcode - * - * \tparam InputIteratorRA [inferred] Random-access iterator type for input (may be a simple pointer type) - * \tparam OutputIteratorRA [inferred] Random-access iterator type for output (may be a simple pointer type) - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorRA, - typename OutputIteratorRA, - typename ScanOp> - __host__ __device__ __forceinline__ - static cudaError_t InclusiveScan( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation. - InputIteratorRA d_in, ///< [in] Iterator pointing to scan input - OutputIteratorRA d_out, ///< [in] Iterator pointing to scan output - ScanOp scan_op, ///< [in] Binary scan operator - int num_items, ///< [in] Total number of items to scan - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool stream_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - return Dispatch(d_temp_storage, temp_storage_bytes, d_in, d_out, scan_op, NullType(), num_items, stream, stream_synchronous); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/grid/grid_barrier.cuh b/kokkos/kokkos/TPL/cub/grid/grid_barrier.cuh deleted file mode 100644 index ebdc4b5..0000000 --- a/kokkos/kokkos/TPL/cub/grid/grid_barrier.cuh +++ /dev/null @@ -1,211 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ - -#pragma once - -#include "../util_debug.cuh" -#include "../util_namespace.cuh" -#include "../thread/thread_load.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ -class GridBarrier -{ -protected : - - typedef unsigned int SyncFlag; - - // Counters in global device memory - SyncFlag* d_sync; - -public: - - /** - * Constructor - */ - GridBarrier() : d_sync(NULL) {} - - - /** - * Synchronize - */ - __device__ __forceinline__ void Sync() const - { - volatile SyncFlag *d_vol_sync = d_sync; - - // Threadfence and syncthreads to make sure global writes are visible before - // thread-0 reports in with its sync counter - __threadfence(); - __syncthreads(); - - if (blockIdx.x == 0) - { - // Report in ourselves - if (threadIdx.x == 0) - { - d_vol_sync[blockIdx.x] = 1; - } - - __syncthreads(); - - // Wait for everyone else to report in - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - while (ThreadLoad(d_sync + peer_block) == 0) - { - __threadfence_block(); - } - } - - __syncthreads(); - - // Let everyone know it's safe to proceed - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - d_vol_sync[peer_block] = 0; - } - } - else - { - if (threadIdx.x == 0) - { - // Report in - d_vol_sync[blockIdx.x] = 1; - - // Wait for acknowledgment - while (ThreadLoad(d_sync + blockIdx.x) == 1) - { - __threadfence_block(); - } - } - - __syncthreads(); - } - } -}; - - -/** - * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. - * - * Uses RAII for lifetime, i.e., device resources are reclaimed when - * the destructor is called. 
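- *
- * \par
- * A rough host-side usage sketch (illustrative only: \p MyKernel, \p grid_size,
- * \p block_threads, and \p d_data are placeholders, and \p MyKernel is assumed to
- * accept the barrier by value as a GridBarrier and call Sync() within it):
- * \par
- * \code
- * GridBarrierLifetime global_barrier;
- * global_barrier.Setup(grid_size);   // lazily allocates and zeroes one SyncFlag per threadblock
- * MyKernel<<<grid_size, block_threads>>>(global_barrier, d_data);
- * // device storage is reclaimed when global_barrier goes out of scope (or via HostReset())
- * \endcode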
- */ -class GridBarrierLifetime : public GridBarrier -{ -protected: - - // Number of bytes backed by d_sync - size_t sync_bytes; - -public: - - /** - * Constructor - */ - GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} - - - /** - * DeviceFrees and resets the progress counters - */ - cudaError_t HostReset() - { - cudaError_t retval = cudaSuccess; - if (d_sync) - { - CubDebug(retval = cudaFree(d_sync)); - d_sync = NULL; - } - sync_bytes = 0; - return retval; - } - - - /** - * Destructor - */ - virtual ~GridBarrierLifetime() - { - HostReset(); - } - - - /** - * Sets up the progress counters for the next kernel launch (lazily - * allocating and initializing them if necessary) - */ - cudaError_t Setup(int sweep_grid_size) - { - cudaError_t retval = cudaSuccess; - do { - size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); - if (new_sync_bytes > sync_bytes) - { - if (d_sync) - { - if (CubDebug(retval = cudaFree(d_sync))) break; - } - - sync_bytes = new_sync_bytes; - - // Allocate and initialize to zero - if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; - if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; - } - } while (0); - - return retval; - } -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/grid/grid_even_share.cuh b/kokkos/kokkos/TPL/cub/grid/grid_even_share.cuh deleted file mode 100644 index defe9e0..0000000 --- a/kokkos/kokkos/TPL/cub/grid/grid_even_share.cuh +++ /dev/null @@ -1,197 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. 
Each threadblock gets roughly the same number of fixed-size work units (grains). - */ - - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). - * - * \par Overview - * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. - * Threadblocks may receive one of three different amounts of work: "big", "normal", - * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit - * for the last threadblock may be partially-full if the input is not an even multiple of - * the scheduling grain size. - * - * \par - * Before invoking a child grid, a parent thread will typically construct and initialize an instance of - * GridEvenShare using \p GridInit(). The instance can be passed to child threadblocks which can - * initialize their per-threadblock offsets using \p BlockInit(). - * - * \tparam SizeT Integer type for array indexing - */ -template -class GridEvenShare -{ -private: - - SizeT total_grains; - int big_blocks; - SizeT big_share; - SizeT normal_share; - SizeT normal_base_offset; - - -public: - - /// Total number of input items - SizeT num_items; - - /// Grid size in threadblocks - int grid_size; - - /// Offset into input marking the beginning of the owning thread block's segment of input tiles - SizeT block_offset; - - /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles - SizeT block_oob; - - /** - * \brief Block-based constructor for single-block grids. - */ - __device__ __forceinline__ GridEvenShare(SizeT num_items) : - num_items(num_items), - grid_size(1), - block_offset(0), - block_oob(num_items) {} - - - /** - * \brief Default constructor. Zero-initializes block-specific fields. - */ - __host__ __device__ __forceinline__ GridEvenShare() : - num_items(0), - grid_size(0), - block_offset(0), - block_oob(0) {} - - - /** - * \brief Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) - */ - __host__ __device__ __forceinline__ void GridInit( - SizeT num_items, ///< Total number of input items - int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) - int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. Usually the thread block's native tile size (or a multiple thereof. 
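-     // Worked example (for illustration only): num_items = 1000, schedule_granularity = 128,
-     // and max_grid_size = 3 give
-     //   total_grains = ceil(1000/128) = 8,  grid_size = min(8,3) = 3,  grains_per_block = 2,
-     //   big_blocks = 8 - 2*3 = 2,  normal_share = 256,  big_share = 384,  normal_base_offset = 256.
-     // BlockInit() then assigns [0,384) and [384,768) to the two "big" blocks and [768,1000)
-     // to the last ("normal") block, whose end is clamped to num_items.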
- { - this->num_items = num_items; - this->block_offset = 0; - this->block_oob = 0; - this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; - this->grid_size = CUB_MIN(total_grains, max_grid_size); - SizeT grains_per_block = total_grains / grid_size; - this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks - this->normal_share = grains_per_block * schedule_granularity; - this->normal_base_offset = big_blocks * schedule_granularity; - this->big_share = normal_share + schedule_granularity; - } - - - /** - * \brief Initializes the threadblock-specific details (e.g., to be called by each threadblock after startup) - */ - __device__ __forceinline__ void BlockInit() - { - if (blockIdx.x < big_blocks) - { - // This threadblock gets a big share of grains (grains_per_block + 1) - block_offset = (blockIdx.x * big_share); - block_oob = block_offset + big_share; - } - else if (blockIdx.x < total_grains) - { - // This threadblock gets a normal share of grains (grains_per_block) - block_offset = normal_base_offset + (blockIdx.x * normal_share); - block_oob = block_offset + normal_share; - } - - // Last threadblock - if (blockIdx.x == grid_size - 1) - { - block_oob = num_items; - } - } - - - /** - * Print to stdout - */ - __host__ __device__ __forceinline__ void Print() - { - printf( -#ifdef __CUDA_ARCH__ - "\tthreadblock(%d) " - "block_offset(%lu) " - "block_oob(%lu) " -#endif - "num_items(%lu) " - "total_grains(%lu) " - "big_blocks(%lu) " - "big_share(%lu) " - "normal_share(%lu)\n", -#ifdef __CUDA_ARCH__ - blockIdx.x, - (unsigned long) block_offset, - (unsigned long) block_oob, -#endif - (unsigned long) num_items, - (unsigned long) total_grains, - (unsigned long) big_blocks, - (unsigned long) big_share, - (unsigned long) normal_share); - } -}; - - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/grid/grid_mapping.cuh b/kokkos/kokkos/TPL/cub/grid/grid_mapping.cuh deleted file mode 100644 index 419f9ac..0000000 --- a/kokkos/kokkos/TPL/cub/grid/grid_mapping.cuh +++ /dev/null @@ -1,95 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ - -#pragma once - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/****************************************************************************** - * Mapping policies - *****************************************************************************/ - - -/** - * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ -enum GridMappingStrategy -{ - /** - * \brief An "even-share" strategy for assigning input tiles to thread blocks. - * - * \par Overview - * The input is evenly partitioned into \p p segments, where \p p is - * constant and corresponds loosely to the number of thread blocks that may - * actively reside on the target device. Each segment is comprised of - * consecutive tiles, where a tile is a small, constant-sized unit of input - * to be processed to completion before the thread block terminates or - * obtains more work. The kernel invokes \p p thread blocks, each - * of which iteratively consumes a segment of n/p elements - * in tile-size increments. - */ - GRID_MAPPING_EVEN_SHARE, - - /** - * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. - * - * \par Overview - * The input is treated as a queue to be dynamically consumed by a grid of - * thread blocks. Work is atomically dequeued in tiles, where a tile is a - * unit of input to be processed to completion before the thread block - * terminates or obtains more work. The grid size \p p is constant, - * loosely corresponding to the number of thread blocks that may actively - * reside on the target device. - */ - GRID_MAPPING_DYNAMIC, -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/grid/grid_queue.cuh b/kokkos/kokkos/TPL/cub/grid/grid_queue.cuh deleted file mode 100644 index 009260d..0000000 --- a/kokkos/kokkos/TPL/cub/grid/grid_queue.cuh +++ /dev/null @@ -1,207 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridQueue is a descriptor utility for dynamic queue management. - */ - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_debug.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridQueue is a descriptor utility for dynamic queue management. - * - * \par Overview - * GridQueue descriptors provides abstractions for "filling" or - * "draining" globally-shared vectors. - * - * \par - * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, - * returning a unique offset for the calling thread to write its items. - * The GridQueue maintains the total "fill-size". The fill counter must be reset - * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that - * will be filling. - * - * \par - * Similarly a "draining" GridQueue works by works by atomically-incrementing a - * zero-initialized counter, returning a unique offset for the calling thread to - * read its items. Threads can safely drain until the array's logical fill-size is - * exceeded. The drain counter must be reset using GridQueue::ResetDrain or - * GridQueue::ResetDrainAfterFill by the host or kernel instance prior to the kernel instance that - * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size - * is simply the number of elements in the array.) - * - * \par - * Iterative work management can be implemented simply with a pair of flip-flopping - * work buffers, each with an associated set of fill and drain GridQueue descriptors. 
- * - * \tparam SizeT Integer type for array indexing - */ -template -class GridQueue -{ -private: - - /// Counter indices - enum - { - FILL = 0, - DRAIN = 1, - }; - - /// Pair of counters - SizeT *d_counters; - -public: - - /// Returns the device allocation size in bytes needed to construct a GridQueue instance - __host__ __device__ __forceinline__ - static size_t AllocationSize() - { - return sizeof(SizeT) * 2; - } - - - /// Constructs an invalid GridQueue descriptor around the device storage allocation - __host__ __device__ __forceinline__ GridQueue( - void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). - : - d_counters((SizeT*) d_storage) - {} - - - /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t ResetDrainAfterFill(cudaStream_t stream = 0) - { -#ifdef __CUDA_ARCH__ - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - return ResetDrain(0, stream); -#endif - } - - /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t ResetDrain( - SizeT fill_size, - cudaStream_t stream = 0) - { -#ifdef __CUDA_ARCH__ - d_counters[FILL] = fill_size; - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - SizeT counters[2]; - counters[FILL] = fill_size; - counters[DRAIN] = 0; - return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(SizeT) * 2, cudaMemcpyHostToDevice, stream)); -#endif - } - - - /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. - __host__ __device__ __forceinline__ cudaError_t ResetFill() - { -#ifdef __CUDA_ARCH__ - d_counters[FILL] = 0; - return cudaSuccess; -#else - return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(SizeT))); -#endif - } - - - /// Returns the fill-size established by the parent or by the previous kernel. - __host__ __device__ __forceinline__ cudaError_t FillSize( - SizeT &fill_size, - cudaStream_t stream = 0) - { -#ifdef __CUDA_ARCH__ - fill_size = d_counters[FILL]; -#else - return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(SizeT), cudaMemcpyDeviceToHost, stream)); -#endif - } - - - /// Drain num_items. Returns offset from which to read items. - __device__ __forceinline__ SizeT Drain(SizeT num_items) - { - return atomicAdd(d_counters + DRAIN, num_items); - } - - - /// Fill num_items. Returns offset from which to write items. 
- __device__ __forceinline__ SizeT Fill(SizeT num_items) - { - return atomicAdd(d_counters + FILL, num_items); - } -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Reset grid queue (call with 1 block of 1 thread) - */ -template -__global__ void ResetDrainKernel( - GridQueue grid_queue, - SizeT num_items) -{ - grid_queue.ResetDrain(num_items); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/kokkos/kokkos/TPL/cub/host/spinlock.cuh b/kokkos/kokkos/TPL/cub/host/spinlock.cuh deleted file mode 100644 index 5621b6f..0000000 --- a/kokkos/kokkos/TPL/cub/host/spinlock.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
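A minimal sketch of the drain pattern described above (the GRID_MAPPING_DYNAMIC case): the host resets the drain counter to cover the existing data, and each thread block then atomically dequeues tile-sized chunks until the fill-size is exceeded. The tile size of 256 and the function names are illustrative.

// Device side: blocks repeatedly dequeue tiles until the queue is exhausted.
template <int TILE_ITEMS>
__global__ void DrainKernel(cub::GridQueue<int> queue, int num_items, const int *d_in, int *d_out)
{
    __shared__ int tile_offset;
    while (true)
    {
        if (threadIdx.x == 0)
            tile_offset = queue.Drain(TILE_ITEMS);   // unique offset for this block's next tile
        __syncthreads();

        if (tile_offset >= num_items)
            break;                                   // fill-size exceeded: no more work

        // ... cooperatively process items in [tile_offset, min(tile_offset + TILE_ITEMS, num_items)) ...
        __syncthreads();                             // protect tile_offset before the next Drain()
    }
}

// Host side: back the queue with AllocationSize() bytes of device storage and
// reset the drain to the number of elements already present in d_in.
void LaunchDrain(void *d_queue_storage, int grid_size, int num_items, const int *d_in, int *d_out)
{
    cub::GridQueue<int> queue(d_queue_storage);      // storage >= GridQueue<int>::AllocationSize()
    queue.ResetDrain(num_items);
    DrainKernel<256><<<grid_size, 256>>>(queue, num_items, d_in, d_out);
}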
- * - ******************************************************************************/ - -/** - * \file - * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) - */ - - -#pragma once - -#if defined(_WIN32) || defined(_WIN64) - #include - #include - #undef small // Windows is terrible for polluting macro namespace - - /** - * Compiler read/write barrier - */ - #pragma intrinsic(_ReadWriteBarrier) - -#endif - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -#if defined(_MSC_VER) - - // Microsoft VC++ - typedef long Spinlock; - -#else - - // GNU g++ - typedef int Spinlock; - - /** - * Compiler read/write barrier - */ - __forceinline__ void _ReadWriteBarrier() - { - __sync_synchronize(); - } - - /** - * Atomic exchange - */ - __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) - { - // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier - _ReadWriteBarrier(); - return __sync_lock_test_and_set(Target, Value); - } - - /** - * Pause instruction to prevent excess processor bus usage - */ - __forceinline__ void YieldProcessor() - { -#ifndef __arm__ - asm volatile("pause\n": : :"memory"); -#endif // __arm__ - } - -#endif // defined(_MSC_VER) - -/** - * Return when the specified spinlock has been acquired - */ -__forceinline__ void Lock(volatile Spinlock *lock) -{ - while (1) - { - if (!_InterlockedExchange(lock, 1)) return; - while (*lock) YieldProcessor(); - } -} - - -/** - * Release the specified spinlock - */ -__forceinline__ void Unlock(volatile Spinlock *lock) -{ - _ReadWriteBarrier(); - *lock = 0; -} - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/kokkos/kokkos/TPL/cub/thread/thread_load.cuh b/kokkos/kokkos/TPL/cub/thread/thread_load.cuh deleted file mode 100644 index ee112b9..0000000 --- a/kokkos/kokkos/TPL/cub/thread/thread_load.cuh +++ /dev/null @@ -1,429 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
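A minimal host-side sketch of the Lock/Unlock pair defined above guarding shared state between host threads; the byte counter is illustrative.

// Illustrative only: serialize host-thread updates to a shared counter.
static cub::Spinlock g_lock = 0;
static long long g_total_bytes = 0;

void AddBytes(long long bytes)
{
    cub::Lock(&g_lock);        // spin (with YieldProcessor) until acquired
    g_total_bytes += bytes;    // critical section
    cub::Unlock(&g_lock);      // compiler barrier, then release
}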
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for reading memory using PTX cache modifiers. - */ - -#pragma once - -#include - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup IoModule - * @{ - */ - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of PTX cache-modifiers for memory load operations. - */ -enum PtxLoadModifier -{ - LOAD_DEFAULT, ///< Default (no modifier) - LOAD_CA, ///< Cache at all levels - LOAD_CG, ///< Cache at global level - LOAD_CS, ///< Cache streaming (likely to be accessed once) - LOAD_CV, ///< Cache as volatile (including cached system lines) - LOAD_LDG, ///< Cache as texture - LOAD_VOLATILE, ///< Volatile (any memory space) -}; - - -/** - * \name Simple I/O - * @{ - */ - -/** - * \brief Thread utility for reading memory using cub::PtxLoadModifier cache modifiers. - * - * Cache modifiers will only be effected for built-in types (i.e., C++ - * primitives and CUDA vector-types). 
- * - * For example: - * \par - * \code - * #include - * - * // 32-bit load using cache-global modifier: - * int *d_in; - * int val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 16-bit load using default modifier - * short *d_in; - * short val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 256-bit load using cache-volatile modifier - * double4 *d_in; - * double4 val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 96-bit load using default cache modifier (ignoring LOAD_CS) - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); - * \endcode - * - */ -template < - PtxLoadModifier MODIFIER, - typename InputIteratorRA> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorRA itr); - - -//@} end member group - - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Define a int4 (16B) ThreadLoad specialization for the given PTX load modifier - */ -#define CUB_LOAD_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ int4 ThreadLoad(int4* ptr) \ - { \ - int4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.s32 {%0, %1, %2, %3}, [%4];" : \ - "=r"(retval.x), \ - "=r"(retval.y), \ - "=r"(retval.z), \ - "=r"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ longlong2 ThreadLoad(longlong2* ptr) \ - { \ - longlong2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.s64 {%0, %1}, [%2];" : \ - "=l"(retval.x), \ - "=l"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a int2 (8B) ThreadLoad specialization for the given PTX load modifier - */ -#define CUB_LOAD_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ short4 ThreadLoad(short4* ptr) \ - { \ - short4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.s16 {%0, %1, %2, %3}, [%4];" : \ - "=h"(retval.x), \ - "=h"(retval.y), \ - "=h"(retval.z), \ - "=h"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ int2 ThreadLoad(int2* ptr) \ - { \ - int2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.s32 {%0, %1}, [%2];" : \ - "=r"(retval.x), \ - "=r"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ long long ThreadLoad(long long* ptr) \ - { \ - long long retval; \ - asm volatile ("ld."#ptx_modifier".s64 %0, [%1];" : \ - "=l"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a int (4B) ThreadLoad specialization for the given PTX load modifier - */ -#define CUB_LOAD_4(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ int ThreadLoad(int* ptr) \ - { \ - int retval; \ - asm volatile ("ld."#ptx_modifier".s32 %0, [%1];" : \ - "=r"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define a short (2B) ThreadLoad specialization for the given PTX load modifier - */ -#define CUB_LOAD_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ short ThreadLoad(short* ptr) \ - { \ - short retval; \ - asm volatile ("ld."#ptx_modifier".s16 %0, [%1];" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define a char (1B) ThreadLoad specialization for the given PTX load modifier - */ -#define CUB_LOAD_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ char ThreadLoad(char* ptr) \ - { \ - short retval; \ - asm volatile ( \ - "{" \ - " .reg .s8 datum;" \ - " 
ld."#ptx_modifier".s8 datum, [%1];" \ - " cvt.s16.s8 %0, datum;" \ - "}" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return (char) retval; \ - } - - -/** - * Define powers-of-two ThreadLoad specializations for the given PTX load modifier - */ -#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ - CUB_LOAD_16(cub_modifier, ptx_modifier) \ - CUB_LOAD_8(cub_modifier, ptx_modifier) \ - CUB_LOAD_4(cub_modifier, ptx_modifier) \ - CUB_LOAD_2(cub_modifier, ptx_modifier) \ - CUB_LOAD_1(cub_modifier, ptx_modifier) \ - - -/** - * Define ThreadLoad specializations for the various PTX load modifiers - */ -#if CUB_PTX_ARCH >= 200 - CUB_LOAD_ALL(LOAD_CA, ca) - CUB_LOAD_ALL(LOAD_CG, cg) - CUB_LOAD_ALL(LOAD_CS, cs) - CUB_LOAD_ALL(LOAD_CV, cv) -#else - // LOAD_CV on SM10-13 uses "volatile.global" to ensure reads from last level - CUB_LOAD_ALL(LOAD_CV, volatile.global) -#endif -#if CUB_PTX_ARCH >= 350 - CUB_LOAD_ALL(LOAD_LDG, global.nc) -#endif - - -/// Helper structure for templated load iteration (inductive case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T *ptr, T *vals) - { - vals[COUNT] = ThreadLoad(ptr + COUNT); - IterateThreadLoad::Load(ptr, vals); - } -}; - -/// Helper structure for templated load iteration (termination case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T *ptr, T *vals) {} -}; - - - -/** - * Load with LOAD_DEFAULT on iterator types - */ -template -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( - InputIteratorRA itr, - Int2Type modifier, - Int2Type is_pointer) -{ - return *itr; -} - - -/** - * Load with LOAD_DEFAULT on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type modifier, - Int2Type is_pointer) -{ - return *ptr; -} - - -/** - * Load with LOAD_VOLATILE on primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatile( - T *ptr, - Int2Type is_primitive) -{ - T retval = *reinterpret_cast(ptr); - -#if (CUB_PTX_ARCH <= 130) - if (sizeof(T) == 1) __threadfence_block(); -#endif - - return retval; -} - - -/** - * Load with LOAD_VOLATILE on non-primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatile( - T *ptr, - Int2Type is_primitive) -{ - typedef typename WordAlignment::VolatileWord VolatileWord; // Word type for memcopying - enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) }; - - // Memcopy from aliased source into array of uninitialized words - typename WordAlignment::UninitializedVolatileWords words; - - #pragma unroll - for (int i = 0; i < NUM_WORDS; ++i) - words.buf[i] = reinterpret_cast(ptr)[i]; - - // Load from words - return *reinterpret_cast(words.buf); -} - - -/** - * Load with LOAD_VOLATILE on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type modifier, - Int2Type is_pointer) -{ - return ThreadLoadVolatile(ptr, Int2Type::PRIMITIVE>()); -} - - -#if (CUB_PTX_ARCH <= 130) - -/** - * Load with LOAD_CG uses LOAD_CV in pre-SM20 PTX to ensure coherent reads when run on newer architectures with L1 - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type modifier, - Int2Type is_pointer) -{ - return ThreadLoad(ptr); -} - -#endif // (CUB_PTX_ARCH <= 130) - - -/** - * Load with arbitrary MODIFIER on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type modifier, - Int2Type is_pointer) -{ - typedef typename WordAlignment::DeviceWord DeviceWord; - 
enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) }; - - // Memcopy from aliased source into array of uninitialized words - typename WordAlignment::UninitializedDeviceWords words; - - IterateThreadLoad::Load( - reinterpret_cast(ptr), - words.buf); - - // Load from words - return *reinterpret_cast(words.buf); -} - - -/** - * Generic ThreadLoad definition - */ -template < - PtxLoadModifier MODIFIER, - typename InputIteratorRA> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorRA itr) -{ - return ThreadLoad( - itr, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group IoModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/thread/thread_operators.cuh b/kokkos/kokkos/TPL/cub/thread/thread_operators.cuh deleted file mode 100644 index bfb3d7c..0000000 --- a/kokkos/kokkos/TPL/cub/thread/thread_operators.cuh +++ /dev/null @@ -1,145 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
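A minimal sketch of the modifier-templated loads defined above, mirroring the cases listed in the doc comment (cache-global, cache-volatile, and default); the buffer names are illustrative.

#include <cub/cub.cuh>

// The first template argument selects the PTX cache modifier; LOAD_DEFAULT
// degenerates to a plain dereference, and non-primitive types ignore the modifier.
__global__ void LoadExamples(int *d_int, double4 *d_vec, short *d_short)
{
    int     a = cub::ThreadLoad<cub::LOAD_CG>(d_int + threadIdx.x);        // cache at global level
    double4 b = cub::ThreadLoad<cub::LOAD_CV>(d_vec + threadIdx.x);        // cache as volatile
    short   c = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_short + threadIdx.x); // plain load
    // ... use a, b, c ...
    (void)a; (void)b; (void)c;
}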
- * - ******************************************************************************/ - -/** - * \file - * Simple binary operator functor types - */ - -/****************************************************************************** - * Simple functor operators - ******************************************************************************/ - -#pragma once - -#include "../util_macro.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup ThreadModule - * @{ - */ - -/** - * \brief Default equality functor - */ -struct Equality -{ - /// Boolean equality operator, returns (a == b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return a == b; - } -}; - - -/** - * \brief Default inequality functor - */ -struct Inequality -{ - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return a != b; - } -}; - - -/** - * \brief Default sum functor - */ -struct Sum -{ - /// Boolean sum operator, returns a + b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) - { - return a + b; - } -}; - - -/** - * \brief Default max functor - */ -struct Max -{ - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) - { - return CUB_MAX(a, b); - } -}; - - -/** - * \brief Default min functor - */ -struct Min -{ - /// Boolean min operator, returns (a < b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) - { - return CUB_MIN(a, b); - } -}; - - -/** - * \brief Default cast functor - */ -template -struct Cast -{ - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ B operator()(const A &a) - { - return (B) a; - } -}; - - - -/** @} */ // end group ThreadModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/thread/thread_reduce.cuh b/kokkos/kokkos/TPL/cub/thread/thread_reduce.cuh deleted file mode 100644 index 374fd77..0000000 --- a/kokkos/kokkos/TPL/cub/thread/thread_reduce.cuh +++ /dev/null @@ -1,145 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
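Each functor above exposes a templated T operator()(const T&, const T&) (or a Boolean equivalent); a user-defined operator with the same shape can be passed to the reduction and scan utilities that follow. A sketch of such an operator, assuming nothing beyond that interface:

// Illustrative user-defined binary operator: keep the operand with larger magnitude.
struct MaxAbs
{
    template <typename T>
    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b)
    {
        T abs_a = (a < T(0)) ? -a : a;
        T abs_b = (b < T(0)) ? -b : b;
        return (abs_a > abs_b) ? a : b;
    }
};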
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential reduction over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup ThreadModule - * @{ - */ - -/** - * \name Sequential reduction over statically-sized array types - * @{ - */ - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH Length of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - prefix = reduction_op(prefix, input[i]); - } - - return prefix; -} - - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH Length of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - T prefix = input[0]; - return ThreadReduce(input + 1, reduction_op, prefix); -} - - -/** - * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] Length of \p input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - return ThreadReduce(input, reduction_op, prefix); -} - - -/** - * \brief Serial reduction with the specified operator - * - * \tparam LENGTH [inferred] Length of \p input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - return ThreadReduce((T*) input, reduction_op); -} - - -//@} end member group - -/** @} */ // end group ThreadModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/thread/thread_scan.cuh b/kokkos/kokkos/TPL/cub/thread/thread_scan.cuh deleted file mode 100644 index b43bbcf..0000000 --- a/kokkos/kokkos/TPL/cub/thread/thread_scan.cuh +++ /dev/null @@ -1,231 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential prefix scan over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup ThreadModule - * @{ - */ - -/** - * \name Sequential prefix scan over statically-sized array types - * @{ - */ - -/** - * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = prefix; - T exclusive = inclusive; - - #pragma unroll - for (int i = 1; i < LENGTH; ++i) - { - inclusive = scan_op(exclusive, input[i]); - output[i] = exclusive; - exclusive = inclusive; - } - - return inclusive; -} - - -/** - * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - T inclusive = input[0]; - output[0] = inclusive; - - // Continue scan - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - inclusive = scan_op(inclusive, input[i]); - output[i] = inclusive; - } - - return inclusive; -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. - * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = inclusive; - - // Continue scan - #pragma unroll - for (int i = 1; i < LENGTH; ++i) - { - inclusive = scan_op(inclusive, input[i]); - output[i] = inclusive; - } - - return inclusive; -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); -} - - -//@} end member group - -/** @} */ // end group ThreadModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/thread/thread_store.cuh b/kokkos/kokkos/TPL/cub/thread/thread_store.cuh deleted file mode 100644 index 8d39e07..0000000 --- a/kokkos/kokkos/TPL/cub/thread/thread_store.cuh +++ /dev/null @@ -1,412 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
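A minimal sketch combining the sequential per-thread utilities above: each thread keeps a small statically sized array in registers, reduces it, and scans it in place. The per-thread item count of 4 and the kernel name are illustrative.

__global__ void PerThreadReduceScan(const int *d_in, int *d_sums, int *d_scanned)
{
    const int ITEMS = 4;                                 // illustrative per-thread array length
    int tid  = blockIdx.x * blockDim.x + threadIdx.x;
    int base = tid * ITEMS;

    int items[ITEMS];
    #pragma unroll
    for (int i = 0; i < ITEMS; ++i)
        items[i] = d_in[base + i];

    // Sequential reduction over the statically-sized array (LENGTH is inferred)
    int thread_sum = cub::ThreadReduce(items, cub::Sum());

    // In-place sequential inclusive prefix scan over the same array
    cub::ThreadScanInclusive(items, items, cub::Sum());

    d_sums[tid] = thread_sum;
    #pragma unroll
    for (int i = 0; i < ITEMS; ++i)
        d_scanned[base + i] = items[i];
}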
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for writing memory using PTX cache modifiers. - */ - -#pragma once - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup IoModule - * @{ - */ - - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of PTX cache-modifiers for memory store operations. - */ -enum PtxStoreModifier -{ - STORE_DEFAULT, ///< Default (no modifier) - STORE_WB, ///< Cache write-back all coherent levels - STORE_CG, ///< Cache at global level - STORE_CS, ///< Cache streaming (likely to be accessed once) - STORE_WT, ///< Cache write-through (to system memory) - STORE_VOLATILE, ///< Volatile shared (any memory space) -}; - - -/** - * \name Simple I/O - * @{ - */ - -/** - * \brief Thread utility for writing memory using cub::PtxStoreModifier cache modifiers. - * - * Cache modifiers will only be effected for built-in types (i.e., C++ - * primitives and CUDA vector-types). 
- * - * For example: - * \par - * \code - * #include - * - * // 32-bit store using cache-global modifier: - * int *d_out; - * int val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 16-bit store using default modifier - * short *d_out; - * short val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 256-bit store using write-through modifier - * double4 *d_out; - * double4 val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 96-bit store using default cache modifier (ignoring STORE_CS) - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * \endcode - * - */ -template < - PtxStoreModifier MODIFIER, - typename OutputIteratorRA, - typename T> -__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val); - - -//@} end member group - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Define a int4 (16B) ThreadStore specialization for the given PTX load modifier - */ -#define CUB_STORE_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(int4* ptr, int4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.s32 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y), \ - "r"(val.z), \ - "r"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(longlong2* ptr, longlong2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.s64 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val.x), \ - "l"(val.y)); \ - } - - -/** - * Define a int2 (8B) ThreadStore specialization for the given PTX load modifier - */ -#define CUB_STORE_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(short4* ptr, short4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.s16 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val.x), \ - "h"(val.y), \ - "h"(val.z), \ - "h"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(int2* ptr, int2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.s32 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(long long* ptr, long long val) \ - { \ - asm volatile ("st."#ptx_modifier".s64 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val)); \ - } - -/** - * Define a int (4B) ThreadStore specialization for the given PTX load modifier - */ -#define CUB_STORE_4(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(int* ptr, int val) \ - { \ - asm volatile ("st."#ptx_modifier".s32 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val)); \ - } - - -/** - * Define a short (2B) ThreadStore specialization for the given PTX load modifier - */ -#define CUB_STORE_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(short* ptr, short val) \ - { \ - asm volatile ("st."#ptx_modifier".s16 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val)); \ - } - - -/** - * Define a char (1B) ThreadStore specialization for the given PTX load modifier - */ -#define CUB_STORE_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(char* ptr, char val) \ - { \ - asm volatile ( \ - "{" \ - " .reg .s8 datum;" \ - " cvt.s8.s16 datum, %1;" \ - " st."#ptx_modifier".s8 [%0], datum;" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(short(val))); \ - } - -/** - * Define powers-of-two ThreadStore specializations for the given PTX 
load modifier - */ -#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \ - CUB_STORE_16(cub_modifier, ptx_modifier) \ - CUB_STORE_8(cub_modifier, ptx_modifier) \ - CUB_STORE_4(cub_modifier, ptx_modifier) \ - CUB_STORE_2(cub_modifier, ptx_modifier) \ - CUB_STORE_1(cub_modifier, ptx_modifier) \ - - -/** - * Define ThreadStore specializations for the various PTX load modifiers - */ -#if CUB_PTX_ARCH >= 200 - CUB_STORE_ALL(STORE_WB, ca) - CUB_STORE_ALL(STORE_CG, cg) - CUB_STORE_ALL(STORE_CS, cs) - CUB_STORE_ALL(STORE_WT, cv) -#else - // STORE_WT on SM10-13 uses "volatile.global" to ensure writes to last level - CUB_STORE_ALL(STORE_WT, volatile.global) -#endif - - - -/// Helper structure for templated store iteration (inductive case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T *ptr, T *vals) - { - ThreadStore(ptr + COUNT, vals[COUNT]); - IterateThreadStore::Store(ptr, vals); - } -}; - -/// Helper structure for templated store iteration (termination case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T *ptr, T *vals) {} -}; - - - - -/** - * Store with STORE_DEFAULT on iterator types - */ -template -__device__ __forceinline__ void ThreadStore( - OutputIteratorRA itr, - T val, - Int2Type modifier, - Int2Type is_pointer) -{ - *itr = val; -} - - -/** - * Store with STORE_DEFAULT on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type modifier, - Int2Type is_pointer) -{ - *ptr = val; -} - - -/** - * Store with STORE_VOLATILE on primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatile( - T *ptr, - T val, - Int2Type is_primitive) -{ - *reinterpret_cast(ptr) = val; -} - - -/** - * Store with STORE_VOLATILE on non-primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatile( - T *ptr, - T val, - Int2Type is_primitive) -{ - typedef typename WordAlignment::VolatileWord VolatileWord; // Word type for memcopying - enum { NUM_WORDS = sizeof(T) / sizeof(VolatileWord) }; - - // Store into array of uninitialized words - typename WordAlignment::UninitializedVolatileWords words; - *reinterpret_cast(words.buf) = val; - - // Memcopy words to aliased destination - #pragma unroll - for (int i = 0; i < NUM_WORDS; ++i) - reinterpret_cast(ptr)[i] = words.buf[i]; -} - - -/** - * Store with STORE_VOLATILE on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type modifier, - Int2Type is_pointer) -{ - ThreadStoreVolatile(ptr, val, Int2Type::PRIMITIVE>()); -} - - -#if (CUB_PTX_ARCH <= 350) - -/** - * Store with STORE_CG on pointer types (uses STORE_DEFAULT on current architectures) - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type modifier, - Int2Type is_pointer) -{ - ThreadStore(ptr, val); -} - -#endif // (CUB_PTX_ARCH <= 350) - - -/** - * Store with arbitrary MODIFIER on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type modifier, - Int2Type is_pointer) -{ - typedef typename WordAlignment::DeviceWord DeviceWord; // Word type for memcopying - enum { NUM_WORDS = sizeof(T) / sizeof(DeviceWord) }; - - // Store into array of uninitialized words - typename WordAlignment::UninitializedDeviceWords words; - *reinterpret_cast(words.buf) = val; - - // Memcopy words to aliased destination - IterateThreadStore::Store( - reinterpret_cast(ptr), - words.buf); -} - - -/** - * 
Generic ThreadStore definition - */ -template -__device__ __forceinline__ void ThreadStore(OutputIteratorRA itr, T val) -{ - ThreadStore( - itr, - val, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group IoModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_allocator.cuh b/kokkos/kokkos/TPL/cub/util_allocator.cuh deleted file mode 100644 index ae40f33..0000000 --- a/kokkos/kokkos/TPL/cub/util_allocator.cuh +++ /dev/null @@ -1,661 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Simple caching allocator for device memory allocations. The allocator is - * thread-safe and capable of managing device allocations on multiple devices. - ******************************************************************************/ - -#pragma once - -#ifndef __CUDA_ARCH__ - #include // NVCC (EDG, really) takes FOREVER to compile std::map - #include -#endif - -#include - -#include "util_namespace.cuh" -#include "util_debug.cuh" - -#include "host/spinlock.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/****************************************************************************** - * CachingDeviceAllocator (host use) - ******************************************************************************/ - -/** - * \brief A simple caching allocator for device memory allocations. - * - * \par Overview - * The allocator is thread-safe and is capable of managing cached device allocations - * on multiple devices. It behaves as follows: - * - * \par - * - Allocations categorized by bin size. 
- * - Bin sizes progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are - * deallocated instead of being returned to their bin-cache. - * - * \par - * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes per device - * - */ -struct CachingDeviceAllocator -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - enum - { - /// Invalid device ordinal - INVALID_DEVICE_ORDINAL = -1, - }; - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow( - unsigned int base, - unsigned int exp) - { - unsigned int retval = 1; - while (exp > 0) - { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - - /** - * Round up to the nearest power-of - */ - static void NearestPowerOf( - unsigned int &power, - size_t &rounded_bytes, - unsigned int base, - size_t value) - { - power = 0; - rounded_bytes = 1; - - while (rounded_bytes < value) - { - rounded_bytes *= base; - power++; - } - } - - /** - * Descriptor for device memory allocations - */ - struct BlockDescriptor - { - int device; // device ordinal - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - - // Constructor - BlockDescriptor(void *d_ptr, int device) : - d_ptr(d_ptr), - bytes(0), - bin(0), - device(device) {} - - // Constructor - BlockDescriptor(size_t bytes, unsigned int bin, int device) : - d_ptr(NULL), - bytes(bytes), - bin(bin), - device(device) {} - - // Comparison functor for comparing device pointers - static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device < b.device) { - return true; - } else if (a.device > b.device) { - return false; - } else { - return (a.d_ptr < b.d_ptr); - } - } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device < b.device) { - return true; - } else if (a.device > b.device) { - return false; - } else { - return (a.bytes < b.bytes); - } - } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - -#ifndef __CUDA_ARCH__ // Only define STL container members in host code - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - 
/// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; - -#endif // __CUDA_ARCH__ - - //--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - Spinlock spin_lock; /// Spinlock for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device - - bool debug; /// Whether or not to print (de)allocation events to stdout - bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - -#ifndef __CUDA_ARCH__ // Only define STL container members in host code - - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use - -#endif // __CUDA_ARCH__ - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. - */ - CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin, ///< Minimum bin - unsigned int max_bin, ///< Maximum bin - size_t max_cached_bytes) ///< Maximum aggregate cached bytes per device - : - #ifndef __CUDA_ARCH__ // Only define STL container members in host code - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare), - #endif - debug(false), - spin_lock(0), - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes) - {} - - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes per device - */ - CachingDeviceAllocator(bool skip_cleanup = false) : - #ifndef __CUDA_ARCH__ // Only define STL container members in host code - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare), - #endif - skip_cleanup(skip_cleanup), - debug(false), - spin_lock(0), - bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1) - {} - - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
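A minimal host-side sketch of the allocator described above using the default bin configuration; error checking is omitted and the buffer size is illustrative.

// Sketch: default configuration (bin_growth = 8, min_bin = 3, max_bin = 7,
// max_cached_bytes = 6MB - 1B); one allocate/free round trip on the current device.
cub::CachingDeviceAllocator g_allocator;

void RoundTrip(size_t num_bytes)
{
    int device;
    cudaGetDevice(&device);

    void *d_buf = NULL;
    g_allocator.DeviceAllocate(&d_buf, num_bytes);   // rounded up to the nearest bin size
    // ... use d_buf from kernels or cudaMemcpy ...
    g_allocator.DeviceFree(d_buf, device);           // cached for reuse if under max_cached_bytes
}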
- */ - cudaError_t SetMaxCachedBytes( - size_t max_cached_bytes) - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - // Lock - Lock(&spin_lock); - - this->max_cached_bytes = max_cached_bytes; - - if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes); - - // Unlock - Unlock(&spin_lock); - - return cudaSuccess; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device - */ - cudaError_t DeviceAllocate( - void** d_ptr, - size_t bytes, - int device) - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - bool locked = false; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - // Round up to nearest bin size - unsigned int bin; - size_t bin_bytes; - NearestPowerOf(bin, bin_bytes, bin_growth, bytes); - if (bin < min_bin) { - bin = min_bin; - bin_bytes = min_bin_bytes; - } - - // Check if bin is greater than our maximum bin - if (bin > max_bin) - { - // Allocate the request exactly and give out-of-range bin - bin = (unsigned int) -1; - bin_bytes = bytes; - } - - BlockDescriptor search_key(bin_bytes, bin, device); - - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } - - do { - // Find a free block big enough within the same bin on the same device - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - if ((block_itr != cached_blocks.end()) && - (block_itr->device == device) && - (block_itr->bin == search_key.bin)) - { - // Reuse existing cache block. Insert into live blocks. - search_key = *block_itr; - live_blocks.insert(search_key); - - // Remove from free blocks - cached_blocks.erase(block_itr); - cached_bytes[device] -= search_key.bytes; - - if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - else - { - // Need to allocate a new cache block. Unlock. - if (locked) { - Unlock(&spin_lock); - locked = false; - } - - // Set to specified device - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - if (CubDebug(error = cudaSetDevice(device))) break; - - // Allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break; - - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } - - // Insert into live blocks - live_blocks.insert(search_key); - - if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - } while(0); - - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; - } - - // Copy device pointer to output parameter (NULL on error) - *d_ptr = search_key.d_ptr; - - // Attempt to revert back to previous device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device - */ - cudaError_t DeviceAllocate( - void** d_ptr, - size_t bytes) - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - cudaError_t error = cudaSuccess; - do { - int current_device; - if (CubDebug(error = cudaGetDevice(¤t_device))) break; - if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; - } while(0); - - return error; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator - */ - cudaError_t DeviceFree( - void* d_ptr, - int device) - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - bool locked = false; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - BlockDescriptor search_key(d_ptr, device); - - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } - - do { - // Find corresponding block descriptor - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr == live_blocks.end()) - { - // Cannot find pointer - if (CubDebug(error = cudaErrorUnknown)) break; - } - else - { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - - // Check if we should keep the returned allocation - if (cached_bytes[device] + search_key.bytes <= max_cached_bytes) - { - // Insert returned allocation into free blocks - cached_blocks.insert(search_key); - cached_bytes[device] += search_key.bytes; - - if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - else - { - // Free the returned allocation. Unlock. - if (locked) { - Unlock(&spin_lock); - locked = false; - } - - // Set to specified device - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - if (CubDebug(error = cudaSetDevice(device))) break; - - // Free device memory - if (CubDebug(error = cudaFree(d_ptr))) break; - - if (debug) CubLog("\tdevice %d freed %lld bytes. 
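A hypothetical usage sketch of the allocate/free protocol above. It assumes a CUDA-capable build and that the deleted header is reachable on the include path as util_allocator.cuh; error handling is kept minimal.

#include <cuda_runtime.h>
#include "util_allocator.cuh"   // assumed include path of the deleted header

cudaError_t ExampleScratch()
{
    // Default configuration: bin_growth 8, bins 3..7, ~6MB cached per device.
    cub::CachingDeviceAllocator allocator;

    void *d_scratch = NULL;
    cudaError_t error = allocator.DeviceAllocate(&d_scratch, 5000);  // rounds up to the 32KB bin
    if (error != cudaSuccess) return error;

    // ... launch kernels on the current device that use d_scratch ...

    return allocator.DeviceFree(d_scratch);  // block returns to the bin cache for reuse
}
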
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - } - } while (0); - - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; - } - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator - */ - cudaError_t DeviceFree( - void* d_ptr) - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - int current_device; - cudaError_t error = cudaSuccess; - - do { - if (CubDebug(error = cudaGetDevice(¤t_device))) break; - if (CubDebug(error = DeviceFree(d_ptr, current_device))) break; - } while(0); - - return error; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Frees all cached device allocations on all devices - */ - cudaError_t FreeAllCached() - { - #ifdef __CUDA_ARCH__ - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - cudaError_t error = cudaSuccess; - bool locked = false; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } - - while (!cached_blocks.empty()) - { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) - { - if (CubDebug(error = cudaSetDevice(begin->device))) break; - current_device = begin->device; - } - - // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) break; - - // Reduce balance and erase entry - cached_bytes[current_device] -= begin->bytes; - cached_blocks.erase(begin); - - if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size()); - } - - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; - } - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - - #endif // __CUDA_ARCH__ - } - - - /** - * \brief Destructor - */ - virtual ~CachingDeviceAllocator() - { - if (!skip_cleanup) - FreeAllCached(); - } - -}; - - - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_arch.cuh b/kokkos/kokkos/TPL/cub/util_arch.cuh deleted file mode 100644 index 232a33c..0000000 --- a/kokkos/kokkos/TPL/cub/util_arch.cuh +++ /dev/null @@ -1,295 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Static architectural properties by SM version. - */ - - -/****************************************************************************** - * Static architectural properties by SM version. - * - * "Device" reflects the PTX architecture targeted by the active compiler - * pass. It provides useful compile-time statics within device code. E.g.,: - * - * __shared__ int[Device::WARP_THREADS]; - * - * int padded_offset = threadIdx.x + (threadIdx.x >> Device::LOG_SMEM_BANKS); - * - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). -#ifndef __CUDA_ARCH__ - #define CUB_PTX_ARCH 0 -#else - #define CUB_PTX_ARCH __CUDA_ARCH__ -#endif - - -/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#if !defined(__CUDA_ARCH__) || defined(CUB_CDP) -#define CUB_RUNTIME_ENABLED -#endif - - -/// Execution space for destructors -#if ((CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH < 200)) - #define CUB_DESTRUCTOR __host__ -#else - #define CUB_DESTRUCTOR __host__ __device__ -#endif - - -/** - * \brief Structure for statically reporting CUDA device properties, parameterized by SM architecture. - * - * The default specialization is for SM10. 
- */ -template -struct ArchProps -{ - enum - { - LOG_WARP_THREADS = - 5, /// Log of the number of threads per warp - WARP_THREADS = - 1 << LOG_WARP_THREADS, /// Number of threads per warp - LOG_SMEM_BANKS = - 4, /// Log of the number of smem banks - SMEM_BANKS = - 1 << LOG_SMEM_BANKS, /// The number of smem banks - SMEM_BANK_BYTES = - 4, /// Size of smem bank words - SMEM_BYTES = - 16 * 1024, /// Maximum SM shared memory - SMEM_ALLOC_UNIT = - 512, /// Smem allocation size in bytes - REGS_BY_BLOCK = - true, /// Whether or not the architecture allocates registers by block (or by warp) - REG_ALLOC_UNIT = - 256, /// Number of registers allocated at a time per block (or by warp) - WARP_ALLOC_UNIT = - 2, /// Granularity of warps for which registers are allocated - MAX_SM_THREADS = - 768, /// Maximum number of threads per SM - MAX_SM_THREADBLOCKS = - 8, /// Maximum number of thread blocks per SM - MAX_BLOCK_THREADS = - 512, /// Maximum number of thread per thread block - MAX_SM_REGISTERS = - 8 * 1024, /// Maximum number of registers per SM - }; -}; - - - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Architecture properties for SM30 - */ -template <> -struct ArchProps<300> -{ - enum - { - LOG_WARP_THREADS = 5, // 32 threads per warp - WARP_THREADS = 1 << LOG_WARP_THREADS, - LOG_SMEM_BANKS = 5, // 32 banks - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - SMEM_BANK_BYTES = 4, // 4 byte bank words - SMEM_BYTES = 48 * 1024, // 48KB shared memory - SMEM_ALLOC_UNIT = 256, // 256B smem allocation segment size - REGS_BY_BLOCK = false, // Allocates registers by warp - REG_ALLOC_UNIT = 256, // 256 registers allocated at a time per warp - WARP_ALLOC_UNIT = 4, // Registers are allocated at a granularity of every 4 warps per threadblock - MAX_SM_THREADS = 2048, // 2K max threads per SM - MAX_SM_THREADBLOCKS = 16, // 16 max threadblocks per SM - MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock - MAX_SM_REGISTERS = 64 * 1024, // 64K max registers per SM - }; - - // Callback utility - template - static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) - { - target.template Callback(); - } -}; - - -/** - * Architecture properties for SM20 - */ -template <> -struct ArchProps<200> -{ - enum - { - LOG_WARP_THREADS = 5, // 32 threads per warp - WARP_THREADS = 1 << LOG_WARP_THREADS, - LOG_SMEM_BANKS = 5, // 32 banks - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - SMEM_BANK_BYTES = 4, // 4 byte bank words - SMEM_BYTES = 48 * 1024, // 48KB shared memory - SMEM_ALLOC_UNIT = 128, // 128B smem allocation segment size - REGS_BY_BLOCK = false, // Allocates registers by warp - REG_ALLOC_UNIT = 64, // 64 registers allocated at a time per warp - WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock - MAX_SM_THREADS = 1536, // 1536 max threads per SM - MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM - MAX_BLOCK_THREADS = 1024, // 1024 max threads per threadblock - MAX_SM_REGISTERS = 32 * 1024, // 32K max registers per SM - }; - - // Callback utility - template - static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) - { - if (sm_version > 200) { - ArchProps<300>::Callback(target, sm_version); - } else { - target.template Callback(); - } - } -}; - - -/** - * Architecture properties for SM12 - */ -template <> -struct ArchProps<120> -{ - enum - { - LOG_WARP_THREADS = 5, // 32 threads per warp - WARP_THREADS = 1 << LOG_WARP_THREADS, - LOG_SMEM_BANKS = 4, // 16 banks - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - 
SMEM_BANK_BYTES = 4, // 4 byte bank words - SMEM_BYTES = 16 * 1024, // 16KB shared memory - SMEM_ALLOC_UNIT = 512, // 512B smem allocation segment size - REGS_BY_BLOCK = true, // Allocates registers by threadblock - REG_ALLOC_UNIT = 512, // 512 registers allocated at time per threadblock - WARP_ALLOC_UNIT = 2, // Registers are allocated at a granularity of every 2 warps per threadblock - MAX_SM_THREADS = 1024, // 1024 max threads per SM - MAX_SM_THREADBLOCKS = 8, // 8 max threadblocks per SM - MAX_BLOCK_THREADS = 512, // 512 max threads per threadblock - MAX_SM_REGISTERS = 16 * 1024, // 16K max registers per SM - }; - - // Callback utility - template - static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) - { - if (sm_version > 120) { - ArchProps<200>::Callback(target, sm_version); - } else { - target.template Callback(); - } - } -}; - - -/** - * Architecture properties for SM10. Derives from the default ArchProps specialization. - */ -template <> -struct ArchProps<100> : ArchProps<0> -{ - // Callback utility - template - static __host__ __device__ __forceinline__ void Callback(T &target, int sm_version) - { - if (sm_version > 100) { - ArchProps<120>::Callback(target, sm_version); - } else { - target.template Callback(); - } - } -}; - - -/** - * Architecture properties for SM35 - */ -template <> -struct ArchProps<350> : ArchProps<300> {}; // Derives from SM30 - -/** - * Architecture properties for SM21 - */ -template <> -struct ArchProps<210> : ArchProps<200> {}; // Derives from SM20 - -/** - * Architecture properties for SM13 - */ -template <> -struct ArchProps<130> : ArchProps<120> {}; // Derives from SM12 - -/** - * Architecture properties for SM11 - */ -template <> -struct ArchProps<110> : ArchProps<100> {}; // Derives from SM10 - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief The architectural properties for the PTX version targeted by the active compiler pass. - */ -struct PtxArchProps : ArchProps {}; - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_debug.cuh b/kokkos/kokkos/TPL/cub/util_debug.cuh deleted file mode 100644 index 2ac67d7..0000000 --- a/kokkos/kokkos/TPL/cub/util_debug.cuh +++ /dev/null @@ -1,115 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
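A host-only sketch of the compile-time usage pattern suggested by the ArchProps documentation above (the shared-memory padding example). The Props struct here is an illustrative stand-in, not the deleted template, and uses the default/SM10 values quoted above.

#include <cstdio>

template <int SM_ARCH> struct Props;             // illustrative stand-in for cub::ArchProps
template <> struct Props<100> {                  // default/SM10 values quoted above
    enum { LOG_WARP_THREADS = 5,
           WARP_THREADS     = 1 << LOG_WARP_THREADS,
           LOG_SMEM_BANKS   = 4,
           SMEM_BANKS       = 1 << LOG_SMEM_BANKS };
};

int main() {
    typedef Props<100> Device;
    // Padding one word per SMEM_BANKS threads staggers accesses across banks.
    for (int tid = 0; tid < Device::WARP_THREADS; tid += 8)
        std::printf("thread %2d -> padded offset %d\n",
                    tid, tid + (tid >> Device::LOG_SMEM_BANKS));
    return 0;
}
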
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Error and event logging routines. - * - * The following macros definitions are supported: - * - \p CUB_LOG. Simple event messages are printed to \p stdout. - */ - -#pragma once - -#include -#include "util_namespace.cuh" -#include "util_arch.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/// CUB error reporting macro (prints error messages to stderr) -#if (defined(DEBUG) || defined(_DEBUG)) - #define CUB_STDERR -#endif - - - -/** - * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. - * - * \return The CUDA error. - */ -__host__ __device__ __forceinline__ cudaError_t Debug( - cudaError_t error, - const char* filename, - int line) -{ -#ifdef CUB_STDERR - if (error) - { - #if (CUB_PTX_ARCH == 0) - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #elif (CUB_PTX_ARCH >= 200) - printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); - #endif - } -#endif - return error; -} - - -/** - * \brief Debug macro - */ -#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) - - -/** - * \brief Debug macro with exit - */ -#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } - - -/** - * \brief Log macro for printf statements. - */ -#if (CUB_PTX_ARCH == 0) - #define CubLog(format, ...) printf(format,__VA_ARGS__); -#elif (CUB_PTX_ARCH >= 200) - #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__); -#endif - - - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_device.cuh b/kokkos/kokkos/TPL/cub/util_device.cuh deleted file mode 100644 index 0631b92..0000000 --- a/kokkos/kokkos/TPL/cub/util_device.cuh +++ /dev/null @@ -1,378 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
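A hypothetical host-side usage of the CubDebug/CubDebugExit macros defined above. It assumes a CUDA build with CUB_STDERR defined (so failures are printed) and that the deleted header is available as util_debug.cuh.

#include <cstdlib>          // for exit(), used by CubDebugExit
#include <cuda_runtime.h>
#include "util_debug.cuh"   // assumed include path of the deleted header

int main()
{
    void *d_buf = NULL;
    // CubDebug returns the error code after printing "CUDA error ... [file, line]" on failure.
    if (CubDebug(cudaMalloc(&d_buf, 1 << 20))) return 1;
    CubDebugExit(cudaFree(d_buf));   // exits the process if the call fails
    return 0;
}
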
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Properties of a given CUDA device and the corresponding PTX bundle - */ - -#pragma once - -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device - */ -template -__global__ void EmptyKernel(void) { } - - -/** - * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). - */ -template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is returned in \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed -{ - const int ALIGN_BYTES = 256; - const int ALIGN_MASK = ~(ALIGN_BYTES - 1); - - // Compute exclusive prefix sum over allocation requests - size_t bytes_needed = 0; - for (int i = 0; i < ALLOCATIONS; ++i) - { - size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; - allocation_sizes[i] = bytes_needed; - bytes_needed += allocation_bytes; - } - - // Check if the caller is simply requesting the size of the storage allocation - if (!d_temp_storage) - { - temp_storage_bytes = bytes_needed; - return cudaSuccess; - } - - // Check if enough storage provided - if (temp_storage_bytes < bytes_needed) - { - return CubDebug(cudaErrorMemoryAllocation); - } - - // Alias - for (int i = 0; i < ALLOCATIONS; ++i) - { - allocations[i] = static_cast(d_temp_storage) + allocation_sizes[i]; - } - - return cudaSuccess; -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/** - * \brief Retrieves the PTX version (major * 100 + minor * 10) - */ -__host__ __device__ __forceinline__ cudaError_t PtxVersion(int &ptx_version) -{ -#ifndef CUB_RUNTIME_ENABLED - - // CUDA API calls not supported from this device - return cudaErrorInvalidConfiguration; - -#else - - cudaError_t error = cudaSuccess; - do - { - cudaFuncAttributes empty_kernel_attrs; - if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; - ptx_version = empty_kernel_attrs.ptxVersion * 10; - } - while (0); - - return error; - -#endif -} - - -/** - * Synchronize the stream if specified - */ -__host__ __device__ __forceinline__ -static cudaError_t SyncStream(cudaStream_t stream) -{ -#ifndef __CUDA_ARCH__ - return cudaStreamSynchronize(stream); -#else - // Device can't yet sync on a specific stream - return cudaDeviceSynchronize(); -#endif -} - - - -/** - * \brief Properties of a given CUDA device and the corresponding PTX bundle - */ -class Device -{ -private: - - /// Type definition of the EmptyKernel kernel entry point - typedef void (*EmptyKernelPtr)(); - - /// Force EmptyKernel to be generated if this class is used - __host__ __device__ __forceinline__ - EmptyKernelPtr Empty() - { - return EmptyKernel; - } - -public: - - // Version information - int sm_version; ///< SM version of target device (SM version X.YZ in XYZ integer form) - int ptx_version; ///< Bundled PTX version for target device (PTX version X.YZ in XYZ integer form) - - // Target device properties - int sm_count; ///< Number of SMs - int warp_threads; ///< Number of threads per warp - int smem_bank_bytes; ///< Number of bytes per SM bank - int smem_banks; ///< Number of smem banks - int smem_bytes; ///< Smem bytes per SM - int smem_alloc_unit; ///< Smem segment size - bool regs_by_block; ///< Whether registers are allocated by threadblock (or by warp) - int reg_alloc_unit; ///< Granularity of register allocation within the SM - int warp_alloc_unit; ///< Granularity of warp allocation within the SM - int max_sm_threads; ///< Maximum number of threads per SM - int max_sm_blocks; ///< Maximum number of threadblocks per SM - int max_block_threads; ///< Maximum number of threads per threadblock - int max_sm_registers; ///< Maximum number of registers per SM - int max_sm_warps; ///< Maximum number of warps per SM - - /** - * Callback for initializing device properties - */ 
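A host-only sketch of the aliasing arithmetic in AliasTemporaries above: each request is padded to a 256-byte boundary and assigned an offset via an exclusive prefix sum, so one base allocation can back all temporaries. The sizes here are arbitrary examples.

#include <cstddef>
#include <cstdio>

int main() {
    const size_t ALIGN_BYTES = 256;
    const size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    size_t allocation_sizes[3] = {1000, 130, 4096};  // requested bytes per temporary
    size_t offsets[3];
    size_t bytes_needed = 0;
    for (int i = 0; i < 3; ++i) {
        size_t padded = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        offsets[i] = bytes_needed;    // exclusive prefix sum becomes the offset
        bytes_needed += padded;
    }
    // A single allocation of bytes_needed can now back all three temporaries:
    // allocations[i] = static_cast<char*>(d_temp_storage) + offsets[i].
    for (int i = 0; i < 3; ++i)
        std::printf("temp %d: %zu bytes at offset %zu\n", i, allocation_sizes[i], offsets[i]);
    std::printf("total: %zu bytes\n", bytes_needed);
    return 0;
}
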
- template - __host__ __device__ __forceinline__ void Callback() - { - warp_threads = ArchProps::WARP_THREADS; - smem_bank_bytes = ArchProps::SMEM_BANK_BYTES; - smem_banks = ArchProps::SMEM_BANKS; - smem_bytes = ArchProps::SMEM_BYTES; - smem_alloc_unit = ArchProps::SMEM_ALLOC_UNIT; - regs_by_block = ArchProps::REGS_BY_BLOCK; - reg_alloc_unit = ArchProps::REG_ALLOC_UNIT; - warp_alloc_unit = ArchProps::WARP_ALLOC_UNIT; - max_sm_threads = ArchProps::MAX_SM_THREADS; - max_sm_blocks = ArchProps::MAX_SM_THREADBLOCKS; - max_block_threads = ArchProps::MAX_BLOCK_THREADS; - max_sm_registers = ArchProps::MAX_SM_REGISTERS; - max_sm_warps = max_sm_threads / warp_threads; - } - - -public: - - /** - * Initializer. Properties are retrieved for the specified GPU ordinal. - */ - __host__ __device__ __forceinline__ - cudaError_t Init(int device_ordinal) - { - #ifndef CUB_RUNTIME_ENABLED - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - - #else - - cudaError_t error = cudaSuccess; - do - { - // Fill in SM version - int major, minor; - if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; - if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; - sm_version = major * 100 + minor * 10; - - // Fill in static SM properties - // Initialize our device properties via callback from static device properties - ArchProps<100>::Callback(*this, sm_version); - - // Fill in SM count - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Fill in PTX version - #if CUB_PTX_ARCH > 0 - ptx_version = CUB_PTX_ARCH; - #else - if (CubDebug(error = PtxVersion(ptx_version))) break; - #endif - - } - while (0); - - return error; - - #endif - } - - - /** - * Initializer. Properties are retrieved for the current GPU ordinal. - */ - __host__ __device__ __forceinline__ - cudaError_t Init() - { - #ifndef CUB_RUNTIME_ENABLED - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - - #else - - cudaError_t error = cudaSuccess; - do - { - int device_ordinal; - if ((error = CubDebug(cudaGetDevice(&device_ordinal)))) break; - if ((error = Init(device_ordinal))) break; - } - while (0); - return error; - - #endif - } - - - /** - * Computes maximum SM occupancy in thread blocks for the given kernel - */ - template - __host__ __device__ __forceinline__ - cudaError_t MaxSmOccupancy( - int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads) ///< [in] Number of threads per thread block - { - #ifndef CUB_RUNTIME_ENABLED - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - - #else - - cudaError_t error = cudaSuccess; - do - { - // Get kernel attributes - cudaFuncAttributes kernel_attrs; - if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break; - - // Number of warps per threadblock - int block_warps = (block_threads + warp_threads - 1) / warp_threads; - - // Max warp occupancy - int max_warp_occupancy = (block_warps > 0) ? 
- max_sm_warps / block_warps : - max_sm_blocks; - - // Maximum register occupancy - int max_reg_occupancy; - if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) - { - // Prevent divide-by-zero - max_reg_occupancy = max_sm_blocks; - } - else if (regs_by_block) - { - // Allocates registers by threadblock - int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); - max_reg_occupancy = max_sm_registers / block_regs; - } - else - { - // Allocates registers by warp - int sm_sides = warp_alloc_unit; - int sm_registers_per_side = max_sm_registers / sm_sides; - int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); - int warps_per_side = sm_registers_per_side / regs_per_warp; - int warps = warps_per_side * sm_sides; - max_reg_occupancy = warps / block_warps; - } - - // Shared memory per threadblock - int block_allocated_smem = CUB_ROUND_UP_NEAREST( - kernel_attrs.sharedSizeBytes, - smem_alloc_unit); - - // Max shared memory occupancy - int max_smem_occupancy = (block_allocated_smem > 0) ? - (smem_bytes / block_allocated_smem) : - max_sm_blocks; - - // Max occupancy - max_sm_occupancy = CUB_MIN( - CUB_MIN(max_sm_blocks, max_warp_occupancy), - CUB_MIN(max_smem_occupancy, max_reg_occupancy)); - -// printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d)", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy); - - } while (0); - - return error; - - #endif - } - -}; - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_iterator.cuh b/kokkos/kokkos/TPL/cub/util_iterator.cuh deleted file mode 100644 index 08b574c..0000000 --- a/kokkos/kokkos/TPL/cub/util_iterator.cuh +++ /dev/null @@ -1,718 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
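A host-only sketch of the occupancy arithmetic in MaxSmOccupancy above, using the SM20 figures quoted in util_arch.cuh. The kernel's register and shared-memory usage are made-up inputs.

#include <cstdio>

#define ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
#define MIN2(a, b) (((a) < (b)) ? (a) : (b))

int main() {
    // Device properties (SM20 values from the deleted util_arch.cuh)
    const int warp_threads = 32, max_sm_blocks = 8, max_sm_threads = 1536;
    const int max_sm_registers = 32 * 1024, reg_alloc_unit = 64, warp_alloc_unit = 2;
    const int smem_bytes = 48 * 1024, smem_alloc_unit = 128;
    const int max_sm_warps = max_sm_threads / warp_threads;

    // Hypothetical kernel resource usage
    const int block_threads = 256, kernel_regs = 32, kernel_smem = 5000;

    int block_warps = (block_threads + warp_threads - 1) / warp_threads;
    int max_warp_occupancy = max_sm_warps / block_warps;

    // Registers are allocated per warp, rounded up to the allocation unit,
    // within "sides" of the register file (warp_alloc_unit sides).
    int regs_per_warp      = ROUND_UP_NEAREST(kernel_regs * warp_threads, reg_alloc_unit);
    int warps_per_side     = (max_sm_registers / warp_alloc_unit) / regs_per_warp;
    int max_reg_occupancy  = (warps_per_side * warp_alloc_unit) / block_warps;

    int block_smem         = ROUND_UP_NEAREST(kernel_smem, smem_alloc_unit);
    int max_smem_occupancy = smem_bytes / block_smem;

    int occupancy = MIN2(MIN2(max_sm_blocks, max_warp_occupancy),
                         MIN2(max_reg_occupancy, max_smem_occupancy));
    std::printf("blocks per SM: %d\n", occupancy);   // 4 for these inputs (register-limited)
    return 0;
}
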
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include "thread/thread_load.cuh" -#include "util_device.cuh" -#include "util_debug.cuh" -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Texture references - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -// Anonymous namespace -namespace { - -/// Templated texture reference type -template -struct TexIteratorRef -{ - // Texture reference type - typedef texture TexRef; - - static TexRef ref; - - /** - * Bind texture - */ - static cudaError_t BindTexture(void *d_in) - { - cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); - if (d_in) - return (CubDebug(cudaBindTexture(NULL, ref, d_in, tex_desc))); - - return cudaSuccess; - } - - /** - * Unbind textures - */ - static cudaError_t UnbindTexture() - { - return CubDebug(cudaUnbindTexture(ref)); - } -}; - -// Texture reference definitions -template -typename TexIteratorRef::TexRef TexIteratorRef::ref = 0; - -} // Anonymous namespace - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - - - - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/****************************************************************************** - * Iterators - *****************************************************************************/ - -/** - * \brief A simple random-access iterator pointing to a range of constant values - * - * \par Overview - * ConstantIteratorRA is a random-access iterator that when dereferenced, always - * returns the supplied constant of type \p OutputType. 
- * - * \tparam OutputType The value type of this iterator - */ -template -class ConstantIteratorRA -{ -public: - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - typedef ConstantIteratorRA self_type; - typedef OutputType value_type; - typedef OutputType reference; - typedef OutputType* pointer; - typedef std::random_access_iterator_tag iterator_category; - typedef int difference_type; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -private: - - OutputType val; - -public: - - /// Constructor - __host__ __device__ __forceinline__ ConstantIteratorRA( - const OutputType &val) ///< Constant value for the iterator instance to report - : - val(val) - {} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __host__ __device__ __forceinline__ self_type operator++() - { - self_type i = *this; - return i; - } - - __host__ __device__ __forceinline__ self_type operator++(int junk) - { - return *this; - } - - __host__ __device__ __forceinline__ reference operator*() - { - return val; - } - - template - __host__ __device__ __forceinline__ self_type operator+(SizeT n) - { - return ConstantIteratorRA(val); - } - - template - __host__ __device__ __forceinline__ self_type operator-(SizeT n) - { - return ConstantIteratorRA(val); - } - - template - __host__ __device__ __forceinline__ reference operator[](SizeT n) - { - return ConstantIteratorRA(val); - } - - __host__ __device__ __forceinline__ pointer operator->() - { - return &val; - } - - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (val == rhs.val); - } - - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (val != rhs.val); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -}; - - - -/** - * \brief A simple random-access transform iterator for applying a transformation operator. - * - * \par Overview - * TransformIteratorRA is a random-access iterator that wraps both a native - * device pointer of type InputType* and a unary conversion functor of - * type \p ConversionOp. \p OutputType references are made by pulling \p InputType - * values through the \p ConversionOp instance. - * - * \tparam InputType The value type of the pointer being wrapped - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member OutputType operator()(const InputType &datum). 
- * \tparam OutputType The value type of this iterator - */ -template -class TransformIteratorRA -{ -public: - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - typedef TransformIteratorRA self_type; - typedef OutputType value_type; - typedef OutputType reference; - typedef OutputType* pointer; - typedef std::random_access_iterator_tag iterator_category; - typedef int difference_type; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -private: - - ConversionOp conversion_op; - InputType* ptr; - -public: - - /** - * \brief Constructor - * @param ptr Native pointer to wrap - * @param conversion_op Binary transformation functor - */ - __host__ __device__ __forceinline__ TransformIteratorRA(InputType* ptr, ConversionOp conversion_op) : - conversion_op(conversion_op), - ptr(ptr) {} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __host__ __device__ __forceinline__ self_type operator++() - { - self_type i = *this; - ptr++; - return i; - } - - __host__ __device__ __forceinline__ self_type operator++(int junk) - { - ptr++; - return *this; - } - - __host__ __device__ __forceinline__ reference operator*() - { - return conversion_op(*ptr); - } - - template - __host__ __device__ __forceinline__ self_type operator+(SizeT n) - { - TransformIteratorRA retval(ptr + n, conversion_op); - return retval; - } - - template - __host__ __device__ __forceinline__ self_type operator-(SizeT n) - { - TransformIteratorRA retval(ptr - n, conversion_op); - return retval; - } - - template - __host__ __device__ __forceinline__ reference operator[](SizeT n) - { - return conversion_op(ptr[n]); - } - - __host__ __device__ __forceinline__ pointer operator->() - { - return &conversion_op(*ptr); - } - - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -}; - - - -/** - * \brief A simple random-access iterator for loading primitive values through texture cache. - * - * \par Overview - * TexIteratorRA is a random-access iterator that wraps a native - * device pointer of type T*. References made through TexIteratorRA - * causes values to be pulled through texture cache. - * - * \par Usage Considerations - * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double - * - Only one TexIteratorRA or TexIteratorRA of a certain \p InputType can be bound at any given time (per host thread) - * - * \tparam InputType The value type of the pointer being wrapped - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member OutputType operator()(const InputType &datum). 
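A simplified host-only stand-in for the transform-iterator idea described above: wrapping a pointer with a unary functor so that dereferencing applies the conversion. This is not the deleted class, just an illustration of the pattern.

#include <cstdio>

struct TimesTwo {                 // example ConversionOp
    int operator()(const int &x) const { return 2 * x; }
};

template <typename OutputType, typename ConversionOp, typename InputType>
class TransformIter {
    ConversionOp op;
    InputType   *ptr;
public:
    TransformIter(InputType *p, ConversionOp o) : op(o), ptr(p) {}
    OutputType operator*()        const { return op(*ptr); }    // pull InputType through ConversionOp
    OutputType operator[](int n)  const { return op(ptr[n]); }
    TransformIter &operator++()         { ++ptr; return *this; }
};

int main() {
    int data[4] = {1, 2, 3, 4};
    TransformIter<int, TimesTwo, int> it(data, TimesTwo());
    for (int i = 0; i < 4; ++i)
        std::printf("%d ", it[i]);   // prints 2 4 6 8
    std::printf("\n");
    return 0;
}
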
- * \tparam OutputType The value type of this iterator - */ -template -class TexIteratorRA -{ -public: -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - typedef TexIteratorRA self_type; - typedef T value_type; - typedef T reference; - typedef T* pointer; - typedef std::random_access_iterator_tag iterator_category; - typedef int difference_type; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /// Tag identifying iterator type as being texture-bindable - typedef void TexBindingTag; - -private: - - T* ptr; - size_t tex_align_offset; - cudaTextureObject_t tex_obj; - -public: - - /** - * \brief Constructor - */ - __host__ __device__ __forceinline__ TexIteratorRA() - : - ptr(NULL), - tex_align_offset(0), - tex_obj(0) - {} - - /// \brief Bind iterator to texture reference - cudaError_t BindTexture( - T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of items - size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator - { - this->ptr = ptr; - this->tex_align_offset = tex_align_offset; - - int ptx_version; - cudaError_t error = cudaSuccess; - if (CubDebug(error = PtxVersion(ptx_version))) return error; - if (ptx_version >= 300) - { - // Use texture object - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - cudaResourceDesc res_desc; - cudaTextureDesc tex_desc; - memset(&res_desc, 0, sizeof(cudaResourceDesc)); - memset(&tex_desc, 0, sizeof(cudaTextureDesc)); - res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = ptr; - res_desc.res.linear.desc = channel_desc; - res_desc.res.linear.sizeInBytes = bytes; - tex_desc.readMode = cudaReadModeElementType; - return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); - } - else - { - // Use texture reference - return TexIteratorRef::BindTexture(ptr); - } - } - - /// \brief Unbind iterator to texture reference - cudaError_t UnbindTexture() - { - int ptx_version; - cudaError_t error = cudaSuccess; - if (CubDebug(error = PtxVersion(ptx_version))) return error; - if (ptx_version < 300) - { - // Use texture reference - return TexIteratorRef::UnbindTexture(); - } - else - { - // Use texture object - return cudaDestroyTextureObject(tex_obj); - } - } - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __host__ __device__ __forceinline__ self_type operator++() - { - self_type i = *this; - ptr++; - tex_align_offset++; - return i; - } - - __host__ __device__ __forceinline__ self_type operator++(int junk) - { - ptr++; - tex_align_offset++; - return *this; - } - - __host__ __device__ __forceinline__ reference operator*() - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return *ptr; -#elif (CUB_PTX_ARCH < 300) - // Use the texture reference - return tex1Dfetch(TexIteratorRef::ref, tex_align_offset); -#else - // Use the texture object - return conversion_op(tex1Dfetch(tex_obj, tex_align_offset)); -#endif - } - - template - __host__ __device__ __forceinline__ self_type operator+(SizeT n) - { - TexIteratorRA retval; - retval.ptr = ptr + n; - retval.tex_align_offset = tex_align_offset + n; - return retval; - } - - template - __host__ __device__ __forceinline__ self_type operator-(SizeT n) - { - TexIteratorRA retval; - retval.ptr = ptr - n; - retval.tex_align_offset = tex_align_offset - n; - return retval; - } - - template - __host__ __device__ __forceinline__ reference operator[](SizeT n) - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[n]; -#elif 
(CUB_PTX_ARCH < 300) - // Use the texture reference - return tex1Dfetch(TexIteratorRef::ref, tex_align_offset + n); -#else - // Use the texture object - return conversion_op(tex1Dfetch(tex_obj, tex_align_offset + n)); -#endif - } - - __host__ __device__ __forceinline__ pointer operator->() - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return &(*ptr); -#elif (CUB_PTX_ARCH < 300) - // Use the texture reference - return &(tex1Dfetch(TexIteratorRef::ref, tex_align_offset)); -#else - // Use the texture object - return conversion_op(tex1Dfetch(tex_obj, tex_align_offset)); -#endif - } - - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -}; - - -/** - * \brief A simple random-access transform iterator for loading primitive values through texture cache and and subsequently applying a transformation operator. - * - * \par Overview - * TexTransformIteratorRA is a random-access iterator that wraps both a native - * device pointer of type InputType* and a unary conversion functor of - * type \p ConversionOp. \p OutputType references are made by pulling \p InputType - * values through the texture cache and then transformed them using the - * \p ConversionOp instance. - * - * \par Usage Considerations - * - Can only be used with primitive types (e.g., \p char, \p int, \p float), with the exception of \p double - * - Only one TexIteratorRA or TexTransformIteratorRA of a certain \p InputType can be bound at any given time (per host thread) - * - * \tparam InputType The value type of the pointer being wrapped - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p OutputType. Must have member OutputType operator()(const InputType &datum). 
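A hypothetical usage of TexIteratorRA's bind/unbind protocol described above. It assumes a CUDA build, that the deleted header is available as util_iterator.cuh, and that the class is templated on the element type (as in upstream CUB); the kernel that would consume the iterator is elided.

#include <cuda_runtime.h>
#include "util_iterator.cuh"   // assumed include path of the deleted header

cudaError_t ExampleTexReads(const int *d_in, size_t num_items)
{
    cub::TexIteratorRA<int> itr;

    // Bind the device range: dereferences go through a texture reference on
    // SM < 30 and through a texture object on SM >= 30.
    cudaError_t error = itr.BindTexture(const_cast<int*>(d_in), num_items * sizeof(int));
    if (error != cudaSuccess) return error;

    // ... pass itr to a kernel that reads itr[i] through the texture cache ...

    return itr.UnbindTexture();
}
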
- * \tparam OutputType The value type of this iterator - */ -template -class TexTransformIteratorRA -{ -public: - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - typedef TexTransformIteratorRA self_type; - typedef OutputType value_type; - typedef OutputType reference; - typedef OutputType* pointer; - typedef std::random_access_iterator_tag iterator_category; - typedef int difference_type; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /// Tag identifying iterator type as being texture-bindable - typedef void TexBindingTag; - -private: - - ConversionOp conversion_op; - InputType* ptr; - size_t tex_align_offset; - cudaTextureObject_t tex_obj; - -public: - - /** - * \brief Constructor - */ - TexTransformIteratorRA( - ConversionOp conversion_op) ///< Binary transformation functor - : - conversion_op(conversion_op), - ptr(NULL), - tex_align_offset(0), - tex_obj(0) - {} - - /// \brief Bind iterator to texture reference - cudaError_t BindTexture( - InputType* ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of items - size_t tex_align_offset = 0) ///< Offset (in items) from ptr denoting the position of the iterator - { - this->ptr = ptr; - this->tex_align_offset = tex_align_offset; - - int ptx_version; - cudaError_t error = cudaSuccess; - if (CubDebug(error = PtxVersion(ptx_version))) return error; - if (ptx_version >= 300) - { - // Use texture object - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - cudaResourceDesc res_desc; - cudaTextureDesc tex_desc; - memset(&res_desc, 0, sizeof(cudaResourceDesc)); - memset(&tex_desc, 0, sizeof(cudaTextureDesc)); - res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = ptr; - res_desc.res.linear.desc = channel_desc; - res_desc.res.linear.sizeInBytes = bytes; - tex_desc.readMode = cudaReadModeElementType; - return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); - } - else - { - // Use texture reference - return TexIteratorRef::BindTexture(ptr); - } - } - - /// \brief Unbind iterator to texture reference - cudaError_t UnbindTexture() - { - int ptx_version; - cudaError_t error = cudaSuccess; - if (CubDebug(error = PtxVersion(ptx_version))) return error; - if (ptx_version >= 300) - { - // Use texture object - return cudaDestroyTextureObject(tex_obj); - } - else - { - // Use texture reference - return TexIteratorRef::UnbindTexture(); - } - } - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __host__ __device__ __forceinline__ self_type operator++() - { - self_type i = *this; - ptr++; - tex_align_offset++; - return i; - } - - __host__ __device__ __forceinline__ self_type operator++(int junk) - { - ptr++; - tex_align_offset++; - return *this; - } - - __host__ __device__ __forceinline__ reference operator*() - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return conversion_op(*ptr); -#elif (CUB_PTX_ARCH < 300) - // Use the texture reference - return conversion_op(tex1Dfetch(TexIteratorRef::ref, tex_align_offset)); -#else - // Use the texture object - return conversion_op(tex1Dfetch(tex_obj, tex_align_offset)); -#endif - } - - template - __host__ __device__ __forceinline__ self_type operator+(SizeT n) - { - TexTransformIteratorRA retval(conversion_op); - retval.ptr = ptr + n; - retval.tex_align_offset = tex_align_offset + n; - return retval; - } - - template - __host__ __device__ __forceinline__ self_type operator-(SizeT n) - { - TexTransformIteratorRA retval(conversion_op); - retval.ptr = ptr - n; - 
retval.tex_align_offset = tex_align_offset - n; - return retval; - } - - template - __host__ __device__ __forceinline__ reference operator[](SizeT n) - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return conversion_op(ptr[n]); -#elif (CUB_PTX_ARCH < 300) - // Use the texture reference - return conversion_op(tex1Dfetch(TexIteratorRef::ref, tex_align_offset + n)); -#else - // Use the texture object - return conversion_op(tex1Dfetch(tex_obj, tex_align_offset + n)); -#endif - } - - __host__ __device__ __forceinline__ pointer operator->() - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return &conversion_op(*ptr); -#elif (CUB_PTX_ARCH < 300) - // Use the texture reference - return &conversion_op(tex1Dfetch(TexIteratorRef::ref, tex_align_offset)); -#else - // Use the texture object - return &conversion_op(tex1Dfetch(tex_obj, tex_align_offset)); -#endif - } - - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -}; - - - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_macro.cuh b/kokkos/kokkos/TPL/cub/util_macro.cuh deleted file mode 100644 index 091fd93..0000000 --- a/kokkos/kokkos/TPL/cub/util_macro.cuh +++ /dev/null @@ -1,107 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/****************************************************************************** - * Common C/C++ macro utilities - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -/** - * Align struct - */ -#if defined(_WIN32) || defined(_WIN64) - #define CUB_ALIGN(bytes) __declspec(align(32)) -#else - #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) -#endif - -/** - * Select maximum(a, b) - */ -#define CUB_MAX(a, b) (((a) > (b)) ? (a) : (b)) - -/** - * Select minimum(a, b) - */ -#define CUB_MIN(a, b) (((a) < (b)) ? (a) : (b)) - -/** - * Quotient of x/y rounded down to nearest integer - */ -#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) - -/** - * Quotient of x/y rounded up to nearest integer - */ -#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) - -/** - * x rounded up to the nearest multiple of y - */ -#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) - -/** - * x rounded down to the nearest multiple of y - */ -#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) - -/** - * Return character string for given type - */ -#define CUB_TYPE_STRING(type) ""#type - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - #define CUB_CAT_(a, b) a ## b - #define CUB_CAT(a, b) CUB_CAT_(a, b) -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * Static assert - */ -#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_namespace.cuh b/kokkos/kokkos/TPL/cub/util_namespace.cuh deleted file mode 100644 index 869ecc6..0000000 --- a/kokkos/kokkos/TPL/cub/util_namespace.cuh +++ /dev/null @@ -1,41 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
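A quick host-only check of the rounding macros defined above, with a couple of example values; the macros are copied here so the snippet stands alone.

#include <cstdio>

#define CUB_QUOTIENT_CEILING(x, y)   (((x) + (y) - 1) / (y))
#define CUB_ROUND_UP_NEAREST(x, y)   ((((x) + (y) - 1) / (y)) * (y))
#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))

int main() {
    std::printf("%d\n", CUB_QUOTIENT_CEILING(10, 3));    // 4
    std::printf("%d\n", CUB_ROUND_UP_NEAREST(10, 8));    // 16
    std::printf("%d\n", CUB_ROUND_DOWN_NEAREST(10, 8));  // 8
    return 0;
}
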
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Place-holder for prefixing the cub namespace - */ - -#pragma once - -// For example: -//#define CUB_NS_PREFIX namespace thrust{ namespace detail { -//#define CUB_NS_POSTFIX } } - -#define CUB_NS_PREFIX -#define CUB_NS_POSTFIX diff --git a/kokkos/kokkos/TPL/cub/util_ptx.cuh b/kokkos/kokkos/TPL/cub/util_ptx.cuh deleted file mode 100644 index ad80b04..0000000 --- a/kokkos/kokkos/TPL/cub/util_ptx.cuh +++ /dev/null @@ -1,380 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
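util_namespace.cuh, removed above, exists only so a host project can splice the cub namespace inside its own by defining CUB_NS_PREFIX/CUB_NS_POSTFIX before including CUB, as the commented-out thrust::detail example in the deleted file suggests. A minimal sketch with a made-up outer namespace name:

// The outer namespace "myproj" is hypothetical; a real client (e.g. Thrust)
// substitutes its own nesting here.
#define CUB_NS_PREFIX  namespace myproj { namespace detail {
#define CUB_NS_POSTFIX } }

CUB_NS_PREFIX
namespace cub {
    inline int Version() { return 100; }   // stand-in for the real CUB contents
}
CUB_NS_POSTFIX

int main()
{
    // Everything CUB defines now lives under myproj::detail::cub.
    return (myproj::detail::cub::Version() == 100) ? 0 : 1;
}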
- * - ******************************************************************************/ - -/** - * \file - * PTX intrinsics - */ - - -#pragma once - -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/****************************************************************************** - * PTX helper macros - ******************************************************************************/ - -/** - * Register modifier for pointer-types (for inlining PTX assembly) - */ -#if defined(_WIN64) || defined(__LP64__) - #define __CUB_LP64__ 1 - // 64-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "l" - #define _CUB_ASM_PTR_SIZE_ "u64" -#else - #define __CUB_LP64__ 0 - // 32-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "r" - #define _CUB_ASM_PTR_SIZE_ "u32" -#endif - - -/****************************************************************************** - * Inlined PTX intrinsics - ******************************************************************************/ - -/** - * Shift-right then add. Returns (x >> shift) + addend. - */ -__device__ __forceinline__ unsigned int SHR_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if __CUDA_ARCH__ >= 200 - asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x >> shift) + addend; -#endif - return ret; -} - - -/** - * Shift-left then add. Returns (x << shift) + addend. - */ -__device__ __forceinline__ unsigned int SHL_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if __CUDA_ARCH__ >= 200 - asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x << shift) + addend; -#endif - return ret; -} - - -/** - * Bitfield-extract. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits) -{ - unsigned int bits; -#if __CUDA_ARCH__ >= 200 - asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); -#else - const unsigned int MASK = (1 << num_bits) - 1; - bits = (source >> bit_start) & MASK; -#endif - return bits; -} - - -/** - * Bitfield-extract for 64-bit types. - */ -__device__ __forceinline__ unsigned int BFE( - unsigned long long source, - unsigned int bit_start, - unsigned int num_bits) -{ - const unsigned long long MASK = (1ull << num_bits) - 1; - return (source >> bit_start) & MASK; -} - - -/** - * Bitfield insert. Inserts the first num_bits of y into x starting at bit_start - */ -__device__ __forceinline__ void BFI( - unsigned int &ret, - unsigned int x, - unsigned int y, - unsigned int bit_start, - unsigned int num_bits) -{ -#if __CUDA_ARCH__ >= 200 - asm("bfi.b32 %0, %1, %2, %3, %4;" : - "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -#else - // TODO -#endif -} - - -/** - * Three-operand add - */ -__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) -{ -#if __CUDA_ARCH__ >= 200 - asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); -#else - x = x + y + z; -#endif - return x; -} - - -/** - * Byte-permute. 
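Among the PTX wrappers deleted above, BFE has a portable fallback (the pre-SM-2.0 branch) that is easy to check on the host. The sketch below restates that fallback and uses it for the kind of fixed-width digit extraction a radix sort performs; all names are local to the example.

#include <cstdio>

// Same arithmetic as the non-asm branch of BFE above: extract `num_bits`
// bits of `source` starting at `bit_start`.
static unsigned int bfe(unsigned int source, unsigned int bit_start, unsigned int num_bits)
{
    const unsigned int mask = (1u << num_bits) - 1u;
    return (source >> bit_start) & mask;
}

int main()
{
    const unsigned int key = 0xDEADBEEFu;

    // Peel off the four 8-bit digits, least significant first:
    // prints 0xEF, 0xBE, 0xAD, 0xDE.
    for (unsigned int d = 0; d < 4; ++d)
        std::printf("digit %u = 0x%02X\n", d, bfe(key, d * 8, 8));

    return 0;
}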
Pick four arbitrary bytes from two 32-bit registers, and - * reassemble them into a 32-bit destination register - */ -__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) -{ - int ret; - asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); - return ret; -} - - -/** - * Sync-threads barrier. - */ -__device__ __forceinline__ void BAR(int count) -{ - asm volatile("bar.sync 1, %0;" : : "r"(count)); -} - - -/** - * Floating point multiply. (Mantissa LSB rounds towards zero.) - */ -__device__ __forceinline__ float FMUL_RZ(float a, float b) -{ - float d; - asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); - return d; -} - - -/** - * Floating point multiply-add. (Mantissa LSB rounds towards zero.) - */ -__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) -{ - float d; - asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); - return d; -} - - -/** - * Terminates the calling thread - */ -__device__ __forceinline__ void ThreadExit() { - asm("exit;"); -} - - -/** - * Returns the warp lane ID of the calling thread - */ -__device__ __forceinline__ unsigned int LaneId() -{ - unsigned int ret; - asm("mov.u32 %0, %laneid;" : "=r"(ret) ); - return ret; -} - - -/** - * Returns the warp ID of the calling thread - */ -__device__ __forceinline__ unsigned int WarpId() -{ - unsigned int ret; - asm("mov.u32 %0, %warpid;" : "=r"(ret) ); - return ret; -} - -/** - * Returns the warp lane mask of all lanes less than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLt() -{ - unsigned int ret; - asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) ); - return ret; -} - -/** - * Returns the warp lane mask of all lanes less than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLe() -{ - unsigned int ret; - asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) ); - return ret; -} - -/** - * Returns the warp lane mask of all lanes greater than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGt() -{ - unsigned int ret; - asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) ); - return ret; -} - -/** - * Returns the warp lane mask of all lanes greater than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGe() -{ - unsigned int ret; - asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) ); - return ret; -} - -/** - * Portable implementation of __all - */ -__device__ __forceinline__ int WarpAll(int cond) -{ -#if CUB_PTX_ARCH < 120 - - __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS]; - - if (LaneId() == 0) - warp_signals[WarpId()] = 1; - - if (cond == 0) - warp_signals[WarpId()] = 0; - - return warp_signals[WarpId()]; - -#else - - return __all(cond); - -#endif -} - - -/** - * Portable implementation of __any - */ -__device__ __forceinline__ int WarpAny(int cond) -{ -#if CUB_PTX_ARCH < 120 - - __shared__ volatile int warp_signals[PtxArchProps::MAX_SM_THREADS / PtxArchProps::WARP_THREADS]; - - if (LaneId() == 0) - warp_signals[WarpId()] = 0; - - if (cond) - warp_signals[WarpId()] = 1; - - return warp_signals[WarpId()]; - -#else - - return __any(cond); - -#endif -} - - -/// Generic shuffle-up -template -__device__ __forceinline__ T ShuffleUp( - T input, ///< [in] The value to broadcast - int src_offset) ///< [in] The up-offset of the peer to read from -{ - enum - { - SHFL_C = 0, - }; - - typedef typename WordAlignment::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / 
sizeof(ShuffleWord); - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) - { - unsigned int shuffle_word = input_alias[WORD]; - asm( - " shfl.up.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); - output_alias[WORD] = (ShuffleWord) shuffle_word; - } - - return output; -} - - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_type.cuh b/kokkos/kokkos/TPL/cub/util_type.cuh deleted file mode 100644 index 836aa0f..0000000 --- a/kokkos/kokkos/TPL/cub/util_type.cuh +++ /dev/null @@ -1,685 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Common type manipulation (metaprogramming) utilities - */ - -#pragma once - -#include -#include - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - - -/****************************************************************************** - * Type equality - ******************************************************************************/ - -/** - * \brief Type selection (IF ? 
ThenType : ElseType) - */ -template -struct If -{ - /// Conditional type result - typedef ThenType Type; // true -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct If -{ - typedef ElseType Type; // false -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Conditional types - ******************************************************************************/ - - -/** - * \brief Type equality test - */ -template -struct Equals -{ - enum { - VALUE = 0, - NEGATE = 1 - }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct Equals -{ - enum { - VALUE = 1, - NEGATE = 0 - }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Marker types - ******************************************************************************/ - -/** - * \brief A simple "NULL" marker type - */ -struct NullType -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template - __host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; } -#endif // DOXYGEN_SHOULD_SKIP_THIS -}; - - -/** - * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) - */ -template -struct Int2Type -{ - enum {VALUE = A}; -}; - - -/****************************************************************************** - * Size and alignment - ******************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct WordAlignment -{ - struct Pad - { - T val; - char byte; - }; - - enum - { - /// The alignment of T in bytes - ALIGN_BYTES = sizeof(Pad) - sizeof(T) - }; - - /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If<(ALIGN_BYTES % 4 == 0), - int, - typename If<(ALIGN_BYTES % 2 == 0), - short, - char>::Type>::Type ShuffleWord; - - /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If<(ALIGN_BYTES % 8 == 0), - long long, - ShuffleWord>::Type VolatileWord; - - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If<(ALIGN_BYTES % 16 == 0), - longlong2, - typename If<(ALIGN_BYTES % 8 == 0), - long long, // needed to get heterogenous PODs to work on all platforms - ShuffleWord>::Type>::Type DeviceWord; - - enum - { - DEVICE_MULTIPLE = sizeof(DeviceWord) / sizeof(T) - }; - - struct UninitializedBytes - { - char buf[sizeof(T)]; - }; - - struct UninitializedShuffleWords - { - ShuffleWord buf[sizeof(T) / sizeof(ShuffleWord)]; - }; - - struct UninitializedVolatileWords - { - VolatileWord buf[sizeof(T) / sizeof(VolatileWord)]; - }; - - struct UninitializedDeviceWords - { - DeviceWord buf[sizeof(T) / sizeof(DeviceWord)]; - }; - - -}; - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Wrapper types - ******************************************************************************/ - -/** - * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions - */ -template -struct Uninitialized -{ - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename WordAlignment::DeviceWord 
DeviceWord; - - enum - { - WORDS = sizeof(T) / sizeof(DeviceWord) - }; - - /// Backing storage - DeviceWord storage[WORDS]; - - /// Alias - __host__ __device__ __forceinline__ T& Alias() - { - return reinterpret_cast(*this); - } -}; - - -/** - * \brief A wrapper for passing simple static arrays as kernel parameters - */ -template -struct ArrayWrapper -{ - /// Static array of type \p T - T array[COUNT]; -}; - - -/** - * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. - * - * Many multi-pass computations require a pair of "ping-pong" storage - * buffers (e.g., one for reading from and the other for writing to, and then - * vice-versa for the subsequent pass). This structure wraps a set of device - * buffers and a "selector" member to track which is "current". - */ -template -struct DoubleBuffer -{ - /// Pair of device buffer pointers - T *d_buffers[2]; - - /// Selector into \p d_buffers (i.e., the active/valid buffer) - int selector; - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer() - { - selector = 0; - d_buffers[0] = NULL; - d_buffers[1] = NULL; - } - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer( - T *d_current, ///< The currently valid buffer - T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current - { - selector = 0; - d_buffers[0] = d_current; - d_buffers[1] = d_alternate; - } - - /// \brief Return pointer to the currently valid buffer - __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } -}; - - - -/****************************************************************************** - * Static math - ******************************************************************************/ - -/** - * \brief Statically determine log2(N), rounded up. - * - * For example: - * Log2<8>::VALUE // 3 - * Log2<3>::VALUE // 2 - */ -template -struct Log2 -{ - /// Static logarithm value - enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -template -struct Log2 -{ - enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case - COUNT : - COUNT - 1 }; -}; -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Statically determine if N is a power-of-two - */ -template -struct PowerOfTwo -{ - enum { VALUE = ((N & (N - 1)) == 0) }; -}; - - - -/****************************************************************************** - * Pointer vs. iterator detection - ******************************************************************************/ - - -/** - * \brief Pointer vs. 
iterator - */ -template -struct IsPointer -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsPointer -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Qualifier detection - ******************************************************************************/ - -/** - * \brief Volatile modifier test - */ -template -struct IsVolatile -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsVolatile -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Qualifier removal - ******************************************************************************/ - -/** - * \brief Removes \p const and \p volatile qualifiers from type \p Tp. - * - * For example: - * typename RemoveQualifiers::Type // int; - */ -template -struct RemoveQualifiers -{ - /// Type without \p const and \p volatile qualifiers - typedef Up Type; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - - -/** - * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name - */ -#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ - template \ - struct detector_name \ - { \ - template \ - static char& test(typename C::nested_type_name*); \ - template \ - static int& test(...); \ - enum \ - { \ - VALUE = sizeof(test(0)) < sizeof(int) \ - }; \ - }; - - - -/****************************************************************************** - * Simple enable-if (similar to Boost) - ******************************************************************************/ - -/** - * \brief Simple enable-if (similar to Boost) - */ -template -struct EnableIf -{ - /// Enable-if type for SFINAE dummy variables - typedef T Type; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct EnableIf {}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - -/** - * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) - */ -template -struct BinaryOpHasIdxParam -{ -private: - template struct SFINAE1 {}; - template struct SFINAE2 {}; - template struct SFINAE3 {}; - template struct SFINAE4 {}; - - template struct SFINAE5 {}; - template struct SFINAE6 {}; - template struct SFINAE7 {}; - template struct SFINAE8 {}; - - template static char Test(SFINAE1 *); - template static char Test(SFINAE2 *); - template static char Test(SFINAE3 *); - template static char Test(SFINAE4 *); - - template static char Test(SFINAE5 *); - template static char Test(SFINAE6 *); - template static 
char Test(SFINAE7 *); - template static char Test(SFINAE8 *); - - template static int Test(...); - -public: - - /// Whether the functor BinaryOp has a third unsigned int index param - static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); -}; - - - -/****************************************************************************** - * Simple type traits utilities. - * - * For example: - * Traits::CATEGORY // SIGNED_INTEGER - * Traits::NULL_TYPE // true - * Traits::CATEGORY // NOT_A_NUMBER - * Traits::PRIMITIVE; // false - * - ******************************************************************************/ - -/** - * \brief Basic type traits categories - */ -enum Category -{ - NOT_A_NUMBER, - SIGNED_INTEGER, - UNSIGNED_INTEGER, - FLOATING_POINT -}; - - -/** - * \brief Basic type traits - */ -template -struct BaseTraits -{ - /// Category - static const Category CATEGORY = _CATEGORY; - enum - { - PRIMITIVE = _PRIMITIVE, - NULL_TYPE = _NULL_TYPE, - }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Basic type traits (unsigned primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = UNSIGNED_INTEGER; - static const UnsignedBits MIN_KEY = UnsignedBits(0); - static const UnsignedBits MAX_KEY = UnsignedBits(-1); - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key; - } - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key; - } -}; - - -/** - * Basic type traits (signed primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = SIGNED_INTEGER; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits MIN_KEY = HIGH_BIT; - static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - -}; - - -/** - * Basic type traits (fp primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = FLOATING_POINT; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits MIN_KEY = UnsignedBits(-1); - static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; - return key ^ mask; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); - return key ^ mask; - }; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Numeric type traits - */ -template struct NumericTraits : BaseTraits {}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? 
SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Type traits - */ -template -struct Traits : NumericTraits::Type> {}; - - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/util_vector.cuh b/kokkos/kokkos/TPL/cub/util_vector.cuh deleted file mode 100644 index 9a432dc..0000000 --- a/kokkos/kokkos/TPL/cub/util_vector.cuh +++ /dev/null @@ -1,166 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Vector type inference utilities - */ - -#pragma once - -#include - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - -/****************************************************************************** - * Vector type inference utilities. 
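The type-utility header removed above lost its template parameter lists to formatting (every `template` and `reinterpret_cast` appears without its angle-bracketed arguments). As a readable stand-in, here is a reconstruction of its Log2/PowerOfTwo metafunctions following the customary CUB 1.x signatures, checked against the values the original doc comment promises (Log2<8>::VALUE == 3, Log2<3>::VALUE == 2); parameter names may differ cosmetically from the deleted originals.

// Reconstruction for illustration only.
template <int N, int CURRENT_VAL = N, int COUNT = 0>
struct Log2
{
    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };   // inductive case
};

template <int N, int COUNT>
struct Log2<N, 0, COUNT>
{
    enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };      // base case: round up
};

template <int N>
struct PowerOfTwo
{
    enum { VALUE = ((N & (N - 1)) == 0) };
};

static_assert(Log2<8>::VALUE == 3, "log2(8), rounded up");
static_assert(Log2<3>::VALUE == 2, "log2(3), rounded up");
static_assert(PowerOfTwo<32>::VALUE == 1 && PowerOfTwo<24>::VALUE == 0, "power-of-two test");

int main() { return 0; }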
For example: - * - * typename VectorHelper::Type // Aliases uint2 - * - ******************************************************************************/ - -/** - * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the VectorHelper structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. - */ -template struct VectorHelper; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -enum -{ - /// The maximum number of elements in CUDA vector types - MAX_VEC_ELEMENTS = 4, -}; - - -/** - * Generic vector-1 type - */ -template -struct VectorHelper -{ - enum { BUILT_IN = false }; - - T x; - - typedef VectorHelper Type; -}; - -/** - * Generic vector-2 type - */ -template -struct VectorHelper -{ - enum { BUILT_IN = false }; - - T x; - T y; - - typedef VectorHelper Type; -}; - -/** - * Generic vector-3 type - */ -template -struct VectorHelper -{ - enum { BUILT_IN = false }; - - T x; - T y; - T z; - - typedef VectorHelper Type; -}; - -/** - * Generic vector-4 type - */ -template -struct VectorHelper -{ - enum { BUILT_IN = false }; - - T x; - T y; - T z; - T w; - - typedef VectorHelper Type; -}; - -/** - * Macro for expanding partially-specialized built-in vector types - */ -#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ - template<> struct VectorHelper { typedef short_type##1 Type; enum { BUILT_IN = true }; }; \ - template<> struct VectorHelper { typedef short_type##2 Type; enum { BUILT_IN = true }; }; \ - template<> struct VectorHelper { typedef short_type##3 Type; enum { BUILT_IN = true }; }; \ - template<> struct VectorHelper { typedef short_type##4 Type; enum { BUILT_IN = true }; }; - -// Expand CUDA vector types for built-in primitives -CUB_DEFINE_VECTOR_TYPE(char, char) -CUB_DEFINE_VECTOR_TYPE(signed char, char) -CUB_DEFINE_VECTOR_TYPE(short, short) -CUB_DEFINE_VECTOR_TYPE(int, int) -CUB_DEFINE_VECTOR_TYPE(long, long) -CUB_DEFINE_VECTOR_TYPE(long long, longlong) -CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) -CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) -CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) -CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) -CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) -CUB_DEFINE_VECTOR_TYPE(float, float) -CUB_DEFINE_VECTOR_TYPE(double, double) -CUB_DEFINE_VECTOR_TYPE(bool, uchar) - -// Undefine macros -#undef CUB_DEFINE_VECTOR_TYPE - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh b/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh deleted file mode 100644 index 317b629..0000000 --- a/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_shfl.cuh +++ /dev/null @@ -1,358 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
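util_vector.cuh, removed above, maps a (base type, lane count) pair to the matching built-in CUDA vector type so that loads and stores can be issued as one wide transaction. A minimal sketch of the same idea, compiled with nvcc so the CUDA vector types are visible, spelling out only the two specializations this demo needs (the deleted header generates the full set with CUB_DEFINE_VECTOR_TYPE):

#include <cstdio>
#include <vector_types.h>   // uint2, float4, ...

template <typename T, int VEC_ELEMENTS> struct VectorHelper;                 // primary template, unspecialized

template <> struct VectorHelper<unsigned int, 2> { typedef uint2  Type; };   // 2 x unsigned int -> uint2
template <> struct VectorHelper<float, 4>        { typedef float4 Type; };   // 4 x float        -> float4

int main()
{
    VectorHelper<unsigned int, 2>::Type v = {1u, 2u};
    VectorHelper<float, 4>::Type        w = {1.0f, 2.0f, 3.0f, 4.0f};
    std::printf("%u %u %f\n", v.x, v.y, w.w);
    return 0;
}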
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_ptx.cuh" -#include "../../util_type.cuh" -#include "../../util_macro.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction across CUDA warps. 
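Before the struct body below, it helps to see the reduction pattern WarpReduceShfl implements in its simplest form. This sketch is not the deleted specialization: it uses the modern __shfl_down_sync intrinsic in place of the inline shfl.down.b32 PTX, and it handles only a full 32-lane warp of ints.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum(const int *in, int *out)
{
    int v = in[threadIdx.x];

    // Each step pulls the partial sum from the lane `offset` positions higher,
    // halving the number of live partials: 16, 8, 4, 2, 1.
    for (int offset = 16; offset > 0; offset >>= 1)
        v += __shfl_down_sync(0xFFFFFFFFu, v, offset);

    if (threadIdx.x == 0) *out = v;   // lane 0 holds the warp total
}

int main()
{
    int h_in[32], h_out = 0, *d_in, *d_out;
    for (int i = 0; i < 32; ++i) h_in[i] = i;                 // expected total: 496

    cudaMalloc(&d_in, sizeof(h_in));
    cudaMalloc(&d_out, sizeof(int));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    warp_sum<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("warp sum = %d\n", h_out);                    // prints 496

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}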
- */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARPS, ///< Number of logical warps entrant - int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp -struct WarpReduceShfl -{ - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - /// The number of warp reduction steps - STEPS = Log2::VALUE, - - // The 5-bit SHFL mask for logically splitting warps into sub-segments - SHFL_MASK = (-1 << STEPS) & 31, - - // The 5-bit SFHL clamp - SHFL_CLAMP = LOGICAL_WARP_THREADS - 1, - - // The packed C argument (mask starts 8 bits up) - SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP, - }; - - - /// Shared memory storage layout type - typedef NullType TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - int warp_id; - int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceShfl( - TempStorage &temp_storage, - int warp_id, - int lane_id) - : - warp_id(warp_id), - lane_id(lane_id) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /// Summation (single-SHFL) - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required - { - unsigned int output = reinterpret_cast(input); - - // Iterate reduction steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - if (FULL_WARPS) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output)); - } - else - { - // Set range predicate to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0, %1, %2, %3;" - " setp.lt.u32 p, %5, %6;" - " mov.u32 %0, %1;" - " @p add.u32 %0, %1, r0;" - "}" - : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); - } - } - - return output; - } - - - /// Summation (multi-SHFL) - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required - { - // Delegate to generic reduce - return Reduce(input, 
folded_items_per_warp, cub::Sum()); - } - - - /// Summation (float) - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ float Sum( - float input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - T output = input; - - // Iterate reduction steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - if (FULL_WARPS) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output)); - } - else - { - // Set range predicate to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0, %1, %2, %3;" - " setp.lt.u32 p, %5, %6;" - " mov.f32 %0, %1;" - " @p add.f32 %0, %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); - } - } - - return output; - } - - /// Summation (generic) - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename _T> - __device__ __forceinline__ _T Sum( - _T input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) - Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; - - return Sum(input, folded_items_per_warp, single_shfl); - } - - - /// Reduction - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - typedef typename WordAlignment::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - T output = input; - T temp; - ShuffleWord *temp_alias = reinterpret_cast(&temp); - ShuffleWord *output_alias = reinterpret_cast(&output); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Grab addend from peer - const int OFFSET = 1 << STEP; - - #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) - { - unsigned int shuffle_word = output_alias[WORD]; - asm( - " shfl.down.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C)); - temp_alias[WORD] = (ShuffleWord) shuffle_word; - } - - // Perform reduction op if from a valid peer - if (FULL_WARPS) - { - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - output = reduction_op(output, temp); - } - else - { - if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp) - output = reduction_op(output, temp); - } - } - - return output; - } - - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate 
a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - typedef typename WordAlignment::ShuffleWord ShuffleWord; - - T output = input; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - T temp; - ShuffleWord *temp_alias = reinterpret_cast(&temp); - ShuffleWord *output_alias = reinterpret_cast(&output); - - // Get the start flags for each thread in the warp. - int warp_flags = __ballot(flag); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32)) - warp_flags >>= (warp_id * LOGICAL_WARP_THREADS); - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Grab addend from peer - #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) - { - unsigned int shuffle_word = output_alias[WORD]; - - asm( - " shfl.down.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(OFFSET), "r"(SHFL_C)); - temp_alias[WORD] = (ShuffleWord) shuffle_word; - - } - - // Perform reduction op if valid - if (OFFSET < next_flag - lane_id) - output = reduction_op(output, temp); - } - - return output; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh b/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh deleted file mode 100644 index a32d5fd..0000000 --- a/kokkos/kokkos/TPL/cub/warp/specializations/warp_reduce_smem.cuh +++ /dev/null @@ -1,291 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
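The SegmentedReduce deleted just above leans on a compact bit trick: ballot the head flags, mask off everything at or below the calling lane, and __clz(__brev(...)) yields the lane index of the next segment head, which bounds how far the reduction may look downward. The host-side walk-through below emulates __brev/__clz so the arithmetic can be checked without a GPU; the flag positions are arbitrary example values.

#include <cstdio>
#include <cstdint>

static uint32_t brev(uint32_t x)   // emulate __brev: reverse the 32 bits
{
    uint32_t r = 0;
    for (int i = 0; i < 32; ++i) r |= ((x >> i) & 1u) << (31 - i);
    return r;
}

static int clz(uint32_t x)         // emulate __clz: count leading zeros (32 for x == 0)
{
    for (int i = 31; i >= 0; --i)
        if (x & (1u << i)) return 31 - i;
    return 32;
}

int main()
{
    // Pretend lanes 0, 5 and 20 raised a head flag in __ballot(flag).
    const uint32_t head_flags = (1u << 0) | (1u << 5) | (1u << 20);

    for (int lane = 0; lane < 32; ++lane)
    {
        // LaneMaskGt(): bits strictly above the calling lane.
        const uint32_t lanemask_gt = (lane == 31) ? 0u : ~((1u << (lane + 1)) - 1u);

        // Lane index of the next head flag (32 when none remains).
        const int next_flag = clz(brev(head_flags & lanemask_gt));

        if (lane == 0 || lane == 4 || lane == 5 || lane == 19)
            std::printf("lane %2d: next segment head at lane %d\n", lane, next_flag);
    }
    return 0;   // prints 5, 5, 20, 20 for the lanes shown
}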
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction across CUDA warps. - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARPS, ///< Number of logical warps entrant - int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp -struct WarpReduceSmem -{ - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - /// Whether the logical warp size is a power-of-two - POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - }; - - /// Shared memory flag type - typedef unsigned char SmemFlag; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS]; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - int warp_id; - int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceSmem( - TempStorage &temp_storage, - int warp_id, - int lane_id) - : - temp_storage(temp_storage.Alias()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /** - * Reduction - */ - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Reduction 
operator - { - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage[warp_id][lane_id], input); - - // Update input if peer_addend is in range - if ((FULL_WARPS && POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) - { - T peer_addend = ThreadLoad(&temp_storage[warp_id][lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - } - - return input; - } - - - /** - * Segmented reduction - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator - { - #if CUB_PTX_ARCH >= 200 - - // Ballot-based segmented reduce - - // Get the start flags for each thread in the warp. - int warp_flags = __ballot(flag); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if ((LOGICAL_WARPS > 1) && (LOGICAL_WARP_THREADS < 32)) - warp_flags >>= (warp_id * LOGICAL_WARP_THREADS); - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input into buffer - ThreadStore(&temp_storage[warp_id][lane_id], input); - - // Update input if peer_addend is in range - if (OFFSET < next_flag - lane_id) - { - T peer_addend = ThreadLoad(&temp_storage[warp_id][lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - } - - return input; - - #else - - // Smem-based segmented reduce - - enum - { - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - // Alias flags onto shared data storage - volatile SmemFlag *flag_storage = reinterpret_cast(temp_storage[warp_id]); - - SmemFlag flag_status = (flag) ? SET : UNSET; - - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage[warp_id][lane_id], input); - - // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage[warp_id][lane_id + OFFSET]); - - // Share flag through buffer - flag_storage[lane_id] = flag_status; - - // Get peer flag from buffer - SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; - - // Update input if peer was in range - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - { - if (HEAD_SEGMENTED) - { - // Head-segmented - if ((flag_status & SEEN) == 0) - { - // Has not seen a more distant head flag - if (peer_flag_status & SET) - { - // Has now seen a head flag - flag_status |= SEEN; - } - else - { - // Peer is not a head flag: grab its count - input = reduction_op(input, peer_addend); - } - - // Update seen status to include that of peer - flag_status |= (peer_flag_status & SEEN); - } - } - else - { - // Tail-segmented. 
Simply propagate flag status - if (!flag_status) - { - input = reduction_op(input, peer_addend); - flag_status |= peer_flag_status; - } - - } - } - } - - return input; - - #endif - } - - - /** - * Summation - */ - template < - bool FULL_WARPS, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - return Reduce(input, folded_items_per_warp, cub::Sum()); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh b/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh deleted file mode 100644 index 5585396..0000000 --- a/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_shfl.cuh +++ /dev/null @@ -1,371 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_type.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan across CUDA warps. 
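For contrast with the shuffle path, the shared-memory specialization deleted above follows the same halving schedule but communicates through a per-warp buffer. The kernel below is a stripped-down restatement for a single full warp of ints: __syncwarp() makes explicit the lock-step execution the 2013-era code assumed, and a bounds check replaces the original's trick of padding the buffer to 1.5 warps so that out-of-range reads are harmless.

__global__ void warp_sum_smem(const int *in, int *out)
{
    __shared__ int buf[32];
    const int lane = threadIdx.x;

    int v = in[lane];
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        buf[lane] = v;                 // publish current partial sum
        __syncwarp();
        if (lane + offset < 32)
            v += buf[lane + offset];   // fold in the peer's partial sum
        __syncwarp();                  // keep next iteration's writes from racing these reads
    }

    if (lane == 0) out[0] = v;         // lane 0 holds the warp total
}

Launched as warp_sum_smem<<<1, 32>>>(d_in, d_out) with the same 0..31 input as the shuffle example, lane 0 again produces 496.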
- */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARPS, ///< Number of logical warps entrant - int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp -struct WarpScanShfl -{ - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - /// The number of warp scan steps - STEPS = Log2::VALUE, - - // The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = ((-1 << STEPS) & 31) << 8, - }; - - /// Shared memory storage layout type - typedef NullType TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - int warp_id; - int lane_id; - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpScanShfl( - TempStorage &temp_storage, - int warp_id, - int lane_id) - : - warp_id(warp_id), - lane_id(lane_id) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - typedef typename WordAlignment::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) - { - unsigned int shuffle_word = input_alias[WORD]; - asm("shfl.idx.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(LOGICAL_WARP_THREADS - 1)); - output_alias[WORD] = (ShuffleWord) shuffle_word; - } - - return output; - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive prefix sum with aggregate (single-SHFL) - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items. - Int2Type single_shfl) - { - unsigned int temp = reinterpret_cast(input); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp)); - } - - output = temp; - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); - } - - - /// Inclusive prefix sum with aggregate (multi-SHFL) - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- T &warp_aggregate, ///< [out] Warp-wide aggregate reduction of input items. - Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required - { - // Delegate to generic scan - InclusiveScan(input, output, Sum(), warp_aggregate); - } - - - /// Inclusive prefix sum with aggregate (specialized for float) - __device__ __forceinline__ void InclusiveSum( - float input, ///< [in] Calling thread's input item. - float &output, ///< [out] Calling thread's output item. May be aliased with \p input. - float &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output)); - } - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); - } - - - /// Inclusive prefix sum with aggregate (specialized for unsigned long long) - __device__ __forceinline__ void InclusiveSum( - unsigned long long input, ///< [in] Calling thread's input item. - unsigned long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. - unsigned long long &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .u32 r1;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 r0|p, lo, %2, %3;" - " shfl.up.b32 r1|p, hi, %2, %3;" - " @p add.cc.u32 r0, r0, lo;" - " @p addc.u32 r1, r1, hi;" - " mov.b64 %0, {r0, r1};" - "}" - : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); - } - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); - } - - - /// Inclusive prefix sum with aggregate (generic) - template - __device__ __forceinline__ void InclusiveSum( - _T input, ///< [in] Calling thread's input item. - _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - _T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) - Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; - - InclusiveSum(input, output, warp_aggregate, single_shfl); - } - - - /// Inclusive prefix sum - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - T warp_aggregate; - InclusiveSum(input, output, warp_aggregate); - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
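The specializations above all follow one pattern: shuffle a partial up by 1 << STEP lanes and fold it in only when the source lane exists, moving types wider than 32 bits as separate words. The sketch below restates that pattern in plain CUDA; the __shfl_up_sync intrinsic and the explicit lane check are assumptions (the original relies on the shfl predicate in inline PTX), and the helper names are hypothetical.

// Shuffle-up inclusive sum: fold in the peer's partial only when the source
// lane is valid, mirroring the predicated add in the PTX above.
__device__ __forceinline__ int WarpInclusiveSumSketch(int x)
{
    const unsigned int full_mask = 0xffffffffu;
    const int lane = threadIdx.x & 31;

    #pragma unroll
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int peer = __shfl_up_sync(full_mask, x, offset);
        if (lane >= offset)        // source lane exists: fold it in
            x += peer;
    }
    return x;                      // lane i now holds the sum of lanes 0..i
}

// Values wider than 32 bits travel as separate words, mirroring the two
// shfl.up.b32 instructions in the unsigned long long specialization.
__device__ __forceinline__ unsigned long long
ShuffleUp64Sketch(unsigned long long x, int offset)
{
    const unsigned int full_mask = 0xffffffffu;
    unsigned int lo = static_cast<unsigned int>(x);
    unsigned int hi = static_cast<unsigned int>(x >> 32);
    lo = __shfl_up_sync(full_mask, lo, offset);
    hi = __shfl_up_sync(full_mask, hi, offset);
    return (static_cast<unsigned long long>(hi) << 32) | lo;
}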
- { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Grab addend from peer - const int OFFSET = 1 << STEP; - T temp = ShuffleUp(output, OFFSET); - - // Perform scan op if from a valid peer - if (lane_id >= OFFSET) - output = scan_op(temp, output); - } - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); - } - - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - T warp_aggregate; - InclusiveScan(input, output, scan_op, warp_aggregate); - } - - - //--------------------------------------------------------------------- - // Exclusive operations - //--------------------------------------------------------------------- - - /// Exclusive scan with aggregate - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - // Compute inclusive scan - T inclusive; - InclusiveScan(input, inclusive, scan_op, warp_aggregate); - - // Grab result from predecessor - T exclusive = ShuffleUp(inclusive, 1); - - output = (lane_id == 0) ? - identity : - exclusive; - } - - - /// Exclusive scan - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - T warp_aggregate; - ExclusiveScan(input, output, identity, scan_op, warp_aggregate); - } - - - /// Exclusive scan with aggregate, without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - // Compute inclusive scan - T inclusive; - InclusiveScan(input, inclusive, scan_op, warp_aggregate); - - // Grab result from predecessor - output = ShuffleUp(inclusive, 1); - } - - - /// Exclusive scan without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - T warp_aggregate; - ExclusiveScan(input, output, scan_op, warp_aggregate); - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh b/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh deleted file mode 100644 index 513b35c..0000000 --- a/kokkos/kokkos/TPL/cub/warp/specializations/warp_scan_smem.cuh +++ /dev/null @@ -1,327 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanSmem provides smem-based variants of parallel prefix scan across CUDA warps. 
- */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARPS, ///< Number of logical warps entrant - int LOGICAL_WARP_THREADS> ///< Number of threads per logical warp -struct WarpScanSmem -{ - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - }; - - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - typedef T _TempStorage[LOGICAL_WARPS][WARP_SMEM_ELEMENTS]; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - unsigned int warp_id; - unsigned int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpScanSmem( - TempStorage &temp_storage, - int warp_id, - int lane_id) - : - temp_storage(temp_storage.Alias()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /// Initialize identity padding (specialized for operations that have identity) - __device__ __forceinline__ void InitIdentity(Int2Type has_identity) - { - T identity = T(); - ThreadStore(&temp_storage[warp_id][lane_id], identity); - } - - - /// Initialize identity padding (specialized for operations without identity) - __device__ __forceinline__ void InitIdentity(Int2Type has_identity) - {} - - - /// Basic inclusive scan iteration(template unrolled, base-case specialization) - template < - bool HAS_IDENTITY, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type step) - {} - - - /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) - template < - bool HAS_IDENTITY, - int STEP, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type step) - { - const int OFFSET = 1 << STEP; - - // Share partial into buffer - ThreadStore(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial); - - // Update partial if addend is in range - if (HAS_IDENTITY || (lane_id >= OFFSET)) - { - T addend = ThreadLoad(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - OFFSET]); - partial = scan_op(addend, partial); - } - - ScanStep(partial, scan_op, Int2Type()); - } - - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - if (lane_id == src_lane) - { - ThreadStore(temp_storage[warp_id], input); - } - - return ThreadLoad(temp_storage[warp_id]); - } - - - /// Basic inclusive scan - template < - bool HAS_IDENTITY, - bool SHARE_FINAL, - typename ScanOp> - __device__ __forceinline__ T BasicScan( - T partial, 
///< Calling thread's input partial reduction - ScanOp scan_op) ///< Binary associative scan functor - { - // Iterate scan steps - ScanStep(partial, scan_op, Int2Type<0>()); - - if (SHARE_FINAL) - { - // Share partial into buffer - ThreadStore(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id], partial); - } - - return partial; - } - - - /// Inclusive prefix sum - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - const bool HAS_IDENTITY = Traits::PRIMITIVE; - - // Initialize identity region - InitIdentity(Int2Type()); - - // Compute inclusive warp scan (has identity, don't share final) - output = BasicScan(input, Sum()); - } - - - /// Inclusive prefix sum with aggregate - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - const bool HAS_IDENTITY = Traits::PRIMITIVE; - - // Initialize identity region - InitIdentity(Int2Type()); - - // Compute inclusive warp scan (has identity, share final) - output = BasicScan(input, Sum()); - - // Retrieve aggregate in warp-lane0 - warp_aggregate = ThreadLoad(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); - } - - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive warp scan (no identity, don't share final) - output = BasicScan(input, scan_op); - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - // Compute inclusive warp scan (no identity, share final) - output = BasicScan(input, scan_op); - - // Retrieve aggregate - warp_aggregate = ThreadLoad(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); - } - - /// Exclusive scan - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - // Initialize identity region - ThreadStore(&temp_storage[warp_id][lane_id], identity); - - // Compute inclusive warp scan (identity, share final) - T inclusive = BasicScan(input, scan_op); - - // Retrieve exclusive scan - output = ThreadLoad(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]); - } - - - /// Exclusive scan with aggregate - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
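The shared-memory variant avoids the per-step lane check by padding the front of each warp's buffer with half a warp's worth of identity values: low lanes read identity instead of branching, and the exclusive result and aggregate fall straight out of the buffer. The sketch below illustrates that layout for a 32-thread warp summing ints; the helper name is hypothetical, and the __syncwarp() calls are an addition for post-Volta correctness (the original relies on lockstep warp execution).

// Padded-buffer scan sketch: 'buf' is 48 ints per warp, 16 identity slots
// followed by 32 data slots, so data[lane - offset] is always in bounds.
__device__ __forceinline__ int WarpInclusiveSumSmemSketch(int *buf, int x)
{
    const int lane = threadIdx.x & 31;

    if (lane < 16)
        buf[lane] = 0;             // identity padding read by lanes 0..15
    int *data = buf + 16;          // this lane's partial lives at data[lane]
    data[lane] = x;
    __syncwarp();

    #pragma unroll
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int addend = data[lane - offset];   // padding supplies 0 for low lanes
        __syncwarp();
        x += addend;
        data[lane] = x;                     // publish the updated partial
        __syncwarp();
    }

    // At this point the buffer also provides, with no extra work:
    //   exclusive result = data[lane - 1]  (padding covers lane 0)
    //   warp aggregate   = data[31]        (the last buffer element)
    return x;
}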
- { - // Exclusive warp scan (which does share final) - ExclusiveScan(input, output, identity, scan_op); - - // Retrieve aggregate - warp_aggregate = ThreadLoad(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); - } - - - /// Exclusive scan without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive warp scan (no identity, share final) - T inclusive = BasicScan(input, scan_op); - - // Retrieve exclusive scan - output = ThreadLoad(&temp_storage[warp_id][HALF_WARP_THREADS + lane_id - 1]); - } - - - /// Exclusive scan with aggregate, without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - // Exclusive warp scan (which does share final) - ExclusiveScan(input, output, scan_op); - - // Retrieve aggregate - warp_aggregate = ThreadLoad(&temp_storage[warp_id][WARP_SMEM_ELEMENTS - 1]); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/warp_reduce.cuh b/kokkos/kokkos/TPL/cub/warp/warp_reduce.cuh deleted file mode 100644 index 548369d..0000000 --- a/kokkos/kokkos/TPL/cub/warp/warp_reduce.cuh +++ /dev/null @@ -1,677 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. - */ - -#pragma once - -#include "specializations/warp_reduce_shfl.cuh" -#include "specializations/warp_reduce_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across CUDA warp threads. ![](warp_reduce_logo.png) - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - * \tparam T The reduction input/output element type - * \tparam LOGICAL_WARPS [optional] The number of entrant "logical" warps performing concurrent warp reductions. Default is 1. - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * - * \par Simple Examples - * \warpcollective{WarpReduce} - * \par - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for 4 warps on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, 1, 2, 3, ..., 127. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - * \par - * The code snippet below illustrates a single warp sum reduction within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for one warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a reduction - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sum to lane0 - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is 0, 1, 2, 3, ..., 31. - * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). 
- * - * \par Usage and Performance Considerations - * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - Warp reductions are concurrent if more than one logical warp is participating - * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) - * - Uses synchronization-free communication between warp lanes when applicable - * - Zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - */ -template < - typename T, - int LOGICAL_WARPS = 1, - int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS> -class WarpReduce -{ -private: - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - }; - -public: - - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two)) - typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO), - WarpReduceShfl, - WarpReduceSmem >::Type InternalWarpReduce; - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - -private: - - /// Shared memory storage layout type for WarpReduce - typedef typename InternalWarpReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Warp ID - int warp_id; - - /// Lane ID - int lane_id; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{WarpReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - * - */ - __device__ __forceinline__ WarpReduce() - : - temp_storage(PrivateStorage()), - warp_id((LOGICAL_WARPS == 1) ? - 0 : - threadIdx.x / LOGICAL_WARP_THREADS), - lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? - LaneId() : - threadIdx.x % LOGICAL_WARP_THREADS) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. 
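The constructors above derive warp_id as threadIdx.x / LOGICAL_WARP_THREADS and lane_id as threadIdx.x % LOGICAL_WARP_THREADS, which is what makes the "logical warps smaller than the physical warp" feature from the considerations list work. A hedged usage sketch with 16-thread logical warps is given below; the <int, 8, 16> arguments follow the three-parameter signature above and are an assumption, as is the <cub/cub.cuh> include. Launch with 128 threads per block.

#include <cub/cub.cuh>   // assumed include path

__global__ void LogicalWarpSumExample(const int *d_in, int *d_out)
{
    // Eight logical warps of 16 threads each; 16 is a power of two, so the
    // SHFL-based specialization is still eligible on sm_30 and newer.
    typedef cub::WarpReduce<int, 8, 16> WarpReduce16;

    __shared__ typename WarpReduce16::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Each group of 16 consecutive threads reduces independently;
    // results are valid in the lane0 of each logical warp (threads 0, 16, 32, ...)
    int aggregate = WarpReduce16(temp_storage).Sum(thread_data);

    if ((threadIdx.x & 15) == 0)
        d_out[blockIdx.x * 8 + (threadIdx.x >> 4)] = aggregate;
}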
- */ - __device__ __forceinline__ WarpReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - warp_id((LOGICAL_WARPS == 1) ? - 0 : - threadIdx.x / LOGICAL_WARP_THREADS), - lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? - LaneId() : - threadIdx.x % LOGICAL_WARP_THREADS) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers. - */ - __device__ __forceinline__ WarpReduce( - int warp_id, ///< [in] A suitable warp membership identifier - int lane_id) ///< [in] A lane identifier within the warp - : - temp_storage(PrivateStorage()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers. - */ - __device__ __forceinline__ WarpReduce( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int warp_id, ///< [in] A suitable warp membership identifier - int lane_id) ///< [in] A lane identifier within the warp - : - temp_storage(temp_storage.Alias()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a warp-wide sum in each active warp. The output is valid in warp lane0. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for 4 warps on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, 1, 2, 3, ..., 127. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum(input, LOGICAL_WARP_THREADS); - } - - /** - * \brief Computes a partially-full warp-wide sum in each active warp. The output is valid in warp lane0. - * - * All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * \smemreuse - * - * The code snippet below illustrates a sum reduction within a single, partially-full - * block of 32 threads (one warp). 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).Sum( - * thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is - * undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - if (valid_items >= LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum(input, valid_items); - } - else - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Sum(input, valid_items); - } - } - - - /** - * \brief Computes a segmented sum in each active warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * The code snippet below illustrates a head-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( - * thread_data, head_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is 0, 1, 2, 3, ..., 31 and is 1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - * - */ - template < - typename Flag> - __device__ __forceinline__ T HeadSegmentedSum( - T input, ///< [in] Calling thread's input - Flag head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return HeadSegmentedReduce(input, head_flag, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in each active warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * The code snippet below illustrates a tail-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( - * thread_data, tail_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is 0, 1, 2, 3, ..., 31 and is 0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename Flag> - __device__ __forceinline__ T TailSegmentedSum( - T input, ///< [in] Calling thread's input - Flag tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return TailSegmentedReduce(input, tail_flag, cub::Sum()); - } - - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - /** - * \brief Computes a warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp max reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for 4 warps on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).Reduce( - * thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, 1, 2, 3, ..., 127. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, - * \p 95, and \p 127, respectively (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce(input, LOGICAL_WARP_THREADS, reduction_op); - } - - /** - * \brief Computes a partially-full warp-wide reduction in each active warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * All threads in each logical warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * The code snippet below illustrates a max reduction within a single, partially-full - * block of 32 threads (one warp). 
- * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).Reduce( - * thread_data, cub::Max(), valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is - * undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction operator - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - if (valid_items >= LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce(input, valid_items, reduction_op); - } - else - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).Reduce(input, valid_items, reduction_op); - } - } - - - /** - * \brief Computes a segmented reduction in each active warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * The code snippet below illustrates a head-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( - * thread_data, head_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is 0, 1, 2, 3, ..., 31 and is 1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). 
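The partially-full reduction snippet above also appears with its template arguments stripped. A completed version is sketched below, following the documented contract that every lane must pass the same valid_items; assigning INT_MIN to out-of-range lanes is an assumption for definiteness (the original snippet leaves their thread_data unwritten, and with bounds checking those lanes never contribute). Launch with a single warp of 32 threads.

#include <climits>
#include <cub/cub.cuh>   // assumed include path

__global__ void PartialWarpMaxExample(const int *d_data, int *d_out, int valid_items)
{
    // One logical warp of 32 threads on type int
    typedef cub::WarpReduce<int> WarpReduce;

    __shared__ typename WarpReduce::TempStorage temp_storage;

    // Out-of-range lanes still take part in the collective; give them a
    // harmless value so nothing uninitialized is read.
    int thread_data = (threadIdx.x < valid_items) ? d_data[threadIdx.x] : INT_MIN;

    // Every lane must pass the same valid_items value
    int aggregate = WarpReduce(temp_storage).Reduce(thread_data, cub::Max(), valid_items);

    if (threadIdx.x == 0)
        *d_out = aggregate;   // e.g. 3 when d_data = 0,1,2,3,... and valid_items = 4
}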
- * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename Flag> - __device__ __forceinline__ T HeadSegmentedReduce( - T input, ///< [in] Calling thread's input - Flag head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce(input, head_flag, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in each active warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * The code snippet below illustrates a tail-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for a single warp on type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate shared memory for WarpReduce - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( - * thread_data, tail_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is 0, 1, 2, 3, ..., 31 and is 0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename Flag> - __device__ __forceinline__ T TailSegmentedReduce( - T input, ///< [in] Calling thread's input - Flag tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage, warp_id, lane_id).template SegmentedReduce(input, tail_flag, reduction_op); - } - - - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/cub/warp/warp_scan.cuh b/kokkos/kokkos/TPL/cub/warp/warp_scan.cuh deleted file mode 100644 index a588b52..0000000 --- a/kokkos/kokkos/TPL/cub/warp/warp_scan.cuh +++ /dev/null @@ -1,1297 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2013, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across CUDA warp threads. - */ - -#pragma once - -#include "specializations/warp_scan_shfl.cuh" -#include "specializations/warp_scan_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across CUDA warp threads. ![](warp_scan_logo.png) - * - * \par Overview - * Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - * \tparam T The scan input/output element type - * \tparam LOGICAL_WARPS [optional] The number of "logical" warps performing concurrent warp scans. Default is 1. - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). - * - * \par Simple Examples - * \warpcollective{WarpScan} - * \par - * The code snippet below illustrates four concurrent warp prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute warp-wide prefix sums - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, 3, ..., 31. - * - * \par - * The code snippet below illustrates a single warp prefix sum within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for one warp on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a prefix sum - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute warp-wide prefix sums - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data will be 0, 1, 2, 3, ..., 31. - * - * \par Usage and Performance Considerations - * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - Warp scans are concurrent if more than one warp is participating - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Zero bank conflicts for most types. - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic scan) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - */ -template < - typename T, - int LOGICAL_WARPS = 1, - int LOGICAL_WARP_THREADS = PtxArchProps::WARP_THREADS> -class WarpScan -{ -private: - - /****************************************************************************** - * Constants and typedefs - ******************************************************************************/ - - enum - { - POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - }; - - /// Internal specialization. 
Use SHFL-based reduction if (architecture is >= SM30) and ((only one logical warp) or (LOGICAL_WARP_THREADS is a power-of-two)) - typedef typename If<(CUB_PTX_ARCH >= 300) && ((LOGICAL_WARPS == 1) || POW_OF_TWO), - WarpScanShfl, - WarpScanSmem >::Type InternalWarpScan; - - /// Shared memory storage layout type for WarpScan - typedef typename InternalWarpScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Warp ID - int warp_id; - - /// Lane ID - int lane_id; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{WarpScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor for 1D thread blocks using a private static allocation of shared memory as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpScan() - : - temp_storage(PrivateStorage()), - warp_id((LOGICAL_WARPS == 1) ? - 0 : - threadIdx.x / LOGICAL_WARP_THREADS), - lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? - LaneId() : - threadIdx.x % LOGICAL_WARP_THREADS) - {} - - - /** - * \brief Collective constructor for 1D thread blocks using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - warp_id((LOGICAL_WARPS == 1) ? - 0 : - threadIdx.x / LOGICAL_WARP_THREADS), - lane_id(((LOGICAL_WARPS == 1) || (LOGICAL_WARP_THREADS == PtxArchProps::WARP_THREADS)) ? - LaneId() : - threadIdx.x % LOGICAL_WARP_THREADS) - {} - - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. Threads are identified using the given warp and lane identifiers. - */ - __device__ __forceinline__ WarpScan( - int warp_id, ///< [in] A suitable warp membership identifier - int lane_id) ///< [in] A lane identifier within the warp - : - temp_storage(PrivateStorage()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Threads are identified using the given warp and lane identifiers. 
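The class-level prefix-sum example earlier in this file also lost its template arguments in the diff. A completed version is sketched below, assuming the WarpScan<T, LOGICAL_WARPS, LOGICAL_WARP_THREADS> signature above and the conventional <cub/cub.cuh> include. For the all-ones input described in the class comment, each warp writes back 0, 1, 2, ..., 31. Launch with 128 threads per block.

#include <cub/cub.cuh>   // assumed include path

__global__ void WarpPrefixSumExample(int *d_data)
{
    // Specialize WarpScan for 4 logical warps on type int
    typedef cub::WarpScan<int, 4> WarpScan;

    // Allocate shared memory for WarpScan
    __shared__ typename WarpScan::TempStorage temp_storage;

    // Obtain one input item per thread
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int thread_data = d_data[idx];

    // Compute exclusive warp-wide prefix sums in place
    WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);

    d_data[idx] = thread_data;
}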
- */ - __device__ __forceinline__ WarpScan( - TempStorage &temp_storage, ///< [in] Reference to memory allocation having layout type TempStorage - int warp_id, ///< [in] A suitable warp membership identifier - int lane_id) ///< [in] A lane identifier within the warp - : - temp_storage(temp_storage.Alias()), - warp_id(warp_id), - lane_id(lane_id) - {} - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive prefix sum in each logical warp. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix sums - * WarpScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output); - } - - - /** - * \brief Computes an inclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * The \p warp_aggregate is undefined in threads other than warp-lane0. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix sums - * int warp_aggregate; - * WarpScan(temp_storage).InclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveSum(input, output, warp_aggregate); - } - - - /** - * \brief Computes an inclusive prefix sum in each logical warp. 
Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * The \p warp_aggregate is undefined in threads other than warp-lane0. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * \smemreuse - * - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total += warp_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize WarpScan for one warp - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixOp prefix_op(0); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide inclusive prefix sum - * int warp_aggregate; - * WarpScan(temp_storage).InclusiveSum( - * thread_data, thread_data, warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, 3, ..., 32. - * The output for the second segment will be 33, 34, 35, ..., 64. Furthermore, - * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan. - * - * \tparam WarpPrefixOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items, exclusive of the \p warp_prefix_op value - WarpPrefixOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. 
-    {
-        // Compute inclusive warp scan
-        InclusiveSum(input, output, warp_aggregate);
-
-        // Compute warp-wide prefix from aggregate, then broadcast to other lanes
-        T prefix;
-        prefix = warp_prefix_op(warp_aggregate);
-        prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0);
-
-        // Update output
-        output = prefix + output;
-    }
-
-    //@} end member group
-
-private:
-
-    /// Computes an exclusive prefix sum in each logical warp.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<true> is_primitive)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum in each logical warp. Specialized for non-primitive types.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<false> is_primitive)
-    {
-        // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction)
-        T identity = T();
-        ExclusiveScan(input, output, identity, Sum());
-    }
-
-    /// Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<true> is_primitive)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive, warp_aggregate);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. Specialized for non-primitive types.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<false> is_primitive)
-    {
-        // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction)
-        T identity = T();
-        ExclusiveScan(input, output, identity, Sum(), warp_aggregate);
-    }
-
-    /// Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-    template <typename WarpPrefixOp>
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type<true> is_primitive)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive, warp_aggregate, warp_prefix_op);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. Specialized for non-primitive types.
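[Editor's note] The primitive-type helpers above derive the exclusive sum from the inclusive sum as output = inclusive - input, falling back to a full ExclusiveScan with a default-constructed identity when subtraction is unavailable. The standalone warp-shuffle sketch below illustrates the same identity in plain CUDA; it is not CUB's implementation and assumes a full 32-lane warp.

// Illustration only (not CUB's implementation): an inclusive warp prefix sum
// built from __shfl_up_sync, with the exclusive result derived as
// (inclusive - input), mirroring the primitive-type helpers above.
// Assumes all 32 lanes of the warp are active.
__device__ __forceinline__ void WarpPrefixSums(int input, int &inclusive, int &exclusive)
{
    const unsigned full_mask = 0xffffffffu;
    const int lane = threadIdx.x & 31;

    // Kogge-Stone inclusive scan across the warp
    inclusive = input;
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int neighbor = __shfl_up_sync(full_mask, inclusive, offset);
        if (lane >= offset)
            inclusive += neighbor;
    }

    // Exclusive sum: subtract the calling thread's own input (lane 0 gets 0)
    exclusive = inclusive - input;
}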
- template - __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixOp &warp_prefix_op, Int2Type is_primitive) - { - // Delegate to regular scan for non-primitive types (because we won't be able to use subtraction) - T identity = T(); - ExclusiveScan(input, output, identity, Sum(), warp_aggregate, warp_prefix_op); - } - -public: - - - /******************************************************************//** - * \name Exclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive prefix sum in each logical warp. - * - * This operation assumes the value of obtained by the T's default - * constructor (or by zero-initialization if no user-defined default - * constructor exists) is suitable as the identity value "zero" for - * addition. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix sums - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - ExclusiveSum(input, output, Int2Type::PRIMITIVE>()); - } - - - /** - * \brief Computes an exclusive prefix sum in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * This operation assumes the value of obtained by the T's default - * constructor (or by zero-initialization if no user-defined default - * constructor exists) is suitable as the identity value "zero" for - * addition. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix sums - * int warp_aggregate; - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, 1, 1, .... - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - ExclusiveSum(input, output, warp_aggregate, Int2Type::PRIMITIVE>()); - } - - - /** - * \brief Computes an exclusive prefix sum in each logical warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * This operation assumes the value of obtained by the T's default - * constructor (or by zero-initialization if no user-defined default - * constructor exists) is suitable as the identity value "zero" for - * addition. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * \smemreuse - * - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total += warp_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize WarpScan for one warp - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixOp prefix_op(0); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide exclusive prefix sum - * int warp_aggregate; - * WarpScan(temp_storage).ExclusiveSum( - * thread_data, thread_data, warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, ..., 31. - * The output for the second segment will be 32, 33, 34, ..., 63. Furthermore, - * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan. - * - * \tparam WarpPrefixOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. 
May be aliased with \p input. - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). - WarpPrefixOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. - { - ExclusiveSum(input, output, warp_aggregate, warp_prefix_op, Int2Type::PRIMITIVE>()); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * WarpScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_aggregate; - * WarpScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. 
- * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage, warp_id, lane_id).InclusiveScan(input, output, scan_op, warp_aggregate); - } - - - /** - * \brief Computes an inclusive prefix sum using the specified binary scan functor in each logical warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize WarpScan for one warp - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixOp prefix_op(0); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide inclusive prefix max scan - * int warp_aggregate; - * WarpScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 30, 30. 
- * The output for the second segment will be 32, 32, 34, 34, ..., 62, 62. Furthermore, - * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam WarpPrefixOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template < - typename ScanOp, - typename WarpPrefixOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). - WarpPrefixOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. - { - // Compute inclusive warp scan - InclusiveScan(input, output, scan_op, warp_aggregate); - - // Compute warp-wide prefix from aggregate, then broadcast to other lanes - T prefix; - prefix = warp_prefix_op(warp_aggregate); - prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); - - // Update output - output = scan_op(prefix, output); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * Supports non-commutative scan operators. 
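[Editor's note] The running-prefix pattern documented above (a WarpPrefixOp functor whose operator() is entered by every lane with the warp aggregate, and whose lane0 return value seeds the next tile's scan) can be reconstructed as below. The WarpScan template arguments were stripped from this diff, so <int, 1> is an assumption; the per-lane load index and the INT_MIN seed are illustrative choices, and num_items is assumed to be a multiple of 32.

// Hedged reconstruction of the running-prefix (WarpPrefixOp) max-scan pattern.
#include <limits.h>
#include <cub/cub.cuh>

struct WarpPrefixOp
{
    int running_total;

    __device__ WarpPrefixOp(int running_total) : running_total(running_total) {}

    // Entered by the whole warp; lane0's return value seeds the next tile
    __device__ int operator()(int warp_aggregate)
    {
        int old_prefix = running_total;
        running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
        return old_prefix;
    }
};

__global__ void RunningMaxScanKernel(int *d_data, int num_items)
{
    typedef cub::WarpScan<int, 1> WarpScan;   // one warp; template arguments assumed
    __shared__ typename WarpScan::TempStorage temp_storage;

    WarpPrefixOp prefix_op(INT_MIN);          // neutral seed for a max scan

    for (int block_offset = 0; block_offset < num_items; block_offset += 32)
    {
        int thread_data = d_data[block_offset + threadIdx.x];

        int warp_aggregate;
        WarpScan(temp_storage).InclusiveScan(
            thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op);

        d_data[block_offset + threadIdx.x] = thread_data;
    }
}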
- * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, identity, scan_op, warp_aggregate); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total = (warp_aggregate > old_prefix) ? 
warp_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize WarpScan for one warp - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixOp prefix_op(INT_MIN); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide exclusive prefix max scan - * int warp_aggregate; - * WarpScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 28, 30. - * The output for the second segment will be 30, 32, 32, 34, ..., 60, 62. Furthermore, - * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam WarpPrefixOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template < - typename ScanOp, - typename WarpPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). - WarpPrefixOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. - { - // Exclusive warp scan - ExclusiveScan(input, output, identity, scan_op, warp_aggregate); - - // Compute warp-wide prefix from aggregate, then broadcast to other lanes - T prefix = warp_prefix_op(warp_aggregate); - prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); - - // Update output - output = (lane_id == 0) ? - prefix : - scan_op(prefix, output); - } - - - //@} end member group - /******************************************************************//** - * \name Identityless exclusive prefix scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in each warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for 4 warps on type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * WarpScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in each warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage, warp_id, lane_id).ExclusiveScan(input, output, scan_op, warp_aggregate); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor in each logical warp. The \p warp_prefix_op value from thread-thread-lane0 is applied to all scan outputs. 
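[Editor's note] The identityless overloads above leave each warp-lane0 output undefined, whereas the identity-taking forms earlier seed lane0 with the supplied value (INT_MIN in the max-scan examples). A sketch contrasting the two, again with assumed <int, 4> template arguments and illustrative inputs:

#include <limits.h>
#include <cub/cub.cuh>

__global__ void ExclusiveMaxKernel(int *d_with_identity, int *d_without_identity)
{
    typedef cub::WarpScan<int, 4> WarpScan;   // template arguments assumed
    __shared__ typename WarpScan::TempStorage temp_storage;

    int a = threadIdx.x;   // illustrative inputs
    int b = threadIdx.x;

    // Identity form: warp-lane0 receives INT_MIN
    WarpScan(temp_storage).ExclusiveScan(a, a, INT_MIN, cub::Max());

    __syncthreads();       // barrier before reusing temp_storage

    // Identityless form: warp-lane0's output is undefined
    WarpScan(temp_storage).ExclusiveScan(b, b, cub::Max());

    d_with_identity[threadIdx.x] = a;
    d_without_identity[threadIdx.x] = b;
}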
Also computes the warp-wide \p warp_aggregate of all inputs for thread-thread-lane0. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize WarpScan for one warp - * typedef cub::WarpScan WarpScan; - * - * // Allocate shared memory for WarpScan - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixOp prefix_op(INT_MIN); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide exclusive prefix max scan - * int warp_aggregate; - * WarpScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 28, 30. - * The output for the second segment will be 30, 32, 32, 34, ..., 60, 62. Furthermore, - * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam WarpPrefixOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template < - typename ScanOp, - typename WarpPrefixOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). 
- WarpPrefixOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. - { - // Exclusive warp scan - ExclusiveScan(input, output, scan_op, warp_aggregate); - - // Compute warp-wide prefix from aggregate, then broadcast to other lanes - T prefix = warp_prefix_op(warp_aggregate); - prefix = InternalWarpScan(temp_storage, warp_id, lane_id).Broadcast(prefix, 0); - - // Update output with prefix - output = (lane_id == 0) ? - prefix : - scan_op(prefix, output); - } - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/kokkos/kokkos/TPL/gtest.readme b/kokkos/kokkos/TPL/gtest.readme deleted file mode 100644 index 91cc4c3..0000000 --- a/kokkos/kokkos/TPL/gtest.readme +++ /dev/null @@ -1,18 +0,0 @@ -These files were created using the python script included with the gtest distribution. - -# Fusing Google Test Source Files -# -# Google Test's implementation consists of ~30 files (excluding its own tests). -# Sometimes you may want them to be packaged up in two files (a .h and a .cc) instead, -# such that you can easily copy them to a new machine and start hacking there. For -# this we provide an experimental Python script fuse_gtest_files.py in the scripts/ -# directory (since release 1.3.0). Assuming you have Python 2.4 or above installed on -# your machine, just go to that directory and run -# -# python fuse_gtest_files.py OUTPUT_DIR -# -# and you should see an OUTPUT_DIR directory being created with files gtest/gtest.h -# and gtest/gtest-all.cc in it. These files contain everything you need to use Google -# Test. Just copy them to anywhere you want and you are ready to write tests. You can -# use the scripts/test/Makefile file as an example on how to compile your tests against -# them. diff --git a/kokkos/kokkos/TPL/gtest/GTEST_COPYING b/kokkos/kokkos/TPL/gtest/GTEST_COPYING deleted file mode 100644 index 1941a11..0000000 --- a/kokkos/kokkos/TPL/gtest/GTEST_COPYING +++ /dev/null @@ -1,28 +0,0 @@ -Copyright 2008, Google Inc. -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. 
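[Editor's note] For context on the gtest.readme above: a test compiled against the fused gtest/gtest.h and gtest/gtest-all.cc looks no different from one compiled against a regular Google Test checkout. A minimal hedged sketch; the file name, directory, and compile line are illustrative only and not taken from this repository.

// minimal_test.cc -- a test compiled against the fused sources, e.g.
//   g++ -I OUTPUT_DIR minimal_test.cc OUTPUT_DIR/gtest/gtest-all.cc -lpthread
#include "gtest/gtest.h"

TEST(FusedGtestSmokeTest, BasicArithmetic)
{
    EXPECT_EQ(4, 2 + 2);
    EXPECT_LT(1, 2);
}

int main(int argc, char **argv)
{
    ::testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}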
IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/kokkos/kokkos/TPL/gtest/gtest-all.cc b/kokkos/kokkos/TPL/gtest/gtest-all.cc deleted file mode 100644 index e0a578d..0000000 --- a/kokkos/kokkos/TPL/gtest/gtest-all.cc +++ /dev/null @@ -1,9118 +0,0 @@ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// -// Google C++ Testing Framework (Google Test) -// -// Sometimes it's desirable to build Google Test by compiling a single file. -// This file serves this purpose. - -// This line ensures that gtest.h can be compiled on its own, even -// when it's fused. -#include - -// The following lines pull in the real gtest *.cc files. -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) - -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Utilities for testing Google Test itself and code that uses Google Test -// (e.g. frameworks built on top of Google Test). - -#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ -#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - - -namespace testing { - -// This helper class can be used to mock out Google Test failure reporting -// so that we can test Google Test or code that builds on Google Test. -// -// An object of this class appends a TestPartResult object to the -// TestPartResultArray object given in the constructor whenever a Google Test -// failure is reported. It can either intercept only failures that are -// generated in the same thread that created this object or it can intercept -// all generated failures. The scope of this mock object can be controlled with -// the second argument to the two arguments constructor. -class GTEST_API_ ScopedFakeTestPartResultReporter - : public TestPartResultReporterInterface { - public: - // The two possible mocking modes of this object. 
- enum InterceptMode { - INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. - INTERCEPT_ALL_THREADS // Intercepts all failures. - }; - - // The c'tor sets this object as the test part result reporter used - // by Google Test. The 'result' parameter specifies where to report the - // results. This reporter will only catch failures generated in the current - // thread. DEPRECATED - explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); - - // Same as above, but you can choose the interception scope of this object. - ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, - TestPartResultArray* result); - - // The d'tor restores the previous test part result reporter. - virtual ~ScopedFakeTestPartResultReporter(); - - // Appends the TestPartResult object to the TestPartResultArray - // received in the constructor. - // - // This method is from the TestPartResultReporterInterface - // interface. - virtual void ReportTestPartResult(const TestPartResult& result); - private: - void Init(); - - const InterceptMode intercept_mode_; - TestPartResultReporterInterface* old_reporter_; - TestPartResultArray* const result_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); -}; - -namespace internal { - -// A helper class for implementing EXPECT_FATAL_FAILURE() and -// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. -class GTEST_API_ SingleFailureChecker { - public: - // The constructor remembers the arguments. - SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); - ~SingleFailureChecker(); - private: - const TestPartResultArray* const results_; - const TestPartResult::Type type_; - const string substr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); -}; - -} // namespace internal - -} // namespace testing - -// A set of macros for testing Google Test assertions or code that's expected -// to generate Google Test fatal failures. It verifies that the given -// statement will cause exactly one fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_FATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - 'statement' cannot reference local non-static variables or -// non-static members of the current object. -// - 'statement' cannot return a value. -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. The AcceptsMacroThatExpandsToUnprotectedComma test in -// gtest_unittest.cc will fail to compile if we do that. 
-#define EXPECT_FATAL_FAILURE(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do { \ - class GTestExpectFatalFailureHelper {\ - public:\ - static void Execute() { statement; }\ - };\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ALL_THREADS, >est_failures);\ - GTestExpectFatalFailureHelper::Execute();\ - }\ - } while (::testing::internal::AlwaysFalse()) - -// A macro for testing Google Test assertions or code that's expected to -// generate Google Test non-fatal failures. It asserts that the given -// statement will cause exactly one non-fatal Google Test failure with 'substr' -// being part of the failure message. -// -// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only -// affects and considers failures generated in the current thread and -// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. -// -// 'statement' is allowed to reference local variables and members of -// the current object. -// -// The verification of the assertion is done correctly even when the statement -// throws an exception or aborts the current function. -// -// Known restrictions: -// - You cannot stream a failure message to this macro. -// -// Note that even though the implementations of the following two -// macros are much alike, we cannot refactor them to use a common -// helper macro, due to some peculiarity in how the preprocessor -// works. If we do that, the code won't compile when the user gives -// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that -// expands to code containing an unprotected comma. The -// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc -// catches that. -// -// For the same reason, we have to write -// if (::testing::internal::AlwaysTrue()) { statement; } -// instead of -// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) -// to avoid an MSVC warning on unreachable code. 
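[Editor's note] A usage sketch for EXPECT_FATAL_FAILURE above and the EXPECT_NONFATAL_FAILURE variants defined immediately below. The test names are illustrative; in a regular (non-fused) checkout these macros come from gtest/gtest-spi.h. Note the documented restriction that the statement given to EXPECT_FATAL_FAILURE may not reference local non-static variables.

// Usage sketch for the failure-interception macros (test names illustrative).
#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"   // location in a regular, non-fused checkout

TEST(FailureInterceptionTest, CatchesOneFatalFailure)
{
    // The statement must produce exactly one fatal failure whose message
    // contains the given substring; it may not reference local non-static
    // variables (documented restriction above).
    EXPECT_FATAL_FAILURE(FAIL() << "boom", "boom");
}

TEST(FailureInterceptionTest, CatchesOneNonFatalFailure)
{
    // The non-fatal variant may reference locals.
    int value = 42;
    EXPECT_NONFATAL_FAILURE(EXPECT_EQ(0, value) << "bad value", "bad value");
}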
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter:: \ - INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ - do {\ - ::testing::TestPartResultArray gtest_failures;\ - ::testing::internal::SingleFailureChecker gtest_checker(\ - >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ - (substr));\ - {\ - ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ - ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS,\ - >est_failures);\ - if (::testing::internal::AlwaysTrue()) { statement; }\ - }\ - } while (::testing::internal::AlwaysFalse()) - -#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include // NOLINT -#include -#include - -#if GTEST_OS_LINUX - -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -# include // NOLINT -# include // NOLINT -# include // NOLINT -// Declares vsnprintf(). This header is not available on Windows. -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include - -#elif GTEST_OS_SYMBIAN -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -#elif GTEST_OS_ZOS -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT - -// On z/OS we additionally need strings.h for strcasecmp. -# include // NOLINT - -#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. - -# include // NOLINT - -#elif GTEST_OS_WINDOWS // We are on Windows proper. - -# include // NOLINT -# include // NOLINT -# include // NOLINT -# include // NOLINT - -# if GTEST_OS_WINDOWS_MINGW -// MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on -// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW -// supports these. consider using them instead. -# define GTEST_HAS_GETTIMEOFDAY_ 1 -# include // NOLINT -# endif // GTEST_OS_WINDOWS_MINGW - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT - -#else - -// Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of -// gettimeofday(). -# define GTEST_HAS_GETTIMEOFDAY_ 1 - -// cpplint thinks that the header is already included, so we want to -// silence it. -# include // NOLINT -# include // NOLINT - -#endif // GTEST_OS_LINUX - -#if GTEST_HAS_EXCEPTIONS -# include -#endif - -#if GTEST_CAN_STREAM_RESULTS_ -# include // NOLINT -# include // NOLINT -#endif - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -// Copyright 2005, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// -// This file contains purely Google Test's internal implementation. Please -// DO NOT #INCLUDE IT IN A USER PROGRAM. - -#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ -#define GTEST_SRC_GTEST_INTERNAL_INL_H_ - -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// A user is trying to include this from his code - just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - -#ifndef _WIN32_WCE -# include -#endif // !_WIN32_WCE -#include -#include // For strtoll/_strtoul64/malloc/free. -#include // For memmove. - -#include -#include -#include - - -#if GTEST_OS_WINDOWS -# include // NOLINT -#endif // GTEST_OS_WINDOWS - - -namespace testing { - -// Declares the flags. -// -// We don't want the users to modify this flag in the code, but want -// Google Test's own unit tests to be able to access it. Therefore we -// declare it here as opposed to in gtest.h. -GTEST_DECLARE_bool_(death_test_use_fork); - -namespace internal { - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; - -// Names of the flags (needed for parsing Google Test flags). 
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; -const char kBreakOnFailureFlag[] = "break_on_failure"; -const char kCatchExceptionsFlag[] = "catch_exceptions"; -const char kColorFlag[] = "color"; -const char kFilterFlag[] = "filter"; -const char kListTestsFlag[] = "list_tests"; -const char kOutputFlag[] = "output"; -const char kPrintTimeFlag[] = "print_time"; -const char kRandomSeedFlag[] = "random_seed"; -const char kRepeatFlag[] = "repeat"; -const char kShuffleFlag[] = "shuffle"; -const char kStackTraceDepthFlag[] = "stack_trace_depth"; -const char kStreamResultToFlag[] = "stream_result_to"; -const char kThrowOnFailureFlag[] = "throw_on_failure"; - -// A valid random seed must be in [1, kMaxRandomSeed]. -const int kMaxRandomSeed = 99999; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. -GTEST_API_ extern bool g_help_flag; - -// Returns the current time in milliseconds. -GTEST_API_ TimeInMillis GetTimeInMillis(); - -// Returns true iff Google Test should use colors in the output. -GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); - -// Formats the given time in milliseconds as seconds. -GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); - -// Parses a string for an Int32 flag, in the form of "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -GTEST_API_ bool ParseInt32Flag( - const char* str, const char* flag, Int32* value); - -// Returns a random seed in range [1, kMaxRandomSeed] based on the -// given --gtest_random_seed flag value. -inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { - const unsigned int raw_seed = (random_seed_flag == 0) ? - static_cast(GetTimeInMillis()) : - static_cast(random_seed_flag); - - // Normalizes the actual seed to range [1, kMaxRandomSeed] such that - // it's easy to type. - const int normalized_seed = - static_cast((raw_seed - 1U) % - static_cast(kMaxRandomSeed)) + 1; - return normalized_seed; -} - -// Returns the first valid random seed after 'seed'. The behavior is -// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is -// considered to be 1. -inline int GetNextRandomSeed(int seed) { - GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) - << "Invalid random seed " << seed << " - must be in [1, " - << kMaxRandomSeed << "]."; - const int next_seed = seed + 1; - return (next_seed > kMaxRandomSeed) ? 1 : next_seed; -} - -// This class saves the values of all Google Test flags in its c'tor, and -// restores them in its d'tor. -class GTestFlagSaver { - public: - // The c'tor. - GTestFlagSaver() { - also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); - break_on_failure_ = GTEST_FLAG(break_on_failure); - catch_exceptions_ = GTEST_FLAG(catch_exceptions); - color_ = GTEST_FLAG(color); - death_test_style_ = GTEST_FLAG(death_test_style); - death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); - filter_ = GTEST_FLAG(filter); - internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); - list_tests_ = GTEST_FLAG(list_tests); - output_ = GTEST_FLAG(output); - print_time_ = GTEST_FLAG(print_time); - random_seed_ = GTEST_FLAG(random_seed); - repeat_ = GTEST_FLAG(repeat); - shuffle_ = GTEST_FLAG(shuffle); - stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); - stream_result_to_ = GTEST_FLAG(stream_result_to); - throw_on_failure_ = GTEST_FLAG(throw_on_failure); - } - - // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. 
- ~GTestFlagSaver() { - GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; - GTEST_FLAG(break_on_failure) = break_on_failure_; - GTEST_FLAG(catch_exceptions) = catch_exceptions_; - GTEST_FLAG(color) = color_; - GTEST_FLAG(death_test_style) = death_test_style_; - GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; - GTEST_FLAG(filter) = filter_; - GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; - GTEST_FLAG(list_tests) = list_tests_; - GTEST_FLAG(output) = output_; - GTEST_FLAG(print_time) = print_time_; - GTEST_FLAG(random_seed) = random_seed_; - GTEST_FLAG(repeat) = repeat_; - GTEST_FLAG(shuffle) = shuffle_; - GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; - GTEST_FLAG(stream_result_to) = stream_result_to_; - GTEST_FLAG(throw_on_failure) = throw_on_failure_; - } - private: - // Fields for saving the original values of flags. - bool also_run_disabled_tests_; - bool break_on_failure_; - bool catch_exceptions_; - String color_; - String death_test_style_; - bool death_test_use_fork_; - String filter_; - String internal_run_death_test_; - bool list_tests_; - String output_; - bool print_time_; - bool pretty_; - internal::Int32 random_seed_; - internal::Int32 repeat_; - bool shuffle_; - internal::Int32 stack_trace_depth_; - String stream_result_to_; - bool throw_on_failure_; -} GTEST_ATTRIBUTE_UNUSED_; - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// The output buffer str must containt at least 32 characters. -// The function returns the address of the output buffer. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. -GTEST_API_ char* CodePointToUtf8(UInt32 code_point, char* str); - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. -GTEST_API_ String WideStringToUtf8(const wchar_t* str, int num_chars); - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded(); - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (e.g., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. 
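The GTestFlagSaver defined above is meant to be used as a scope guard: snapshot every flag on entry, restore on exit. A hedged usage sketch (the function and flag choices are illustrative):

void RunWithTemporaryFlagsSketch() {
  testing::internal::GTestFlagSaver saver;  // constructor snapshots all flags
  GTEST_FLAG(repeat) = 3;                   // mutate flags freely...
  GTEST_FLAG(shuffle) = true;
  // ... exercise whatever needs the modified configuration ...
}                                           // destructor restores the snapshot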
-GTEST_API_ bool ShouldShard(const char* total_shards_str, - const char* shard_index_str, - bool in_subprocess_for_death_test); - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error and -// and aborts. -GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val); - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -GTEST_API_ bool ShouldRunTestOnShard( - int total_shards, int shard_index, int test_id); - -// STL container utilities. - -// Returns the number of elements in the given container that satisfy -// the given predicate. -template -inline int CountIf(const Container& c, Predicate predicate) { - // Implemented as an explicit loop since std::count_if() in libCstd on - // Solaris has a non-standard signature. - int count = 0; - for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) { - if (predicate(*it)) - ++count; - } - return count; -} - -// Applies a function/functor to each element in the container. -template -void ForEach(const Container& c, Functor functor) { - std::for_each(c.begin(), c.end(), functor); -} - -// Returns the i-th element of the vector, or default_value if i is not -// in range [0, v.size()). -template -inline E GetElementOr(const std::vector& v, int i, E default_value) { - return (i < 0 || i >= static_cast(v.size())) ? default_value : v[i]; -} - -// Performs an in-place shuffle of a range of the vector's elements. -// 'begin' and 'end' are element indices as an STL-style range; -// i.e. [begin, end) are shuffled, where 'end' == size() means to -// shuffle to the end of the vector. -template -void ShuffleRange(internal::Random* random, int begin, int end, - std::vector* v) { - const int size = static_cast(v->size()); - GTEST_CHECK_(0 <= begin && begin <= size) - << "Invalid shuffle range start " << begin << ": must be in range [0, " - << size << "]."; - GTEST_CHECK_(begin <= end && end <= size) - << "Invalid shuffle range finish " << end << ": must be in range [" - << begin << ", " << size << "]."; - - // Fisher-Yates shuffle, from - // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle - for (int range_width = end - begin; range_width >= 2; range_width--) { - const int last_in_range = begin + range_width - 1; - const int selected = begin + random->Generate(range_width); - std::swap((*v)[selected], (*v)[last_in_range]); - } -} - -// Performs an in-place shuffle of the vector's elements. -template -inline void Shuffle(internal::Random* random, std::vector* v) { - ShuffleRange(random, 0, static_cast(v->size()), v); -} - -// A function for deleting an object. Handy for being used as a -// functor. -template -static void Delete(T* x) { - delete x; -} - -// A predicate that checks the key of a TestProperty against a known key. -// -// TestPropertyKeyIs is copyable. -class TestPropertyKeyIs { - public: - // Constructor. - // - // TestPropertyKeyIs has NO default constructor. - explicit TestPropertyKeyIs(const char* key) - : key_(key) {} - - // Returns true iff the test name of test property matches on key_. - bool operator()(const TestProperty& test_property) const { - return String(test_property.key()).Compare(key_) == 0; - } - - private: - String key_; -}; - -// Class UnitTestOptions. 
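ShuffleRange above is a textbook Fisher-Yates pass driven by Google Test's internal Random. A standalone sketch of the same loop, substituting std::mt19937 for internal::Random so it compiles on its own:

#include <random>
#include <utility>
#include <vector>

// Mirrors ShuffleRange's Fisher-Yates loop over the index range [begin, end).
void ShuffleRangeSketch(std::mt19937* rng, int begin, int end,
                        std::vector<int>* v) {
  for (int range_width = end - begin; range_width >= 2; range_width--) {
    const int last_in_range = begin + range_width - 1;
    // Pick a random slot in [begin, begin + range_width)...
    const int selected = begin + static_cast<int>(
        (*rng)() % static_cast<unsigned int>(range_width));
    // ...and fix that element at the end of the not-yet-shuffled range.
    std::swap((*v)[selected], (*v)[last_in_range]);
  }
}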
-// -// This class contains functions for processing options the user -// specifies when running the tests. It has only static members. -// -// In most cases, the user can specify an option using either an -// environment variable or a command line flag. E.g. you can set the -// test filter using either GTEST_FILTER or --gtest_filter. If both -// the variable and the flag are present, the latter overrides the -// former. -class GTEST_API_ UnitTestOptions { - public: - // Functions for processing the gtest_output flag. - - // Returns the output format, or "" for normal printed output. - static String GetOutputFormat(); - - // Returns the absolute path of the requested output file, or the - // default (test_detail.xml in the original working directory) if - // none was explicitly specified. - static String GetAbsolutePathToOutputFile(); - - // Functions for processing the gtest_filter flag. - - // Returns true iff the wildcard pattern matches the string. The - // first ':' or '\0' character in pattern marks the end of it. - // - // This recursive algorithm isn't very efficient, but is clear and - // works well enough for matching test names, which are short. - static bool PatternMatchesString(const char *pattern, const char *str); - - // Returns true iff the user-specified filter matches the test case - // name and the test name. - static bool FilterMatchesTest(const String &test_case_name, - const String &test_name); - -#if GTEST_OS_WINDOWS - // Function for supporting the gtest_catch_exception flag. - - // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the - // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. - // This function is useful as an __except condition. - static int GTestShouldProcessSEH(DWORD exception_code); -#endif // GTEST_OS_WINDOWS - - // Returns true if "name" matches the ':' separated list of glob-style - // filters in "filter". - static bool MatchesFilter(const String& name, const char* filter); -}; - -// Returns the current application's name, removing directory path if that -// is present. Used by UnitTestOptions::GetOutputFile. -GTEST_API_ FilePath GetCurrentExecutableName(); - -// The role interface for getting the OS stack trace as a string. -class OsStackTraceGetterInterface { - public: - OsStackTraceGetterInterface() {} - virtual ~OsStackTraceGetterInterface() {} - - // Returns the current OS stack trace as a String. Parameters: - // - // max_depth - the maximum number of stack frames to be included - // in the trace. - // skip_count - the number of top frames to be skipped; doesn't count - // against max_depth. - virtual String CurrentStackTrace(int max_depth, int skip_count) = 0; - - // UponLeavingGTest() should be called immediately before Google Test calls - // user code. It saves some information about the current stack that - // CurrentStackTrace() will use to find and hide Google Test stack frames. - virtual void UponLeavingGTest() = 0; - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); -}; - -// A working implementation of the OsStackTraceGetterInterface interface. -class OsStackTraceGetter : public OsStackTraceGetterInterface { - public: - OsStackTraceGetter() : caller_frame_(NULL) {} - virtual String CurrentStackTrace(int max_depth, int skip_count); - virtual void UponLeavingGTest(); - - // This string is inserted in place of stack frames that are part of - // Google Test's implementation. 
- static const char* const kElidedFramesMarker; - - private: - Mutex mutex_; // protects all internal state - - // We save the stack frame below the frame that calls user code. - // We do this because the address of the frame immediately below - // the user code changes between the call to UponLeavingGTest() - // and any calls to CurrentStackTrace() from within the user code. - void* caller_frame_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); -}; - -// Information about a Google Test trace point. -struct TraceInfo { - const char* file; - int line; - String message; -}; - -// This is the default global test part result reporter used in UnitTestImpl. -// This class should only be used by UnitTestImpl. -class DefaultGlobalTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. Reports the test part - // result in the current test. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); -}; - -// This is the default per thread test part result reporter used in -// UnitTestImpl. This class should only be used by UnitTestImpl. -class DefaultPerThreadTestPartResultReporter - : public TestPartResultReporterInterface { - public: - explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); - // Implements the TestPartResultReporterInterface. The implementation just - // delegates to the current global test part result reporter of *unit_test_. - virtual void ReportTestPartResult(const TestPartResult& result); - - private: - UnitTestImpl* const unit_test_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); -}; - -// The private implementation of the UnitTest class. We don't protect -// the methods under a mutex, as this class is not accessible by a -// user and the UnitTest class that delegates work to this class does -// proper locking. -class GTEST_API_ UnitTestImpl { - public: - explicit UnitTestImpl(UnitTest* parent); - virtual ~UnitTestImpl(); - - // There are two different ways to register your own TestPartResultReporter. - // You can register your own repoter to listen either only for test results - // from the current thread or for results from all threads. - // By default, each per-thread test result repoter just passes a new - // TestPartResult to the global test result reporter, which registers the - // test part result for the currently running test. - - // Returns the global test part result reporter. - TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); - - // Sets the global test part result reporter. - void SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter); - - // Returns the test part result reporter for the current thread. - TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); - - // Sets the test part result reporter for the current thread. - void SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter); - - // Gets the number of successful test cases. - int successful_test_case_count() const; - - // Gets the number of failed test cases. - int failed_test_case_count() const; - - // Gets the number of all test cases. - int total_test_case_count() const; - - // Gets the number of all test cases that contain at least one test - // that should run. 
- int test_case_to_run_count() const; - - // Gets the number of successful tests. - int successful_test_count() const; - - // Gets the number of failed tests. - int failed_test_count() const; - - // Gets the number of disabled tests. - int disabled_test_count() const; - - // Gets the number of all tests. - int total_test_count() const; - - // Gets the number of tests that should run. - int test_to_run_count() const; - - // Gets the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns true iff the unit test passed (i.e. all test cases passed). - bool Passed() const { return !Failed(); } - - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). - bool Failed() const { - return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[i]; - } - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i) { - const int index = GetElementOr(test_case_indices_, i, -1); - return index < 0 ? NULL : test_cases_[index]; - } - - // Provides access to the event listener list. - TestEventListeners* listeners() { return &listeners_; } - - // Returns the TestResult for the test that's currently running, or - // the TestResult for the ad hoc test if no test is running. - TestResult* current_test_result(); - - // Returns the TestResult for the ad hoc test. - const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } - - // Sets the OS stack trace getter. - // - // Does nothing if the input and the current OS stack trace getter - // are the same; otherwise, deletes the old getter and makes the - // input the current getter. - void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); - - // Returns the current OS stack trace getter if it is not NULL; - // otherwise, creates an OsStackTraceGetter, makes it the current - // getter, and returns it. - OsStackTraceGetterInterface* os_stack_trace_getter(); - - // Returns the current OS stack trace as a String. - // - // The maximum number of stack frames to be included is specified by - // the gtest_stack_trace_depth flag. The skip_count parameter - // specifies the number of top frames to be skipped, which doesn't - // count against the number of frames to be included. - // - // For example, if Foo() calls Bar(), which in turn calls - // CurrentOsStackTraceExceptTop(1), Foo() will be included in the - // trace but Bar() and CurrentOsStackTraceExceptTop() won't. - String CurrentOsStackTraceExceptTop(int skip_count); - - // Finds and returns a TestCase with the given name. If one doesn't - // exist, creates one and returns it. - // - // Arguments: - // - // test_case_name: name of the test case - // type_param: the name of the test's type parameter, or NULL if - // this is not a typed or a type-parameterized test. 
- // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase* GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); - - // Adds a TestInfo to the unit test. - // - // Arguments: - // - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - // test_info: the TestInfo object - void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, - TestInfo* test_info) { - // In order to support thread-safe death tests, we need to - // remember the original working directory when the test program - // was first invoked. We cannot do this in RUN_ALL_TESTS(), as - // the user may have changed the current directory before calling - // RUN_ALL_TESTS(). Therefore we capture the current directory in - // AddTestInfo(), which is called to register a TEST or TEST_F - // before main() is reached. - if (original_working_dir_.IsEmpty()) { - original_working_dir_.Set(FilePath::GetCurrentDir()); - GTEST_CHECK_(!original_working_dir_.IsEmpty()) - << "Failed to get the current working directory."; - } - - GetTestCase(test_info->test_case_name(), - test_info->type_param(), - set_up_tc, - tear_down_tc)->AddTestInfo(test_info); - } - -#if GTEST_HAS_PARAM_TEST - // Returns ParameterizedTestCaseRegistry object used to keep track of - // value-parameterized tests and instantiate and register them. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { - return parameterized_test_registry_; - } -#endif // GTEST_HAS_PARAM_TEST - - // Sets the TestCase object for the test that's currently running. - void set_current_test_case(TestCase* a_current_test_case) { - current_test_case_ = a_current_test_case; - } - - // Sets the TestInfo object for the test that's currently running. If - // current_test_info is NULL, the assertion results will be stored in - // ad_hoc_test_result_. - void set_current_test_info(TestInfo* a_current_test_info) { - current_test_info_ = a_current_test_info; - } - - // Registers all parameterized tests defined using TEST_P and - // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter - // combination. This method can be called more then once; it has guards - // protecting from registering the tests more then once. If - // value-parameterized tests are disabled, RegisterParameterizedTests is - // present but does nothing. - void RegisterParameterizedTests(); - - // Runs all tests in this UnitTest object, prints the result, and - // returns true if all tests are successful. If any exception is - // thrown during a test, this test is considered to be failed, but - // the rest of the tests will still be run. - bool RunAllTests(); - - // Clears the results of all tests, except the ad hoc tests. - void ClearNonAdHocTestResult() { - ForEach(test_cases_, TestCase::ClearTestCaseResult); - } - - // Clears the results of ad-hoc test assertions. - void ClearAdHocTestResult() { - ad_hoc_test_result_.Clear(); - } - - enum ReactionToSharding { - HONOR_SHARDING_PROTOCOL, - IGNORE_SHARDING_PROTOCOL - }; - - // Matches the full name of each test against the user-specified - // filter to decide whether the test should run, then records the - // result in each TestCase and TestInfo object. 
- // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests - // based on sharding variables in the environment. - // Returns the number of tests that should run. - int FilterTests(ReactionToSharding shard_tests); - - // Prints the names of the tests matching the user-specified filter flag. - void ListTestsMatchingFilter(); - - const TestCase* current_test_case() const { return current_test_case_; } - TestInfo* current_test_info() { return current_test_info_; } - const TestInfo* current_test_info() const { return current_test_info_; } - - // Returns the vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector& environments() { return environments_; } - - // Getters for the per-thread Google Test trace stack. - std::vector& gtest_trace_stack() { - return *(gtest_trace_stack_.pointer()); - } - const std::vector& gtest_trace_stack() const { - return gtest_trace_stack_.get(); - } - -#if GTEST_HAS_DEATH_TEST - void InitDeathTestSubprocessControlInfo() { - internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag()); - } - // Returns a pointer to the parsed --gtest_internal_run_death_test - // flag, or NULL if that flag was not specified. - // This information is useful only in a death test child process. - // Must not be called before a call to InitGoogleTest. - const InternalRunDeathTestFlag* internal_run_death_test_flag() const { - return internal_run_death_test_flag_.get(); - } - - // Returns a pointer to the current death test factory. - internal::DeathTestFactory* death_test_factory() { - return death_test_factory_.get(); - } - - void SuppressTestEventsIfInSubprocess(); - - friend class ReplaceDeathTestFactory; -#endif // GTEST_HAS_DEATH_TEST - - // Initializes the event listener performing XML output as specified by - // UnitTestOptions. Must not be called before InitGoogleTest. - void ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Initializes the event listener for streaming test results to a socket. - // Must not be called before InitGoogleTest. - void ConfigureStreamingOutput(); -#endif - - // Performs initialization dependent upon flag values obtained in - // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to - // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest - // this function is also called from RunAllTests. Since this function can be - // called more than once, it has to be idempotent. - void PostFlagParsingInit(); - - // Gets the random seed used at the start of the current test iteration. - int random_seed() const { return random_seed_; } - - // Gets the random number generator. - internal::Random* random() { return &random_; } - - // Shuffles all test cases, and the tests within each test case, - // making sure that death tests are still run first. - void ShuffleTests(); - - // Restores the test cases and tests to their order before the first shuffle. - void UnshuffleTests(); - - // Returns the value of GTEST_FLAG(catch_exceptions) at the moment - // UnitTest::Run() starts. - bool catch_exceptions() const { return catch_exceptions_; } - - private: - friend class ::testing::UnitTest; - - // Used by UnitTest::Run() to capture the state of - // GTEST_FLAG(catch_exceptions) at the moment it starts. - void set_catch_exceptions(bool value) { catch_exceptions_ = value; } - - // The UnitTest object that owns this implementation object. - UnitTest* const parent_; - - // The working directory when the first TEST() or TEST_F() was - // executed. 
- internal::FilePath original_working_dir_; - - // The default test part result reporters. - DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_; - DefaultPerThreadTestPartResultReporter - default_per_thread_test_part_result_reporter_; - - // Points to (but doesn't own) the global test part result reporter. - TestPartResultReporterInterface* global_test_part_result_repoter_; - - // Protects read and write access to global_test_part_result_reporter_. - internal::Mutex global_test_part_result_reporter_mutex_; - - // Points to (but doesn't own) the per-thread test part result reporter. - internal::ThreadLocal - per_thread_test_part_result_reporter_; - - // The vector of environments that need to be set-up/torn-down - // before/after the tests are run. - std::vector environments_; - - // The vector of TestCases in their original order. It owns the - // elements in the vector. - std::vector test_cases_; - - // Provides a level of indirection for the test case list to allow - // easy shuffling and restoring the test case order. The i-th - // element of this vector is the index of the i-th test case in the - // shuffled order. - std::vector test_case_indices_; - -#if GTEST_HAS_PARAM_TEST - // ParameterizedTestRegistry object used to register value-parameterized - // tests. - internal::ParameterizedTestCaseRegistry parameterized_test_registry_; - - // Indicates whether RegisterParameterizedTests() has been called already. - bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST - - // Index of the last death test case registered. Initially -1. - int last_death_test_case_; - - // This points to the TestCase for the currently running test. It - // changes as Google Test goes through one test case after another. - // When no test is running, this is set to NULL and Google Test - // stores assertion results in ad_hoc_test_result_. Initially NULL. - TestCase* current_test_case_; - - // This points to the TestInfo for the currently running test. It - // changes as Google Test goes through one test after another. When - // no test is running, this is set to NULL and Google Test stores - // assertion results in ad_hoc_test_result_. Initially NULL. - TestInfo* current_test_info_; - - // Normally, a user only writes assertions inside a TEST or TEST_F, - // or inside a function called by a TEST or TEST_F. Since Google - // Test keeps track of which test is current running, it can - // associate such an assertion with the test it belongs to. - // - // If an assertion is encountered when no TEST or TEST_F is running, - // Google Test attributes the assertion result to an imaginary "ad hoc" - // test, and records the result in ad_hoc_test_result_. - TestResult ad_hoc_test_result_; - - // The list of event listeners that can be used to track events inside - // Google Test. - TestEventListeners listeners_; - - // The OS stack trace getter. Will be deleted when the UnitTest - // object is destructed. By default, an OsStackTraceGetter is used, - // but the user can set this field to use a custom getter if that is - // desired. - OsStackTraceGetterInterface* os_stack_trace_getter_; - - // True iff PostFlagParsingInit() has been called. - bool post_flag_parse_init_performed_; - - // The random number seed used at the beginning of the test run. - int random_seed_; - - // Our random number generator. - internal::Random random_; - - // How long the test took to run, in milliseconds. 
- TimeInMillis elapsed_time_; - -#if GTEST_HAS_DEATH_TEST - // The decomposed components of the gtest_internal_run_death_test flag, - // parsed when RUN_ALL_TESTS is called. - internal::scoped_ptr internal_run_death_test_flag_; - internal::scoped_ptr death_test_factory_; -#endif // GTEST_HAS_DEATH_TEST - - // A per-thread stack of traces created by the SCOPED_TRACE() macro. - internal::ThreadLocal > gtest_trace_stack_; - - // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests() - // starts. - bool catch_exceptions_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl); -}; // class UnitTestImpl - -// Convenience function for accessing the global UnitTest -// implementation object. -inline UnitTestImpl* GetUnitTestImpl() { - return UnitTest::GetInstance()->impl(); -} - -#if GTEST_USES_SIMPLE_RE - -// Internal helper functions for implementing the simple regular -// expression matcher. -GTEST_API_ bool IsInSet(char ch, const char* str); -GTEST_API_ bool IsAsciiDigit(char ch); -GTEST_API_ bool IsAsciiPunct(char ch); -GTEST_API_ bool IsRepeat(char ch); -GTEST_API_ bool IsAsciiWhiteSpace(char ch); -GTEST_API_ bool IsAsciiWordChar(char ch); -GTEST_API_ bool IsValidEscape(char ch); -GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch); -GTEST_API_ bool ValidateRegex(const char* regex); -GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str); -GTEST_API_ bool MatchRepetitionAndRegexAtHead( - bool escaped, char ch, char repeat, const char* regex, const char* str); -GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str); - -#endif // GTEST_USES_SIMPLE_RE - -// Parses the command line for Google Test flags, without initializing -// other parts of Google Test. -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv); -GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv); - -#if GTEST_HAS_DEATH_TEST - -// Returns the message describing the last system error, regardless of the -// platform. -GTEST_API_ String GetLastErrnoDescription(); - -# if GTEST_OS_WINDOWS -// Provides leak-safe Windows kernel handle ownership. -class AutoHandle { - public: - AutoHandle() : handle_(INVALID_HANDLE_VALUE) {} - explicit AutoHandle(HANDLE handle) : handle_(handle) {} - - ~AutoHandle() { Reset(); } - - HANDLE Get() const { return handle_; } - void Reset() { Reset(INVALID_HANDLE_VALUE); } - void Reset(HANDLE handle) { - if (handle != handle_) { - if (handle_ != INVALID_HANDLE_VALUE) - ::CloseHandle(handle_); - handle_ = handle; - } - } - - private: - HANDLE handle_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle); -}; -# endif // GTEST_OS_WINDOWS - -// Attempts to parse a string into a positive integer pointed to by the -// number parameter. Returns true if that is possible. -// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use -// it here. -template -bool ParseNaturalNumber(const ::std::string& str, Integer* number) { - // Fail fast if the given string does not begin with a digit; - // this bypasses strtoXXX's "optional leading whitespace and plus - // or minus sign" semantics, which are undesirable here. - if (str.empty() || !IsDigit(str[0])) { - return false; - } - errno = 0; - - char* end; - // BiggestConvertible is the largest integer type that system-provided - // string-to-number conversion routines can return. - -# if GTEST_OS_WINDOWS && !defined(__GNUC__) - - // MSVC and C++ Builder define __int64 instead of the standard long long. 
- typedef unsigned __int64 BiggestConvertible; - const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10); - -# else - - typedef unsigned long long BiggestConvertible; // NOLINT - const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10); - -# endif // GTEST_OS_WINDOWS && !defined(__GNUC__) - - const bool parse_success = *end == '\0' && errno == 0; - - // TODO(vladl@google.com): Convert this to compile time assertion when it is - // available. - GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); - - const Integer result = static_cast(parsed); - if (parse_success && static_cast(result) == parsed) { - *number = result; - return true; - } - return false; -} -#endif // GTEST_HAS_DEATH_TEST - -// TestResult contains some private methods that should be hidden from -// Google Test user but are required for testing. This class allow our tests -// to access them. -// -// This class is supplied only for the purpose of testing Google Test's own -// constructs. Do not use it in user tests, either directly or indirectly. -class TestResultAccessor { - public: - static void RecordProperty(TestResult* test_result, - const TestProperty& property) { - test_result->RecordProperty(property); - } - - static void ClearTestPartResults(TestResult* test_result) { - test_result->ClearTestPartResults(); - } - - static const std::vector& test_part_results( - const TestResult& test_result) { - return test_result.test_part_results(); - } -}; - -} // namespace internal -} // namespace testing - -#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ -#undef GTEST_IMPLEMENTATION_ - -#if GTEST_OS_WINDOWS -# define vsnprintf _vsnprintf -#endif // GTEST_OS_WINDOWS - -namespace testing { - -using internal::CountIf; -using internal::ForEach; -using internal::GetElementOr; -using internal::Shuffle; - -// Constants. - -// A test whose test case name or test name matches this filter is -// disabled and not run. -static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; - -// A test case whose name matches this filter is considered a death -// test case and will be run before test cases whose name doesn't -// match this filter. -static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; - -// A test filter that matches everything. -static const char kUniversalFilter[] = "*"; - -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; - -// The environment variable name for the test shard index. -static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; -// The environment variable name for the total number of test shards. -static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; -// The environment variable name for the test shard status file. -static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; - -namespace internal { - -// The text used in failure messages to indicate the start of the -// stack trace. -const char kStackTraceMarker[] = "\nStack trace:\n"; - -// g_help_flag is true iff the --help flag or an equivalent form is -// specified on the command line. 
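A hedged usage sketch for the ParseNaturalNumber template defined above (available in death-test builds): it accepts a clean, in-range run of digits and leaves the output untouched on any failure. The wrapper name is illustrative.

#include <string>

bool ReadPortSketch(const std::string& text, int* port) {
  return testing::internal::ParseNaturalNumber(text, port);
}
// Accepted:  "8080"  -> *port == 8080, returns true.
// Rejected (returns false, *port unchanged):
//   " 42"    leading whitespace is deliberately not skipped
//   "-7"     a sign character fails the leading-digit check
//   "12x"    trailing characters leave *end != '\0'
//   "999..9" (too many digits) overflows and sets errno to ERANGE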
-bool g_help_flag = false; - -} // namespace internal - -GTEST_DEFINE_bool_( - also_run_disabled_tests, - internal::BoolFromGTestEnv("also_run_disabled_tests", false), - "Run disabled tests too, in addition to the tests normally being run."); - -GTEST_DEFINE_bool_( - break_on_failure, - internal::BoolFromGTestEnv("break_on_failure", false), - "True iff a failed assertion should be a debugger break-point."); - -GTEST_DEFINE_bool_( - catch_exceptions, - internal::BoolFromGTestEnv("catch_exceptions", true), - "True iff " GTEST_NAME_ - " should catch exceptions and treat them as test failures."); - -GTEST_DEFINE_string_( - color, - internal::StringFromGTestEnv("color", "auto"), - "Whether to use colors in the output. Valid values: yes, no, " - "and auto. 'auto' means to use colors if the output is " - "being sent to a terminal and the TERM environment variable " - "is set to xterm, xterm-color, xterm-256color, linux or cygwin."); - -GTEST_DEFINE_string_( - filter, - internal::StringFromGTestEnv("filter", kUniversalFilter), - "A colon-separated list of glob (not regex) patterns " - "for filtering the tests to run, optionally followed by a " - "'-' and a : separated list of negative patterns (tests to " - "exclude). A test is run if it matches one of the positive " - "patterns and does not match any of the negative patterns."); - -GTEST_DEFINE_bool_(list_tests, false, - "List all tests without running them."); - -GTEST_DEFINE_string_( - output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " - "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " - "If a directory is specified, output files will be created " - "within that directory, with file-names based on the test " - "executable's name and, if necessary, made unique by adding " - "digits."); - -GTEST_DEFINE_bool_( - print_time, - internal::BoolFromGTestEnv("print_time", true), - "True iff " GTEST_NAME_ - " should display elapsed time in text output."); - -GTEST_DEFINE_int32_( - random_seed, - internal::Int32FromGTestEnv("random_seed", 0), - "Random number seed to use when shuffling test orders. Must be in range " - "[1, 99999], or 0 to use a seed based on the current time."); - -GTEST_DEFINE_int32_( - repeat, - internal::Int32FromGTestEnv("repeat", 1), - "How many times to repeat each test. Specify a negative number " - "for repeating forever. Useful for shaking out flaky tests."); - -GTEST_DEFINE_bool_( - show_internal_stack_frames, false, - "True iff " GTEST_NAME_ " should include internal stack frames when " - "printing test failure stack traces."); - -GTEST_DEFINE_bool_( - shuffle, - internal::BoolFromGTestEnv("shuffle", false), - "True iff " GTEST_NAME_ - " should randomize tests' order on every run."); - -GTEST_DEFINE_int32_( - stack_trace_depth, - internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), - "The maximum number of stack frames to print when an " - "assertion fails. The valid range is 0 through 100, inclusive."); - -GTEST_DEFINE_string_( - stream_result_to, - internal::StringFromGTestEnv("stream_result_to", ""), - "This flag specifies the host name and the port number on which to stream " - "test results. Example: \"localhost:555\". 
The flag is effective only on " - "Linux."); - -GTEST_DEFINE_bool_( - throw_on_failure, - internal::BoolFromGTestEnv("throw_on_failure", false), - "When this flag is specified, a failed assertion will throw an exception " - "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); - -namespace internal { - -// Generates a random number from [0, range), using a Linear -// Congruential Generator (LCG). Crashes if 'range' is 0 or greater -// than kMaxRange. -UInt32 Random::Generate(UInt32 range) { - // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; - - GTEST_CHECK_(range > 0) - << "Cannot generate a number in the range [0, 0)."; - GTEST_CHECK_(range <= kMaxRange) - << "Generation of a number in [0, " << range << ") was requested, " - << "but this can only generate numbers in [0, " << kMaxRange << ")."; - - // Converting via modulus introduces a bit of downward bias, but - // it's simple, and a linear congruential generator isn't too good - // to begin with. - return state_ % range; -} - -// GTestIsInitialized() returns true iff the user has initialized -// Google Test. Useful for catching the user mistake of not initializing -// Google Test before calling RUN_ALL_TESTS(). -// -// A user must call testing::InitGoogleTest() to initialize Google -// Test. g_init_gtest_count is set to the number of times -// InitGoogleTest() has been called. We don't protect this variable -// under a mutex as it is only accessed in the main thread. -int g_init_gtest_count = 0; -static bool GTestIsInitialized() { return g_init_gtest_count != 0; } - -// Iterates over a vector of TestCases, keeping a running sum of the -// results of calling a given int-returning method on each. -// Returns the sum. -static int SumOverTestCaseList(const std::vector& case_list, - int (TestCase::*method)() const) { - int sum = 0; - for (size_t i = 0; i < case_list.size(); i++) { - sum += (case_list[i]->*method)(); - } - return sum; -} - -// Returns true iff the test case passed. -static bool TestCasePassed(const TestCase* test_case) { - return test_case->should_run() && test_case->Passed(); -} - -// Returns true iff the test case failed. -static bool TestCaseFailed(const TestCase* test_case) { - return test_case->should_run() && test_case->Failed(); -} - -// Returns true iff test_case contains at least one test that should -// run. -static bool ShouldRunTestCase(const TestCase* test_case) { - return test_case->should_run(); -} - -// AssertHelper constructor. -AssertHelper::AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message) - : data_(new AssertHelperData(type, file, line, message)) { -} - -AssertHelper::~AssertHelper() { - delete data_; -} - -// Message assignment, for assertion streaming support. -void AssertHelper::operator=(const Message& message) const { - UnitTest::GetInstance()-> - AddTestPartResult(data_->type, data_->file, data_->line, - AppendUserMessage(data_->message, message), - UnitTest::GetInstance()->impl() - ->CurrentOsStackTraceExceptTop(1) - // Skips the stack frame for this function itself. - ); // NOLINT -} - -// Mutex for linked pointers. -GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// Application pathname gotten in InitGoogleTest. -String g_executable_path; - -// Returns the current application's name, removing directory path if that -// is present. 
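Random::Generate above is a plain linear congruential generator using glibc's rand(3) constants, reduced modulo the requested range. A standalone sketch of the same recurrence; taking the modulus to be 2^31 is an assumption here, since kMaxRange itself is defined elsewhere:

#include <cstdint>

// Same recurrence as testing::internal::Random, with a (slightly biased)
// modulo reduction into [0, range).
class LcgSketch {
 public:
  explicit LcgSketch(uint32_t seed) : state_(seed) {}
  uint32_t Generate(uint32_t range) {      // caller ensures 0 < range <= kMaxRange
    const uint32_t kMaxRange = 1u << 31;   // assumed to match gtest's Random
    state_ = (1103515245u * state_ + 12345u) % kMaxRange;
    return state_ % range;
  }
 private:
  uint32_t state_;
};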
-FilePath GetCurrentExecutableName() { - FilePath result; - -#if GTEST_OS_WINDOWS - result.Set(FilePath(g_executable_path).RemoveExtension("exe")); -#else - result.Set(FilePath(g_executable_path)); -#endif // GTEST_OS_WINDOWS - - return result.RemoveDirectoryName(); -} - -// Functions for processing the gtest_output flag. - -// Returns the output format, or "" for normal printed output. -String UnitTestOptions::GetOutputFormat() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return String(""); - - const char* const colon = strchr(gtest_output_flag, ':'); - return (colon == NULL) ? - String(gtest_output_flag) : - String(gtest_output_flag, colon - gtest_output_flag); -} - -// Returns the name of the requested output file, or the default if none -// was explicitly specified. -String UnitTestOptions::GetAbsolutePathToOutputFile() { - const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return String(""); - - const char* const colon = strchr(gtest_output_flag, ':'); - if (colon == NULL) - return String(internal::FilePath::ConcatPaths( - internal::FilePath( - UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).ToString() ); - - internal::FilePath output_name(colon + 1); - if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute - // path (as its meaning depends on the current drive), yet the - // following logic for turning it into an absolute path is wrong. - // Fix it. - output_name = internal::FilePath::ConcatPaths( - internal::FilePath(UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(colon + 1)); - - if (!output_name.IsDirectory()) - return output_name.ToString(); - - internal::FilePath result(internal::FilePath::GenerateUniqueFileName( - output_name, internal::GetCurrentExecutableName(), - GetOutputFormat().c_str())); - return result.ToString(); -} - -// Returns true iff the wildcard pattern matches the string. The -// first ':' or '\0' character in pattern marks the end of it. -// -// This recursive algorithm isn't very efficient, but is clear and -// works well enough for matching test names, which are short. -bool UnitTestOptions::PatternMatchesString(const char *pattern, - const char *str) { - switch (*pattern) { - case '\0': - case ':': // Either ':' or '\0' marks the end of the pattern. - return *str == '\0'; - case '?': // Matches any single character. - return *str != '\0' && PatternMatchesString(pattern + 1, str + 1); - case '*': // Matches any string (possibly empty) of characters. - return (*str != '\0' && PatternMatchesString(pattern, str + 1)) || - PatternMatchesString(pattern + 1, str); - default: // Non-special character. Matches itself. - return *pattern == *str && - PatternMatchesString(pattern + 1, str + 1); - } -} - -bool UnitTestOptions::MatchesFilter(const String& name, const char* filter) { - const char *cur_pattern = filter; - for (;;) { - if (PatternMatchesString(cur_pattern, name.c_str())) { - return true; - } - - // Finds the next pattern in the filter. - cur_pattern = strchr(cur_pattern, ':'); - - // Returns if no more pattern can be found. - if (cur_pattern == NULL) { - return false; - } - - // Skips the pattern separater (the ':' character). - cur_pattern++; - } -} - -// TODO(keithray): move String function implementations to gtest-string.cc. - -// Returns true iff the user-specified filter matches the test case -// name and the test name. 
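PatternMatchesString above implements a tiny glob language: '?' matches one character, '*' matches any run, and ':' or '\0' ends the pattern. A standalone copy of the recursion with a few worked examples:

#include <cassert>

// Standalone sketch of the recursion above, so the examples can actually run.
static bool GlobMatchSketch(const char* pattern, const char* str) {
  switch (*pattern) {
    case '\0':
    case ':':  return *str == '\0';
    case '?':  return *str != '\0' && GlobMatchSketch(pattern + 1, str + 1);
    case '*':  return (*str != '\0' && GlobMatchSketch(pattern, str + 1)) ||
                      GlobMatchSketch(pattern + 1, str);
    default:   return *pattern == *str && GlobMatchSketch(pattern + 1, str + 1);
  }
}

inline void GlobMatchSketchExamples() {
  assert(GlobMatchSketch("Foo*", "FooBar"));             // '*' swallows "Bar"
  assert(GlobMatchSketch("Foo?", "Foox"));               // '?' matches exactly one char
  assert(!GlobMatchSketch("Foo?", "Foo"));               // '?' needs a character
  assert(GlobMatchSketch("*Test*:Other", "MyTestCase")); // ':' ends the first pattern
  assert(!GlobMatchSketch("Abc", "Abcd"));               // pattern ran out first
}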
-bool UnitTestOptions::FilterMatchesTest(const String &test_case_name, - const String &test_name) { - const String& full_name = String::Format("%s.%s", - test_case_name.c_str(), - test_name.c_str()); - - // Split --gtest_filter at '-', if there is one, to separate into - // positive filter and negative filter portions - const char* const p = GTEST_FLAG(filter).c_str(); - const char* const dash = strchr(p, '-'); - String positive; - String negative; - if (dash == NULL) { - positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter - negative = String(""); - } else { - positive = String(p, dash - p); // Everything up to the dash - negative = String(dash+1); // Everything after the dash - if (positive.empty()) { - // Treat '-test1' as the same as '*-test1' - positive = kUniversalFilter; - } - } - - // A filter is a colon-separated list of patterns. It matches a - // test if any pattern in it matches the test. - return (MatchesFilter(full_name, positive.c_str()) && - !MatchesFilter(full_name, negative.c_str())); -} - -#if GTEST_HAS_SEH -// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the -// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. -// This function is useful as an __except condition. -int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { - // Google Test should handle a SEH exception if: - // 1. the user wants it to, AND - // 2. this is not a breakpoint exception, AND - // 3. this is not a C++ exception (VC++ implements them via SEH, - // apparently). - // - // SEH exception code for C++ exceptions. - // (see http://support.microsoft.com/kb/185294 for more information). - const DWORD kCxxExceptionCode = 0xe06d7363; - - bool should_handle = true; - - if (!GTEST_FLAG(catch_exceptions)) - should_handle = false; - else if (exception_code == EXCEPTION_BREAKPOINT) - should_handle = false; - else if (exception_code == kCxxExceptionCode) - should_handle = false; - - return should_handle ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; -} -#endif // GTEST_HAS_SEH - -} // namespace internal - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. Intercepts only failures from the current thread. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - TestPartResultArray* result) - : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), - result_(result) { - Init(); -} - -// The c'tor sets this object as the test part result reporter used by -// Google Test. The 'result' parameter specifies where to report the -// results. -ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( - InterceptMode intercept_mode, TestPartResultArray* result) - : intercept_mode_(intercept_mode), - result_(result) { - Init(); -} - -void ScopedFakeTestPartResultReporter::Init() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - old_reporter_ = impl->GetGlobalTestPartResultReporter(); - impl->SetGlobalTestPartResultReporter(this); - } else { - old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); - impl->SetTestPartResultReporterForCurrentThread(this); - } -} - -// The d'tor restores the test part result reporter used by Google Test -// before. 
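FilterMatchesTest above splits --gtest_filter at the first '-' into a positive and a negative part, and an empty positive part means "everything". A standalone sketch of that split, with examples:

#include <string>
#include <utility>

// Sketch of the '-' split performed by FilterMatchesTest above.
static std::pair<std::string, std::string> SplitGTestFilterSketch(
    const std::string& filter) {
  const std::string::size_type dash = filter.find('-');
  std::string positive =
      (dash == std::string::npos) ? filter : filter.substr(0, dash);
  const std::string negative =
      (dash == std::string::npos) ? std::string() : filter.substr(dash + 1);
  if (positive.empty()) positive = "*";  // "-Foo.*" behaves like "*-Foo.*"
  return std::make_pair(positive, negative);
}

// Examples:
//   "Foo.*-Foo.Bar" -> positive "Foo.*",   negative "Foo.Bar"
//                      (run every Foo test except Foo.Bar)
//   "-*Slow*"       -> positive "*",       negative "*Slow*"
//   "A.*:B.*"       -> positive "A.*:B.*", negative ""  (':' only separates patterns)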
-ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - if (intercept_mode_ == INTERCEPT_ALL_THREADS) { - impl->SetGlobalTestPartResultReporter(old_reporter_); - } else { - impl->SetTestPartResultReporterForCurrentThread(old_reporter_); - } -} - -// Increments the test part result count and remembers the result. -// This method is from the TestPartResultReporterInterface interface. -void ScopedFakeTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - result_->Append(result); -} - -namespace internal { - -// Returns the type ID of ::testing::Test. We should always call this -// instead of GetTypeId< ::testing::Test>() to get the type ID of -// testing::Test. This is to work around a suspected linker bug when -// using Google Test as a framework on Mac OS X. The bug causes -// GetTypeId< ::testing::Test>() to return different values depending -// on whether the call is from the Google Test framework itself or -// from user test code. GetTestTypeId() is guaranteed to always -// return the same value, as it always calls GetTypeId<>() from the -// gtest.cc, which is within the Google Test framework. -TypeId GetTestTypeId() { - return GetTypeId(); -} - -// The value of GetTestTypeId() as seen from within the Google Test -// library. This is solely for testing GetTestTypeId(). -extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); - -// This predicate-formatter checks that 'results' contains a test part -// failure of the given type and that the failure message contains the -// given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { - const String expected(type == TestPartResult::kFatalFailure ? - "1 fatal failure" : - "1 non-fatal failure"); - Message msg; - if (results.size() != 1) { - msg << "Expected: " << expected << "\n" - << " Actual: " << results.size() << " failures"; - for (int i = 0; i < results.size(); i++) { - msg << "\n" << results.GetTestPartResult(i); - } - return AssertionFailure() << msg; - } - - const TestPartResult& r = results.GetTestPartResult(0); - if (r.type() != type) { - return AssertionFailure() << "Expected: " << expected << "\n" - << " Actual:\n" - << r; - } - - if (strstr(r.message(), substr.c_str()) == NULL) { - return AssertionFailure() << "Expected: " << expected << " containing \"" - << substr << "\"\n" - << " Actual:\n" - << r; - } - - return AssertionSuccess(); -} - -// The constructor of SingleFailureChecker remembers where to look up -// test part results, what type of failure we expect, and what -// substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} - -// The destructor of SingleFailureChecker verifies that the given -// TestPartResultArray contains exactly one failure that has the given -// type and contains the given substring. If that's not the case, a -// non-fatal failure will be generated. 
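The reporter and checker above combine into the "expect exactly one failure" idiom used by Google Test's own tests (the same shape as the EXPECT_*_FAILURE macros in gtest-spi.h). A hedged sketch of that pattern inside some test body:

{
  ::testing::TestPartResultArray results;
  ::testing::internal::SingleFailureChecker checker(
      &results, ::testing::TestPartResult::kNonFatalFailure, "expected message");
  {
    ::testing::ScopedFakeTestPartResultReporter reporter(
        ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ONLY_CURRENT_THREAD,
        &results);
    ADD_FAILURE() << "expected message";  // intercepted into 'results',
  }                                       // reporter's d'tor restores the old reporter
}                                         // checker's d'tor runs HasOneFailure on 'results'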
-SingleFailureChecker::~SingleFailureChecker() { - EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); -} - -DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultGlobalTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->current_test_result()->AddTestPartResult(result); - unit_test_->listeners()->repeater()->OnTestPartResult(result); -} - -DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( - UnitTestImpl* unit_test) : unit_test_(unit_test) {} - -void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( - const TestPartResult& result) { - unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); -} - -// Returns the global test part result reporter. -TestPartResultReporterInterface* -UnitTestImpl::GetGlobalTestPartResultReporter() { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - return global_test_part_result_repoter_; -} - -// Sets the global test part result reporter. -void UnitTestImpl::SetGlobalTestPartResultReporter( - TestPartResultReporterInterface* reporter) { - internal::MutexLock lock(&global_test_part_result_reporter_mutex_); - global_test_part_result_repoter_ = reporter; -} - -// Returns the test part result reporter for the current thread. -TestPartResultReporterInterface* -UnitTestImpl::GetTestPartResultReporterForCurrentThread() { - return per_thread_test_part_result_reporter_.get(); -} - -// Sets the test part result reporter for the current thread. -void UnitTestImpl::SetTestPartResultReporterForCurrentThread( - TestPartResultReporterInterface* reporter) { - per_thread_test_part_result_reporter_.set(reporter); -} - -// Gets the number of successful test cases. -int UnitTestImpl::successful_test_case_count() const { - return CountIf(test_cases_, TestCasePassed); -} - -// Gets the number of failed test cases. -int UnitTestImpl::failed_test_case_count() const { - return CountIf(test_cases_, TestCaseFailed); -} - -// Gets the number of all test cases. -int UnitTestImpl::total_test_case_count() const { - return static_cast(test_cases_.size()); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTestImpl::test_case_to_run_count() const { - return CountIf(test_cases_, ShouldRunTestCase); -} - -// Gets the number of successful tests. -int UnitTestImpl::successful_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count); -} - -// Gets the number of failed tests. -int UnitTestImpl::failed_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count); -} - -// Gets the number of disabled tests. -int UnitTestImpl::disabled_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count); -} - -// Gets the number of all tests. -int UnitTestImpl::total_test_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::total_test_count); -} - -// Gets the number of tests that should run. -int UnitTestImpl::test_to_run_count() const { - return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count); -} - -// Returns the current OS stack trace as a String. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. 
The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// CurrentOsStackTraceExceptTop(1), Foo() will be included in the -// trace but Bar() and CurrentOsStackTraceExceptTop() won't. -String UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) { - (void)skip_count; - return String(""); -} - -// Returns the current time in milliseconds. -TimeInMillis GetTimeInMillis() { -#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) - // Difference between 1970-01-01 and 1601-01-01 in milliseconds. - // http://analogous.blogspot.com/2005/04/epoch.html - const TimeInMillis kJavaEpochToWinFileTimeDelta = - static_cast(116444736UL) * 100000UL; - const DWORD kTenthMicrosInMilliSecond = 10000; - - SYSTEMTIME now_systime; - FILETIME now_filetime; - ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use - // GetSystemTimeAsFileTime()? - GetSystemTime(&now_systime); - if (SystemTimeToFileTime(&now_systime, &now_filetime)) { - now_int64.LowPart = now_filetime.dwLowDateTime; - now_int64.HighPart = now_filetime.dwHighDateTime; - now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - - kJavaEpochToWinFileTimeDelta; - return now_int64.QuadPart; - } - return 0; -#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ - __timeb64 now; - -# ifdef _MSC_VER - - // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 - // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use - // SystemTimeToFileTime() -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996. - _ftime64(&now); -# pragma warning(pop) // Restores the warning state. -# else - - _ftime64(&now); - -# endif // _MSC_VER - - return static_cast(now.time) * 1000 + now.millitm; -#elif GTEST_HAS_GETTIMEOFDAY_ - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; -#else -# error "Don't know how to get the current time on your system." -#endif -} - -// Utilities - -// class String - -// Returns the input enclosed in double quotes if it's not NULL; -// otherwise returns "(null)". For example, "\"Hello\"" is returned -// for input "Hello". -// -// This is useful for printing a C string in the syntax of a literal. -// -// Known issue: escape sequences are not handled yet. -String String::ShowCStringQuoted(const char* c_str) { - return c_str ? String::Format("\"%s\"", c_str) : String("(null)"); -} - -// Copies at most length characters from str into a newly-allocated -// piece of memory of size length+1. The memory is allocated with new[]. -// A terminating null byte is written to the memory, and a pointer to it -// is returned. If str is NULL, NULL is returned. -static char* CloneString(const char* str, size_t length) { - if (str == NULL) { - return NULL; - } else { - char* const clone = new char[length + 1]; - posix::StrNCpy(clone, str, length); - clone[length] = '\0'; - return clone; - } -} - -// Clones a 0-terminated C string, allocating memory using new. The -// caller is responsible for deleting[] the return value. Returns the -// cloned string, or NULL if the input is NULL. -const char * String::CloneCString(const char* c_str) { - return (c_str == NULL) ? 
- NULL : CloneString(c_str, strlen(c_str)); -} - -#if GTEST_OS_WINDOWS_MOBILE -// Creates a UTF-16 wide string from the given ANSI string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the wide string, or NULL if the -// input is NULL. -LPCWSTR String::AnsiToUtf16(const char* ansi) { - if (!ansi) return NULL; - const int length = strlen(ansi); - const int unicode_length = - MultiByteToWideChar(CP_ACP, 0, ansi, length, - NULL, 0); - WCHAR* unicode = new WCHAR[unicode_length + 1]; - MultiByteToWideChar(CP_ACP, 0, ansi, length, - unicode, unicode_length); - unicode[unicode_length] = 0; - return unicode; -} - -// Creates an ANSI string from the given wide string, allocating -// memory using new. The caller is responsible for deleting the return -// value using delete[]. Returns the ANSI string, or NULL if the -// input is NULL. -const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { - if (!utf16_str) return NULL; - const int ansi_length = - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - NULL, 0, NULL, NULL); - char* ansi = new char[ansi_length + 1]; - WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, - ansi, ansi_length, NULL, NULL); - ansi[ansi_length] = 0; - return ansi; -} - -#endif // GTEST_OS_WINDOWS_MOBILE - -// Compares two C strings. Returns true iff they have the same content. -// -// Unlike strcmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CStringEquals(const char * lhs, const char * rhs) { - if ( lhs == NULL ) return rhs == NULL; - - if ( rhs == NULL ) return false; - - return strcmp(lhs, rhs) == 0; -} - -#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -// Converts an array of wide chars to a narrow string using the UTF-8 -// encoding, and streams the result to the given Message object. -static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, - Message* msg) { - // TODO(wan): consider allowing a testing::String object to - // contain '\0'. This will make it behave more like std::string, - // and will allow ToUtf8String() to return the correct encoding - // for '\0' s.t. we can get rid of the conditional here (and in - // several other places). - for (size_t i = 0; i != length; ) { // NOLINT - if (wstr[i] != L'\0') { - *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); - while (i != length && wstr[i] != L'\0') - i++; - } else { - *msg << '\0'; - i++; - } - } -} - -#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING - -} // namespace internal - -#if GTEST_HAS_STD_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::std::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING -// Converts the given wide string to a narrow string using the UTF-8 -// encoding, and streams the result to this Message object. -Message& Message::operator <<(const ::wstring& wstr) { - internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); - return *this; -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -// AssertionResult constructors. -// Used in EXPECT_TRUE/FALSE(assertion_result). -AssertionResult::AssertionResult(const AssertionResult& other) - : success_(other.success_), - message_(other.message_.get() != NULL ? 
- new ::std::string(*other.message_) : - static_cast< ::std::string*>(NULL)) { -} - -// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. -AssertionResult AssertionResult::operator!() const { - AssertionResult negation(!success_); - if (message_.get() != NULL) - negation << *message_; - return negation; -} - -// Makes a successful assertion result. -AssertionResult AssertionSuccess() { - return AssertionResult(true); -} - -// Makes a failed assertion result. -AssertionResult AssertionFailure() { - return AssertionResult(false); -} - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << message. -AssertionResult AssertionFailure(const Message& message) { - return AssertionFailure() << message; -} - -namespace internal { - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const String& expected_value, - const String& actual_value, - bool ignoring_case) { - Message msg; - msg << "Value of: " << actual_expression; - if (actual_value != actual_expression) { - msg << "\n Actual: " << actual_value; - } - - msg << "\nExpected: " << expected_expression; - if (ignoring_case) { - msg << " (ignoring case)"; - } - if (expected_value != expected_expression) { - msg << "\nWhich is: " << expected_value; - } - - return AssertionFailure() << msg; -} - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -String GetBoolAssertionFailureMessage(const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value) { - const char* actual_message = assertion_result.message(); - Message msg; - msg << "Value of: " << expression_text - << "\n Actual: " << actual_predicate_value; - if (actual_message[0] != '\0') - msg << " (" << actual_message << ")"; - msg << "\nExpected: " << expected_predicate_value; - return msg.GetString(); -} - -// Helper function for implementing ASSERT_NEAR. -AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error) { - const double diff = fabs(val1 - val2); - if (diff <= abs_error) return AssertionSuccess(); - - // TODO(wan): do not print the value of an expression if it's - // already a literal. - return AssertionFailure() - << "The difference between " << expr1 << " and " << expr2 - << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n" - << expr1 << " evaluates to " << val1 << ",\n" - << expr2 << " evaluates to " << val2 << ", and\n" - << abs_error_expr << " evaluates to " << abs_error << "."; -} - - -// Helper template for implementing FloatLE() and DoubleLE(). 
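For orientation, a sketch of how the assertion helpers above surface in user code (the test names here are invented): EXPECT_NEAR routes through DoubleNearPredFormat via EXPECT_PRED_FORMAT3, and a failing EXPECT_EQ prints the "Value of / Actual / Expected / Which is" layout assembled by EqFailure().

#include "gtest/gtest.h"

TEST(AssertionSketch, NearAndEq) {
  // Roughly equivalent to
  //   EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat,
  //                       3.14159, 3.1416, 1e-4);
  EXPECT_NEAR(3.14159, 3.1416, 1e-4);  // passes: |diff| = 1e-5 <= 1e-4

  const int foo = 5, bar = 5;
  // If bar were 6, EqFailure() would produce:
  //   Value of: bar
  //     Actual: 6
  //   Expected: foo
  //   Which is: 5
  EXPECT_EQ(foo, bar);
}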
-template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
-                                const char* expr2,
-                                RawType val1,
-                                RawType val2) {
-  // Returns success if val1 is less than val2,
-  if (val1 < val2) {
-    return AssertionSuccess();
-  }
-
-  // or if val1 is almost equal to val2.
-  const FloatingPoint<RawType> lhs(val1), rhs(val2);
-  if (lhs.AlmostEquals(rhs)) {
-    return AssertionSuccess();
-  }
-
-  // Note that the above two checks will both fail if either val1 or
-  // val2 is NaN, as the IEEE floating-point standard requires that
-  // any predicate involving a NaN must return false.
-
-  ::std::stringstream val1_ss;
-  val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val1;
-
-  ::std::stringstream val2_ss;
-  val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
-          << val2;
-
-  return AssertionFailure()
-      << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
-      << "  Actual: " << StringStreamToString(&val1_ss) << " vs "
-      << StringStreamToString(&val2_ss);
-}
-
-}  // namespace internal
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
-                        float val1, float val2) {
-  return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
-}
-
-// Asserts that val1 is less than, or almost equal to, val2.  Fails
-// otherwise.  In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
-                         double val1, double val2) {
-  return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
-}
-
-namespace internal {
-
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* expected_expression,
-                            const char* actual_expression,
-                            BiggestInt expected,
-                            BiggestInt actual) {
-  if (expected == actual) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(expected_expression,
-                   actual_expression,
-                   FormatForComparisonFailureMessage(expected, actual),
-                   FormatForComparisonFailureMessage(actual, expected),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
-// The helper function for {ASSERT|EXPECT}_STREQ.
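FloatLE and DoubleLE are exposed in the ::testing namespace precisely so they can be plugged into the two-argument predicate-format macros; a minimal sketch (test name invented):

#include "gtest/gtest.h"

TEST(FloatingPointLESketch, LessOrAlmostEqual) {
  // Succeeds when val1 < val2, or when the two values are within 4 ULPs
  // of each other (FloatingPoint<RawType>::AlmostEquals); fails for NaN.
  EXPECT_PRED_FORMAT2(::testing::FloatLE, 1.0f, 1.000001f);
  EXPECT_PRED_FORMAT2(::testing::DoubleLE, 0.999999, 1.0);
}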
-AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - String::ShowCStringQuoted(expected), - String::ShowCStringQuoted(actual), - false); -} - -// The helper function for {ASSERT|EXPECT}_STRCASEEQ. -AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual) { - if (String::CaseInsensitiveCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - String::ShowCStringQuoted(expected), - String::ShowCStringQuoted(actual), - true); -} - -// The helper function for {ASSERT|EXPECT}_STRNE. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -// The helper function for {ASSERT|EXPECT}_STRCASENE. -AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2) { - if (!String::CaseInsensitiveCStringEquals(s1, s2)) { - return AssertionSuccess(); - } else { - return AssertionFailure() - << "Expected: (" << s1_expression << ") != (" - << s2_expression << ") (ignoring case), actual: \"" - << s1 << "\" vs \"" << s2 << "\""; - } -} - -} // namespace internal - -namespace { - -// Helper functions for implementing IsSubString() and IsNotSubstring(). - -// This group of overloaded functions return true iff needle is a -// substring of haystack. NULL is considered a substring of itself -// only. - -bool IsSubstringPred(const char* needle, const char* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return strstr(haystack, needle) != NULL; -} - -bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) { - if (needle == NULL || haystack == NULL) - return needle == haystack; - - return wcsstr(haystack, needle) != NULL; -} - -// StringType here can be either ::std::string or ::std::wstring. -template -bool IsSubstringPred(const StringType& needle, - const StringType& haystack) { - return haystack.find(needle) != StringType::npos; -} - -// This function implements either IsSubstring() or IsNotSubstring(), -// depending on the value of the expected_to_be_substring parameter. -// StringType here can be const char*, const wchar_t*, ::std::string, -// or ::std::wstring. -template -AssertionResult IsSubstringImpl( - bool expected_to_be_substring, - const char* needle_expr, const char* haystack_expr, - const StringType& needle, const StringType& haystack) { - if (IsSubstringPred(needle, haystack) == expected_to_be_substring) - return AssertionSuccess(); - - const bool is_wide_string = sizeof(needle[0]) > 1; - const char* const begin_string_quote = is_wide_string ? "L\"" : "\""; - return AssertionFailure() - << "Value of: " << needle_expr << "\n" - << " Actual: " << begin_string_quote << needle << "\"\n" - << "Expected: " << (expected_to_be_substring ? 
"" : "not ") - << "a substring of " << haystack_expr << "\n" - << "Which is: " << begin_string_quote << haystack << "\""; -} - -} // namespace - -// IsSubstring() and IsNotSubstring() check whether needle is a -// substring of haystack (NULL is considered a substring of itself -// only), and return an appropriate error message when they fail. - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} - -#if GTEST_HAS_STD_WSTRING -AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); -} - -AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack) { - return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); -} -#endif // GTEST_HAS_STD_WSTRING - -namespace internal { - -#if GTEST_OS_WINDOWS - -namespace { - -// Helper function for IsHRESULT{SuccessFailure} predicates -AssertionResult HRESULTFailureHelper(const char* expr, - const char* expected, - long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE - - // Windows CE doesn't support FormatMessage. - const char error_text[] = ""; - -# else - - // Looks up the human-readable system message for the HRESULT code - // and since we're not passing any params to FormatMessage, we don't - // want inserts expanded. - const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | - FORMAT_MESSAGE_IGNORE_INSERTS; - const DWORD kBufSize = 4096; // String::Format can't exceed this length. - // Gets the system's human readable message string for this HRESULT. 
- char error_text[kBufSize] = { '\0' }; - DWORD message_length = ::FormatMessageA(kFlags, - 0, // no source, we're asking system - hr, // the error - 0, // no line width restrictions - error_text, // output buffer - kBufSize, // buf size - NULL); // no arguments for inserts - // Trims tailing white space (FormatMessage leaves a trailing cr-lf) - for (; message_length && IsSpace(error_text[message_length - 1]); - --message_length) { - error_text[message_length - 1] = '\0'; - } - -# endif // GTEST_OS_WINDOWS_MOBILE - - const String error_hex(String::Format("0x%08X ", hr)); - return ::testing::AssertionFailure() - << "Expected: " << expr << " " << expected << ".\n" - << " Actual: " << error_hex << error_text << "\n"; -} - -} // namespace - -AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT - if (SUCCEEDED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "succeeds", hr); -} - -AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT - if (FAILED(hr)) { - return AssertionSuccess(); - } - return HRESULTFailureHelper(expr, "fails", hr); -} - -#endif // GTEST_OS_WINDOWS - -// Utility functions for encoding Unicode text (wide strings) in -// UTF-8. - -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 -// like this: -// -// Code-point length Encoding -// 0 - 7 bits 0xxxxxxx -// 8 - 11 bits 110xxxxx 10xxxxxx -// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx -// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - -// The maximum code-point a one-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint1 = (static_cast(1) << 7) - 1; - -// The maximum code-point a two-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint2 = (static_cast(1) << (5 + 6)) - 1; - -// The maximum code-point a three-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint3 = (static_cast(1) << (4 + 2*6)) - 1; - -// The maximum code-point a four-byte UTF-8 sequence can represent. -const UInt32 kMaxCodePoint4 = (static_cast(1) << (3 + 3*6)) - 1; - -// Chops off the n lowest bits from a bit pattern. Returns the n -// lowest bits. As a side effect, the original bit pattern will be -// shifted to the right by n bits. -inline UInt32 ChopLowBits(UInt32* bits, int n) { - const UInt32 low_bits = *bits & ((static_cast(1) << n) - 1); - *bits >>= n; - return low_bits; -} - -// Converts a Unicode code point to a narrow string in UTF-8 encoding. -// code_point parameter is of type UInt32 because wchar_t may not be -// wide enough to contain a code point. -// The output buffer str must containt at least 32 characters. -// The function returns the address of the output buffer. -// If the code_point is not a valid Unicode code point -// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. 
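The two HRESULT predicates above back Google Test's Windows-only {ASSERT,EXPECT}_HRESULT_{SUCCEEDED,FAILED} macros. A Windows-only sketch; the COM call is merely an example of an expression returning an HRESULT:

#include "gtest/gtest.h"

#if GTEST_OS_WINDOWS
# include <objbase.h>

TEST(HresultSketch, ComInitialization) {
  // On failure, HRESULTFailureHelper() appends the hex code and the
  // FormatMessage() text to the assertion message.
  EXPECT_HRESULT_SUCCEEDED(::CoInitializeEx(NULL, COINIT_APARTMENTTHREADED));
  ::CoUninitialize();
}
#endif  // GTEST_OS_WINDOWS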
-char* CodePointToUtf8(UInt32 code_point, char* str) { - if (code_point <= kMaxCodePoint1) { - str[1] = '\0'; - str[0] = static_cast(code_point); // 0xxxxxxx - } else if (code_point <= kMaxCodePoint2) { - str[2] = '\0'; - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xC0 | code_point); // 110xxxxx - } else if (code_point <= kMaxCodePoint3) { - str[3] = '\0'; - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xE0 | code_point); // 1110xxxx - } else if (code_point <= kMaxCodePoint4) { - str[4] = '\0'; - str[3] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[2] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[1] = static_cast(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx - str[0] = static_cast(0xF0 | code_point); // 11110xxx - } else { - // The longest string String::Format can produce when invoked - // with these parameters is 28 character long (not including - // the terminating nul character). We are asking for 32 character - // buffer just in case. This is also enough for strncpy to - // null-terminate the destination string. - posix::StrNCpy( - str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(), 32); - str[31] = '\0'; // Makes sure no change in the format to strncpy leaves - // the result unterminated. - } - return str; -} - -// The following two functions only make sense if the the system -// uses UTF-16 for wide string encoding. All supported systems -// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. - -// Determines if the arguments constitute UTF-16 surrogate pair -// and thus should be combined into a single Unicode code point -// using CreateCodePointFromUtf16SurrogatePair. -inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) { - return sizeof(wchar_t) == 2 && - (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00; -} - -// Creates a Unicode code point from UTF16 surrogate pair. -inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first, - wchar_t second) { - const UInt32 mask = (1 << 10) - 1; - return (sizeof(wchar_t) == 2) ? - (((first & mask) << 10) | (second & mask)) + 0x10000 : - // This function should not be called when the condition is - // false, but we provide a sensible default in case it is. - static_cast(first); -} - -// Converts a wide string to a narrow string in UTF-8 encoding. -// The wide string is assumed to have the following encoding: -// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) -// UTF-32 if sizeof(wchar_t) == 4 (on Linux) -// Parameter str points to a null-terminated wide string. -// Parameter num_chars may additionally limit the number -// of wchar_t characters processed. -1 is used when the entire string -// should be processed. -// If the string contains code points that are not valid Unicode code points -// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output -// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding -// and contains invalid UTF-16 surrogate pairs, values in those pairs -// will be encoded as individual Unicode characters from Basic Normal Plane. 
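A worked example of the two-byte branch above; CodePointToUtf8 is an internal helper (declared in gtest-internal.h), so treat this as a sketch for tracing the arithmetic rather than sanctioned API use:

#include "gtest/gtest.h"

TEST(CodePointToUtf8Sketch, TwoByteSequence) {
  // U+00E9 ('é', 0xE9 = 233) exceeds kMaxCodePoint1 (127) but fits in
  // kMaxCodePoint2 (2047), so the two-byte branch runs:
  //   ChopLowBits(&cp, 6) -> 0x29, leaving cp == 0x03
  //   str[1] = 0x80 | 0x29 = 0xA9;  str[0] = 0xC0 | 0x03 = 0xC3
  char buf[32];  // the documented minimum buffer size
  testing::internal::CodePointToUtf8(0xE9, buf);
  EXPECT_STREQ("\xC3\xA9", buf);
}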
-String WideStringToUtf8(const wchar_t* str, int num_chars) { - if (num_chars == -1) - num_chars = static_cast(wcslen(str)); - - ::std::stringstream stream; - for (int i = 0; i < num_chars; ++i) { - UInt32 unicode_code_point; - - if (str[i] == L'\0') { - break; - } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) { - unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i], - str[i + 1]); - i++; - } else { - unicode_code_point = static_cast(str[i]); - } - - char buffer[32]; // CodePointToUtf8 requires a buffer this big. - stream << CodePointToUtf8(unicode_code_point, buffer); - } - return StringStreamToString(&stream); -} - -// Converts a wide C string to a String using the UTF-8 encoding. -// NULL will be converted to "(null)". -String String::ShowWideCString(const wchar_t * wide_c_str) { - if (wide_c_str == NULL) return String("(null)"); - - return String(internal::WideStringToUtf8(wide_c_str, -1).c_str()); -} - -// Similar to ShowWideCString(), except that this function encloses -// the converted string in double quotes. -String String::ShowWideCStringQuoted(const wchar_t* wide_c_str) { - if (wide_c_str == NULL) return String("(null)"); - - return String::Format("L\"%s\"", - String::ShowWideCString(wide_c_str).c_str()); -} - -// Compares two wide C strings. Returns true iff they have the same -// content. -// -// Unlike wcscmp(), this function can handle NULL argument(s). A NULL -// C string is considered different to any non-NULL C string, -// including the empty string. -bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - - return wcscmp(lhs, rhs) == 0; -} - -// Helper function for *_STREQ on wide strings. -AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual) { - if (String::WideCStringEquals(expected, actual)) { - return AssertionSuccess(); - } - - return EqFailure(expected_expression, - actual_expression, - String::ShowWideCStringQuoted(expected), - String::ShowWideCStringQuoted(actual), - false); -} - -// Helper function for *_STRNE on wide strings. -AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2) { - if (!String::WideCStringEquals(s1, s2)) { - return AssertionSuccess(); - } - - return AssertionFailure() << "Expected: (" << s1_expression << ") != (" - << s2_expression << "), actual: " - << String::ShowWideCStringQuoted(s1) - << " vs " << String::ShowWideCStringQuoted(s2); -} - -// Compares two C strings, ignoring case. Returns true iff they have -// the same content. -// -// Unlike strcasecmp(), this function can handle NULL argument(s). A -// NULL C string is considered different to any non-NULL C string, -// including the empty string. -bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { - if (lhs == NULL) - return rhs == NULL; - if (rhs == NULL) - return false; - return posix::StrCaseCmp(lhs, rhs) == 0; -} - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. 
- // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. -bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs) { - if (lhs == NULL) return rhs == NULL; - - if (rhs == NULL) return false; - -#if GTEST_OS_WINDOWS - return _wcsicmp(lhs, rhs) == 0; -#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID - return wcscasecmp(lhs, rhs) == 0; -#else - // Android, Mac OS X and Cygwin don't define wcscasecmp. - // Other unknown OSes may not define it either. - wint_t left, right; - do { - left = towlower(*lhs++); - right = towlower(*rhs++); - } while (left && left == right); - return left == right; -#endif // OS selector -} - -// Compares this with another String. -// Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0 -// if this is greater than rhs. -int String::Compare(const String & rhs) const { - const char* const lhs_c_str = c_str(); - const char* const rhs_c_str = rhs.c_str(); - - if (lhs_c_str == NULL) { - return rhs_c_str == NULL ? 0 : -1; // NULL < anything except NULL - } else if (rhs_c_str == NULL) { - return 1; - } - - const size_t shorter_str_len = - length() <= rhs.length() ? length() : rhs.length(); - for (size_t i = 0; i != shorter_str_len; i++) { - if (lhs_c_str[i] < rhs_c_str[i]) { - return -1; - } else if (lhs_c_str[i] > rhs_c_str[i]) { - return 1; - } - } - return (length() < rhs.length()) ? -1 : - (length() > rhs.length()) ? 1 : 0; -} - -// Returns true iff this String ends with the given suffix. *Any* -// String is considered to end with a NULL or empty suffix. -bool String::EndsWith(const char* suffix) const { - if (suffix == NULL || CStringEquals(suffix, "")) return true; - - if (c_str() == NULL) return false; - - const size_t this_len = strlen(c_str()); - const size_t suffix_len = strlen(suffix); - return (this_len >= suffix_len) && - CStringEquals(c_str() + this_len - suffix_len, suffix); -} - -// Returns true iff this String ends with the given suffix, ignoring case. -// Any String is considered to end with a NULL or empty suffix. -bool String::EndsWithCaseInsensitive(const char* suffix) const { - if (suffix == NULL || CStringEquals(suffix, "")) return true; - - if (c_str() == NULL) return false; - - const size_t this_len = strlen(c_str()); - const size_t suffix_len = strlen(suffix); - return (this_len >= suffix_len) && - CaseInsensitiveCStringEquals(c_str() + this_len - suffix_len, suffix); -} - -// Formats a list of arguments to a String, using the same format -// spec string as for printf. -// -// We do not use the StringPrintf class as it is not universally -// available. -// -// The result is limited to 4096 characters (including the tailing 0). -// If 4096 characters are not enough to format the input, or if -// there's an error, "" is -// returned. -String String::Format(const char * format, ...) { - va_list args; - va_start(args, format); - - char buffer[4096]; - const int kBufferSize = sizeof(buffer)/sizeof(buffer[0]); - - // MSVC 8 deprecates vsnprintf(), so we want to suppress warning - // 4996 (deprecated function) there. -#ifdef _MSC_VER // We are using MSVC. -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4996) // Temporarily disables warning 4996. 
- - const int size = vsnprintf(buffer, kBufferSize, format, args); - -# pragma warning(pop) // Restores the warning state. -#else // We are not using MSVC. - const int size = vsnprintf(buffer, kBufferSize, format, args); -#endif // _MSC_VER - va_end(args); - - // vsnprintf()'s behavior is not portable. When the buffer is not - // big enough, it returns a negative value in MSVC, and returns the - // needed buffer size on Linux. When there is an output error, it - // always returns a negative value. For simplicity, we lump the two - // error cases together. - if (size < 0 || size >= kBufferSize) { - return String(""); - } else { - return String(buffer, size); - } -} - -// Converts the buffer in a stringstream to a String, converting NUL -// bytes to "\\0" along the way. -String StringStreamToString(::std::stringstream* ss) { - const ::std::string& str = ss->str(); - const char* const start = str.c_str(); - const char* const end = start + str.length(); - - // We need to use a helper stringstream to do this transformation - // because String doesn't support push_back(). - ::std::stringstream helper; - for (const char* ch = start; ch != end; ++ch) { - if (*ch == '\0') { - helper << "\\0"; // Replaces NUL with "\\0"; - } else { - helper.put(*ch); - } - } - - return String(helper.str().c_str()); -} - -// Appends the user-supplied message to the Google-Test-generated message. -String AppendUserMessage(const String& gtest_msg, - const Message& user_msg) { - // Appends the user message if it's non-empty. - const String user_msg_string = user_msg.GetString(); - if (user_msg_string.empty()) { - return gtest_msg; - } - - Message msg; - msg << gtest_msg << "\n" << user_msg_string; - - return msg.GetString(); -} - -} // namespace internal - -// class TestResult - -// Creates an empty TestResult. -TestResult::TestResult() - : death_test_count_(0), - elapsed_time_(0) { -} - -// D'tor. -TestResult::~TestResult() { -} - -// Returns the i-th test part result among all the results. i can -// range from 0 to total_part_count() - 1. If i is not in that range, -// aborts the program. -const TestPartResult& TestResult::GetTestPartResult(int i) const { - if (i < 0 || i >= total_part_count()) - internal::posix::Abort(); - return test_part_results_.at(i); -} - -// Returns the i-th test property. i can range from 0 to -// test_property_count() - 1. If i is not in that range, aborts the -// program. -const TestProperty& TestResult::GetTestProperty(int i) const { - if (i < 0 || i >= test_property_count()) - internal::posix::Abort(); - return test_properties_.at(i); -} - -// Clears the test part results. -void TestResult::ClearTestPartResults() { - test_part_results_.clear(); -} - -// Adds a test part result to the list. -void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { - test_part_results_.push_back(test_part_result); -} - -// Adds a test property to the list. If a property with the same key as the -// supplied property is already represented, the value of this test_property -// replaces the old value for that key. 
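A small sketch of the two string utilities above (both live in testing::internal and are used throughout this file; the file name and line number below are made up):

#include <cstdio>
#include <sstream>
#include "gtest/gtest.h"

void StringUtilitySketch() {
  using testing::internal::String;
  using testing::internal::StringStreamToString;

  // printf-style, but limited to a 4096-byte buffer; an oversized or
  // failed format yields an error placeholder instead of overflowing.
  const String where = String::Format("%s:%d", "widget_test.cc", 42);

  // Embedded NUL bytes survive as the two visible characters "\0".
  ::std::stringstream ss;
  ss << "ab" << '\0' << "cd";
  const String printable = StringStreamToString(&ss);  // a b \ 0 c d

  std::printf("%s %s\n", where.c_str(), printable.c_str());
}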
-void TestResult::RecordProperty(const TestProperty& test_property) { - if (!ValidateTestProperty(test_property)) { - return; - } - internal::MutexLock lock(&test_properites_mutex_); - const std::vector::iterator property_with_matching_key = - std::find_if(test_properties_.begin(), test_properties_.end(), - internal::TestPropertyKeyIs(test_property.key())); - if (property_with_matching_key == test_properties_.end()) { - test_properties_.push_back(test_property); - return; - } - property_with_matching_key->SetValue(test_property.value()); -} - -// Adds a failure if the key is a reserved attribute of Google Test -// testcase tags. Returns true if the property is valid. -bool TestResult::ValidateTestProperty(const TestProperty& test_property) { - internal::String key(test_property.key()); - if (key == "name" || key == "status" || key == "time" || key == "classname") { - ADD_FAILURE() - << "Reserved key used in RecordProperty(): " - << key - << " ('name', 'status', 'time', and 'classname' are reserved by " - << GTEST_NAME_ << ")"; - return false; - } - return true; -} - -// Clears the object. -void TestResult::Clear() { - test_part_results_.clear(); - test_properties_.clear(); - death_test_count_ = 0; - elapsed_time_ = 0; -} - -// Returns true iff the test failed. -bool TestResult::Failed() const { - for (int i = 0; i < total_part_count(); ++i) { - if (GetTestPartResult(i).failed()) - return true; - } - return false; -} - -// Returns true iff the test part fatally failed. -static bool TestPartFatallyFailed(const TestPartResult& result) { - return result.fatally_failed(); -} - -// Returns true iff the test fatally failed. -bool TestResult::HasFatalFailure() const { - return CountIf(test_part_results_, TestPartFatallyFailed) > 0; -} - -// Returns true iff the test part non-fatally failed. -static bool TestPartNonfatallyFailed(const TestPartResult& result) { - return result.nonfatally_failed(); -} - -// Returns true iff the test has a non-fatal failure. -bool TestResult::HasNonfatalFailure() const { - return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; -} - -// Gets the number of all test parts. This is the sum of the number -// of successful test parts and the number of failed test parts. -int TestResult::total_part_count() const { - return static_cast(test_part_results_.size()); -} - -// Returns the number of the test properties. -int TestResult::test_property_count() const { - return static_cast(test_properties_.size()); -} - -// class Test - -// Creates a Test object. - -// The c'tor saves the values of all Google Test flags. -Test::Test() - : gtest_flag_saver_(new internal::GTestFlagSaver) { -} - -// The d'tor restores the values of all Google Test flags. -Test::~Test() { - delete gtest_flag_saver_; -} - -// Sets up the test fixture. -// -// A sub-class may override this. -void Test::SetUp() { -} - -// Tears down the test fixture. -// -// A sub-class may override this. -void Test::TearDown() { -} - -// Allows user supplied key value pairs to be recorded for later output. -void Test::RecordProperty(const char* key, const char* value) { - UnitTest::GetInstance()->RecordPropertyForCurrentTest(key, value); -} - -// Allows user supplied key value pairs to be recorded for later output. 
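The RecordProperty() plumbing above is reached from user code through Test::RecordProperty(); a sketch (key names invented) of what ends up as attributes on the testcase element in --gtest_output=xml:

#include "gtest/gtest.h"

TEST(RecordPropertySketch, EmitsXmlAttributes) {
  RecordProperty("build_flavor", "debug");  // const char* overload
  RecordProperty("retry_count", 2);         // int overload, defined just below

  // Reserved keys are rejected by ValidateTestProperty() with ADD_FAILURE():
  // RecordProperty("name", "x") would fail this test.
}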
-void Test::RecordProperty(const char* key, int value) { - Message value_message; - value_message << value; - RecordProperty(key, value_message.GetString().c_str()); -} - -namespace internal { - -void ReportFailureInUnknownLocation(TestPartResult::Type result_type, - const String& message) { - // This function is a friend of UnitTest and as such has access to - // AddTestPartResult. - UnitTest::GetInstance()->AddTestPartResult( - result_type, - NULL, // No info about the source file where the exception occurred. - -1, // We have no info on which line caused the exception. - message, - String()); // No stack trace, either. -} - -} // namespace internal - -// Google Test requires all tests in the same test case to use the same test -// fixture class. This function checks if the current test has the -// same fixture class as the first test in the current test case. If -// yes, it returns true; otherwise it generates a Google Test failure and -// returns false. -bool Test::HasSameFixtureClass() { - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - const TestCase* const test_case = impl->current_test_case(); - - // Info about the first test in the current test case. - const TestInfo* const first_test_info = test_case->test_info_list()[0]; - const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; - const char* const first_test_name = first_test_info->name(); - - // Info about the current test. - const TestInfo* const this_test_info = impl->current_test_info(); - const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; - const char* const this_test_name = this_test_info->name(); - - if (this_fixture_id != first_fixture_id) { - // Is the first test defined using TEST? - const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); - // Is this test defined using TEST? - const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); - - if (first_is_TEST || this_is_TEST) { - // The user mixed TEST and TEST_F in this test case - we'll tell - // him/her how to fix it. - - // Gets the name of the TEST and the name of the TEST_F. Note - // that first_is_TEST and this_is_TEST cannot both be true, as - // the fixture IDs are different for the two tests. - const char* const TEST_name = - first_is_TEST ? first_test_name : this_test_name; - const char* const TEST_F_name = - first_is_TEST ? this_test_name : first_test_name; - - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class, so mixing TEST_F and TEST in the same test case is\n" - << "illegal. In test case " << this_test_info->test_case_name() - << ",\n" - << "test " << TEST_F_name << " is defined using TEST_F but\n" - << "test " << TEST_name << " is defined using TEST. You probably\n" - << "want to change the TEST to TEST_F or move it to another test\n" - << "case."; - } else { - // The user defined two fixture classes with the same name in - // two namespaces - we'll tell him/her how to fix it. - ADD_FAILURE() - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " - << this_test_info->test_case_name() << ",\n" - << "you defined test " << first_test_name - << " and test " << this_test_name << "\n" - << "using two different test fixture classes. This can happen if\n" - << "the two classes are from different namespaces or translation\n" - << "units and have the same name. 
You should probably rename one\n" - << "of the classes to put the tests into different test cases."; - } - return false; - } - - return true; -} - -#if GTEST_HAS_SEH - -// Adds an "exception thrown" fatal failure to the current test. This -// function returns its result via an output parameter pointer because VC++ -// prohibits creation of objects with destructors on stack in functions -// using __try (see error C2712). -static internal::String* FormatSehExceptionMessage(DWORD exception_code, - const char* location) { - Message message; - message << "SEH exception with code 0x" << std::setbase(16) << - exception_code << std::setbase(10) << " thrown in " << location << "."; - - return new internal::String(message.GetString()); -} - -#endif // GTEST_HAS_SEH - -#if GTEST_HAS_EXCEPTIONS - -// Adds an "exception thrown" fatal failure to the current test. -static internal::String FormatCxxExceptionMessage(const char* description, - const char* location) { - Message message; - if (description != NULL) { - message << "C++ exception with description \"" << description << "\""; - } else { - message << "Unknown C++ exception"; - } - message << " thrown in " << location << "."; - - return message.GetString(); -} - -static internal::String PrintTestPartResultToString( - const TestPartResult& test_part_result); - -// A failed Google Test assertion will throw an exception of this type when -// GTEST_FLAG(throw_on_failure) is true (if exceptions are enabled). We -// derive it from std::runtime_error, which is for errors presumably -// detectable only at run time. Since std::runtime_error inherits from -// std::exception, many testing frameworks know how to extract and print the -// message inside it. -class GoogleTestFailureException : public ::std::runtime_error { - public: - explicit GoogleTestFailureException(const TestPartResult& failure) - : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} -}; -#endif // GTEST_HAS_EXCEPTIONS - -namespace internal { -// We put these helper functions in the internal namespace as IBM's xlC -// compiler rejects the code if they were declared static. - -// Runs the given method and handles SEH exceptions it throws, when -// SEH is supported; returns the 0-value for type Result in case of an -// SEH exception. (Microsoft compilers cannot handle SEH and C++ -// exceptions in the same function. Therefore, we provide a separate -// wrapper function for handling SEH exceptions.) -template -Result HandleSehExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { -#if GTEST_HAS_SEH - __try { - return (object->*method)(); - } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT - GetExceptionCode())) { - // We create the exception message on the heap because VC++ prohibits - // creation of objects with destructors on stack in functions using __try - // (see error C2712). - internal::String* exception_message = FormatSehExceptionMessage( - GetExceptionCode(), location); - internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, - *exception_message); - delete exception_message; - return static_cast(0); - } -#else - (void)location; - return (object->*method)(); -#endif // GTEST_HAS_SEH -} - -// Runs the given method and catches and reports C++ and/or SEH-style -// exceptions, if they are supported; returns the 0-value for type -// Result in case of an SEH exception. 
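For context, a deliberately failing sketch of what the exception machinery above reports when a test body lets a C++ exception escape and --gtest_catch_exceptions is left at its default of 1:

#include <stdexcept>
#include "gtest/gtest.h"

TEST(EscapingExceptionSketch, ReportedAsFatalFailure) {
  // Caught by the wrapper defined just below and reported through
  // FormatCxxExceptionMessage() as, roughly:
  //   C++ exception with description "boom" thrown in the test body.
  throw std::runtime_error("boom");
}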
-template -Result HandleExceptionsInMethodIfSupported( - T* object, Result (T::*method)(), const char* location) { - // NOTE: The user code can affect the way in which Google Test handles - // exceptions by setting GTEST_FLAG(catch_exceptions), but only before - // RUN_ALL_TESTS() starts. It is technically possible to check the flag - // after the exception is caught and either report or re-throw the - // exception based on the flag's value: - // - // try { - // // Perform the test method. - // } catch (...) { - // if (GTEST_FLAG(catch_exceptions)) - // // Report the exception as failure. - // else - // throw; // Re-throws the original exception. - // } - // - // However, the purpose of this flag is to allow the program to drop into - // the debugger when the exception is thrown. On most platforms, once the - // control enters the catch block, the exception origin information is - // lost and the debugger will stop the program at the point of the - // re-throw in this function -- instead of at the point of the original - // throw statement in the code under test. For this reason, we perform - // the check early, sacrificing the ability to affect Google Test's - // exception handling in the method where the exception is thrown. - if (internal::GetUnitTestImpl()->catch_exceptions()) { -#if GTEST_HAS_EXCEPTIONS - try { - return HandleSehExceptionsInMethodIfSupported(object, method, location); - } catch (const GoogleTestFailureException&) { // NOLINT - // This exception doesn't originate in code under test. It makes no - // sense to report it as a test failure. - throw; - } catch (const std::exception& e) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(e.what(), location)); - } catch (...) { // NOLINT - internal::ReportFailureInUnknownLocation( - TestPartResult::kFatalFailure, - FormatCxxExceptionMessage(NULL, location)); - } - return static_cast(0); -#else - return HandleSehExceptionsInMethodIfSupported(object, method, location); -#endif // GTEST_HAS_EXCEPTIONS - } else { - return (object->*method)(); - } -} - -} // namespace internal - -// Runs the test and updates the test result. -void Test::Run() { - if (!HasSameFixtureClass()) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); - // We will run the test only if SetUp() was successful. - if (!HasFatalFailure()) { - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TestBody, "the test body"); - } - - // However, we want to clean up as much as possible. Hence we will - // always call TearDown(), even if SetUp() or the test body has - // failed. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &Test::TearDown, "TearDown()"); -} - -// Returns true iff the current test has a fatal failure. -bool Test::HasFatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); -} - -// Returns true iff the current test has a non-fatal failure. -bool Test::HasNonfatalFailure() { - return internal::GetUnitTestImpl()->current_test_result()-> - HasNonfatalFailure(); -} - -// class TestInfo - -// Constructs a TestInfo object. It assumes ownership of the test factory -// object. 
-// TODO(vladl@google.com): Make a_test_case_name and a_name const string&'s -// to signify they cannot be NULLs. -TestInfo::TestInfo(const char* a_test_case_name, - const char* a_name, - const char* a_type_param, - const char* a_value_param, - internal::TypeId fixture_class_id, - internal::TestFactoryBase* factory) - : test_case_name_(a_test_case_name), - name_(a_name), - type_param_(a_type_param ? new std::string(a_type_param) : NULL), - value_param_(a_value_param ? new std::string(a_value_param) : NULL), - fixture_class_id_(fixture_class_id), - should_run_(false), - is_disabled_(false), - matches_filter_(false), - factory_(factory), - result_() {} - -// Destructs a TestInfo object. -TestInfo::~TestInfo() { delete factory_; } - -namespace internal { - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param: the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param: text representation of the test's value parameter, -// or NULL if this is not a value-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory) { - TestInfo* const test_info = - new TestInfo(test_case_name, name, type_param, value_param, - fixture_class_id, factory); - GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); - return test_info; -} - -#if GTEST_HAS_PARAM_TEST -void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line) { - Message errors; - errors - << "Attempted redefinition of test case " << test_case_name << ".\n" - << "All tests in the same test case must use the same test fixture\n" - << "class. However, in test case " << test_case_name << ", you tried\n" - << "to define a test using a fixture class different from the one\n" - << "used earlier. This can happen if the two fixture classes are\n" - << "from different namespaces and have the same name. You should\n" - << "probably rename one of the classes to put the tests into different\n" - << "test cases."; - - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors.GetString().c_str()); -} -#endif // GTEST_HAS_PARAM_TEST - -} // namespace internal - -namespace { - -// A predicate that checks the test name of a TestInfo against a known -// value. -// -// This is used for implementation of the TestCase class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestNameIs is copyable. -class TestNameIs { - public: - // Constructor. - // - // TestNameIs has NO default constructor. - explicit TestNameIs(const char* name) - : name_(name) {} - - // Returns true iff the test name of test_info matches name_. 
- bool operator()(const TestInfo * test_info) const { - return test_info && internal::String(test_info->name()).Compare(name_) == 0; - } - - private: - internal::String name_; -}; - -} // namespace - -namespace internal { - -// This method expands all parameterized tests registered with macros TEST_P -// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. -// This will be done just once during the program runtime. -void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST - if (!parameterized_tests_registered_) { - parameterized_test_registry_.RegisterTests(); - parameterized_tests_registered_ = true; - } -#endif -} - -} // namespace internal - -// Creates the test object, runs it, records its result, and then -// deletes it. -void TestInfo::Run() { - if (!should_run_) return; - - // Tells UnitTest where to store test result. - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_info(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - // Notifies the unit test event listeners that a test is about to start. - repeater->OnTestStart(*this); - - const TimeInMillis start = internal::GetTimeInMillis(); - - impl->os_stack_trace_getter()->UponLeavingGTest(); - - // Creates the test object. - Test* const test = internal::HandleExceptionsInMethodIfSupported( - factory_, &internal::TestFactoryBase::CreateTest, - "the test fixture's constructor"); - - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { - // This doesn't throw as all user code that can throw are wrapped into - // exception handling code. - test->Run(); - } - - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); - - result_.set_elapsed_time(internal::GetTimeInMillis() - start); - - // Notifies the unit test event listener that a test has just finished. - repeater->OnTestEnd(*this); - - // Tells UnitTest to stop associating assertion results to this - // test. - impl->set_current_test_info(NULL); -} - -// class TestCase - -// Gets the number of successful tests in this test case. -int TestCase::successful_test_count() const { - return CountIf(test_info_list_, TestPassed); -} - -// Gets the number of failed tests in this test case. -int TestCase::failed_test_count() const { - return CountIf(test_info_list_, TestFailed); -} - -int TestCase::disabled_test_count() const { - return CountIf(test_info_list_, TestDisabled); -} - -// Get the number of tests in this test case that should run. -int TestCase::test_to_run_count() const { - return CountIf(test_info_list_, ShouldRunTest); -} - -// Gets the number of all tests. -int TestCase::total_test_count() const { - return static_cast(test_info_list_.size()); -} - -// Creates a TestCase with the given name. -// -// Arguments: -// -// name: name of the test case -// a_type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase::TestCase(const char* a_name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) - : name_(a_name), - type_param_(a_type_param ? 
new std::string(a_type_param) : NULL), - set_up_tc_(set_up_tc), - tear_down_tc_(tear_down_tc), - should_run_(false), - elapsed_time_(0) { -} - -// Destructor of TestCase. -TestCase::~TestCase() { - // Deletes every Test in the collection. - ForEach(test_info_list_, internal::Delete); -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -const TestInfo* TestCase::GetTestInfo(int i) const { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Returns the i-th test among all the tests. i can range from 0 to -// total_test_count() - 1. If i is not in that range, returns NULL. -TestInfo* TestCase::GetMutableTestInfo(int i) { - const int index = GetElementOr(test_indices_, i, -1); - return index < 0 ? NULL : test_info_list_[index]; -} - -// Adds a test to this test case. Will delete the test upon -// destruction of the TestCase object. -void TestCase::AddTestInfo(TestInfo * test_info) { - test_info_list_.push_back(test_info); - test_indices_.push_back(static_cast(test_indices_.size())); -} - -// Runs every test in this TestCase. -void TestCase::Run() { - if (!should_run_) return; - - internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); - impl->set_current_test_case(this); - - TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); - - repeater->OnTestCaseStart(*this); - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); - - const internal::TimeInMillis start = internal::GetTimeInMillis(); - for (int i = 0; i < total_test_count(); i++) { - GetMutableTestInfo(i)->Run(); - } - elapsed_time_ = internal::GetTimeInMillis() - start; - - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); - - repeater->OnTestCaseEnd(*this); - impl->set_current_test_case(NULL); -} - -// Clears the results of all tests in this test case. -void TestCase::ClearResult() { - ForEach(test_info_list_, TestInfo::ClearTestResult); -} - -// Shuffles the tests in this test case. -void TestCase::ShuffleTests(internal::Random* random) { - Shuffle(random, &test_indices_); -} - -// Restores the test order to before the first shuffle. -void TestCase::UnshuffleTests() { - for (size_t i = 0; i < test_indices_.size(); i++) { - test_indices_[i] = static_cast(i); - } -} - -// Formats a countable noun. Depending on its quantity, either the -// singular form or the plural form is used. e.g. -// -// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". -// FormatCountableNoun(5, "book", "books") returns "5 books". -static internal::String FormatCountableNoun(int count, - const char * singular_form, - const char * plural_form) { - return internal::String::Format("%d %s", count, - count == 1 ? singular_form : plural_form); -} - -// Formats the count of tests. -static internal::String FormatTestCount(int test_count) { - return FormatCountableNoun(test_count, "test", "tests"); -} - -// Formats the count of test cases. -static internal::String FormatTestCaseCount(int test_case_count) { - return FormatCountableNoun(test_case_count, "test case", "test cases"); -} - -// Converts a TestPartResult::Type enum to human-friendly string -// representation. 
Both kNonFatalFailure and kFatalFailure are translated -// to "Failure", as the user usually doesn't care about the difference -// between the two when viewing the test result. -static const char * TestPartResultTypeToString(TestPartResult::Type type) { - switch (type) { - case TestPartResult::kSuccess: - return "Success"; - - case TestPartResult::kNonFatalFailure: - case TestPartResult::kFatalFailure: -#ifdef _MSC_VER - return "error: "; -#else - return "Failure\n"; -#endif - default: - return "Unknown result type"; - } -} - -// Prints a TestPartResult to a String. -static internal::String PrintTestPartResultToString( - const TestPartResult& test_part_result) { - return (Message() - << internal::FormatFileLocation(test_part_result.file_name(), - test_part_result.line_number()) - << " " << TestPartResultTypeToString(test_part_result.type()) - << test_part_result.message()).GetString(); -} - -// Prints a TestPartResult. -static void PrintTestPartResult(const TestPartResult& test_part_result) { - const internal::String& result = - PrintTestPartResultToString(test_part_result); - printf("%s\n", result.c_str()); - fflush(stdout); - // If the test program runs in Visual Studio or a debugger, the - // following statements add the test part result message to the Output - // window such that the user can double-click on it to jump to the - // corresponding source code location; otherwise they do nothing. -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - // We don't call OutputDebugString*() on Windows Mobile, as printing - // to stdout is done by OutputDebugString() there already - we don't - // want the same message printed twice. - ::OutputDebugStringA(result.c_str()); - ::OutputDebugStringA("\n"); -#endif -} - -// class PrettyUnitTestResultPrinter - -namespace internal { - -enum GTestColor { - COLOR_DEFAULT, - COLOR_RED, - COLOR_GREEN, - COLOR_YELLOW -}; - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { - switch (color) { - case COLOR_RED: return FOREGROUND_RED; - case COLOR_GREEN: return FOREGROUND_GREEN; - case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; - default: return 0; - } -} - -#else - -// Returns the ANSI color code for the given color. COLOR_DEFAULT is -// an invalid input. -const char* GetAnsiColorCode(GTestColor color) { - switch (color) { - case COLOR_RED: return "1"; - case COLOR_GREEN: return "2"; - case COLOR_YELLOW: return "3"; - default: return NULL; - }; -} - -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - -// Returns true iff Google Test should use colors in the output. -bool ShouldUseColor(bool stdout_is_tty) { - const char* const gtest_color = GTEST_FLAG(color).c_str(); - - if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS - // On Windows the TERM variable is usually not set, but the - // console there does support colors. - return stdout_is_tty; -#else - // On non-Windows platforms, we rely on the TERM variable. 
- const char* const term = posix::GetEnv("TERM"); - const bool term_supports_color = - String::CStringEquals(term, "xterm") || - String::CStringEquals(term, "xterm-color") || - String::CStringEquals(term, "xterm-256color") || - String::CStringEquals(term, "screen") || - String::CStringEquals(term, "linux") || - String::CStringEquals(term, "cygwin"); - return stdout_is_tty && term_supports_color; -#endif // GTEST_OS_WINDOWS - } - - return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || - String::CaseInsensitiveCStringEquals(gtest_color, "true") || - String::CaseInsensitiveCStringEquals(gtest_color, "t") || - String::CStringEquals(gtest_color, "1"); - // We take "yes", "true", "t", and "1" as meaning "yes". If the - // value is neither one of these nor "auto", we treat it as "no" to - // be conservative. -} - -// Helpers for printing colored strings to stdout. Note that on Windows, we -// cannot simply emit special characters and have the terminal change colors. -// This routine must actually emit the characters rather than return a string -// that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { - va_list args; - va_start(args, fmt); - -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - const bool use_color = false; -#else - static const bool in_color_mode = - ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); - const bool use_color = in_color_mode && (color != COLOR_DEFAULT); -#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS - // The '!= 0' comparison is necessary to satisfy MSVC 7.1. - - if (!use_color) { - vprintf(fmt, args); - va_end(args); - return; - } - -#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); - - // Gets the current text color. - CONSOLE_SCREEN_BUFFER_INFO buffer_info; - GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); - const WORD old_color_attrs = buffer_info.wAttributes; - - // We need to flush the stream buffers into the console before each - // SetConsoleTextAttribute call lest it affect the text that is already - // printed but has not yet reached the console. - fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); - vprintf(fmt, args); - - fflush(stdout); - // Restores the text color. - SetConsoleTextAttribute(stdout_handle, old_color_attrs); -#else - printf("\033[0;3%sm", GetAnsiColorCode(color)); - vprintf(fmt, args); - printf("\033[m"); // Resets the terminal to default. -#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE - va_end(args); -} - -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { - const char* const type_param = test_info.type_param(); - const char* const value_param = test_info.value_param(); - - if (type_param != NULL || value_param != NULL) { - printf(", where "); - if (type_param != NULL) { - printf("TypeParam = %s", type_param); - if (value_param != NULL) - printf(" and "); - } - if (value_param != NULL) { - printf("GetParam() = %s", value_param); - } - } -} - -// This class implements the TestEventListener interface. -// -// Class PrettyUnitTestResultPrinter is copyable. -class PrettyUnitTestResultPrinter : public TestEventListener { - public: - PrettyUnitTestResultPrinter() {} - static void PrintTestName(const char * test_case, const char * test) { - printf("%s.%s", test_case, test); - } - - // The following methods override what's in the TestEventListener class. 
- virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} - - private: - static void PrintFailedTests(const UnitTest& unit_test); - - internal::String test_case_name_; -}; - - // Fired before each iteration of tests starts. -void PrettyUnitTestResultPrinter::OnTestIterationStart( - const UnitTest& unit_test, int iteration) { - if (GTEST_FLAG(repeat) != 1) - printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); - - const char* const filter = GTEST_FLAG(filter).c_str(); - - // Prints the filter if it's not *. This reminds the user that some - // tests may be skipped. - if (!internal::String::CStringEquals(filter, kUniversalFilter)) { - ColoredPrintf(COLOR_YELLOW, - "Note: %s filter = %s\n", GTEST_NAME_, filter); - } - - if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { - const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); - ColoredPrintf(COLOR_YELLOW, - "Note: This is test shard %d of %s.\n", - static_cast(shard_index) + 1, - internal::posix::GetEnv(kTestTotalShards)); - } - - if (GTEST_FLAG(shuffle)) { - ColoredPrintf(COLOR_YELLOW, - "Note: Randomizing tests' orders with a seed of %d .\n", - unit_test.random_seed()); - } - - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("Running %s from %s.\n", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment set-up.\n"); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { - test_case_name_ = test_case.name(); - const internal::String counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s", counts.c_str(), test_case_name_.c_str()); - if (test_case.type_param() == NULL) { - printf("\n"); - } else { - printf(", where TypeParam = %s\n", test_case.type_param()); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { - ColoredPrintf(COLOR_GREEN, "[ RUN ] "); - PrintTestName(test_case_name_.c_str(), test_info.name()); - printf("\n"); - fflush(stdout); -} - -// Called after an assertion failure. -void PrettyUnitTestResultPrinter::OnTestPartResult( - const TestPartResult& result) { - // If the test part succeeded, we don't need to do anything. - if (result.type() == TestPartResult::kSuccess) - return; - - // Print failure message from the assertion (e.g. expected this and got that). 
- PrintTestPartResult(result); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { - if (test_info.result()->Passed()) { - ColoredPrintf(COLOR_GREEN, "[ OK ] "); - } else { - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - } - PrintTestName(test_case_name_.c_str(), test_info.name()); - if (test_info.result()->Failed()) - PrintFullTestCommentIfPresent(test_info); - - if (GTEST_FLAG(print_time)) { - printf(" (%s ms)\n", internal::StreamableToString( - test_info.result()->elapsed_time()).c_str()); - } else { - printf("\n"); - } - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { - if (!GTEST_FLAG(print_time)) return; - - test_case_name_ = test_case.name(); - const internal::String counts = - FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("%s from %s (%s ms total)\n\n", - counts.c_str(), test_case_name_.c_str(), - internal::StreamableToString(test_case.elapsed_time()).c_str()); - fflush(stdout); -} - -void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( - const UnitTest& /*unit_test*/) { - ColoredPrintf(COLOR_GREEN, "[----------] "); - printf("Global test environment tear-down\n"); - fflush(stdout); -} - -// Internal helper for printing the list of failed tests. -void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { - const int failed_test_count = unit_test.failed_test_count(); - if (failed_test_count == 0) { - return; - } - - for (int i = 0; i < unit_test.total_test_case_count(); ++i) { - const TestCase& test_case = *unit_test.GetTestCase(i); - if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { - continue; - } - for (int j = 0; j < test_case.total_test_count(); ++j) { - const TestInfo& test_info = *test_case.GetTestInfo(j); - if (!test_info.should_run() || test_info.result()->Passed()) { - continue; - } - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s.%s", test_case.name(), test_info.name()); - PrintFullTestCommentIfPresent(test_info); - printf("\n"); - } - } -} - -void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - ColoredPrintf(COLOR_GREEN, "[==========] "); - printf("%s from %s ran.", - FormatTestCount(unit_test.test_to_run_count()).c_str(), - FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); - if (GTEST_FLAG(print_time)) { - printf(" (%s ms total)", - internal::StreamableToString(unit_test.elapsed_time()).c_str()); - } - printf("\n"); - ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); - printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); - - int num_failures = unit_test.failed_test_count(); - if (!unit_test.Passed()) { - const int failed_test_count = unit_test.failed_test_count(); - ColoredPrintf(COLOR_RED, "[ FAILED ] "); - printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); - PrintFailedTests(unit_test); - printf("\n%2d FAILED %s\n", num_failures, - num_failures == 1 ? "TEST" : "TESTS"); - } - - int num_disabled = unit_test.disabled_test_count(); - if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { - if (!num_failures) { - printf("\n"); // Add a spacer if no FAILURE banner is displayed. - } - ColoredPrintf(COLOR_YELLOW, - " YOU HAVE %d DISABLED %s\n\n", - num_disabled, - num_disabled == 1 ? "TEST" : "TESTS"); - } - // Ensure that Google Test output is printed before, e.g., heapchecker output. 
- fflush(stdout); -} - -// End PrettyUnitTestResultPrinter - -// class TestEventRepeater -// -// This class forwards events to other event listeners. -class TestEventRepeater : public TestEventListener { - public: - TestEventRepeater() : forwarding_enabled_(true) {} - virtual ~TestEventRepeater(); - void Append(TestEventListener *listener); - TestEventListener* Release(TestEventListener* listener); - - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled() const { return forwarding_enabled_; } - void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } - - virtual void OnTestProgramStart(const UnitTest& unit_test); - virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); - virtual void OnTestCaseStart(const TestCase& test_case); - virtual void OnTestStart(const TestInfo& test_info); - virtual void OnTestPartResult(const TestPartResult& result); - virtual void OnTestEnd(const TestInfo& test_info); - virtual void OnTestCaseEnd(const TestCase& test_case); - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - virtual void OnTestProgramEnd(const UnitTest& unit_test); - - private: - // Controls whether events will be forwarded to listeners_. Set to false - // in death test child processes. - bool forwarding_enabled_; - // The list of listeners that receive events. - std::vector listeners_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); -}; - -TestEventRepeater::~TestEventRepeater() { - ForEach(listeners_, Delete); -} - -void TestEventRepeater::Append(TestEventListener *listener) { - listeners_.push_back(listener); -} - -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. -TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { - for (size_t i = 0; i < listeners_.size(); ++i) { - if (listeners_[i] == listener) { - listeners_.erase(listeners_.begin() + i); - return listener; - } - } - - return NULL; -} - -// Since most methods are very similar, use macros to reduce boilerplate. -// This defines a member that forwards the call to all listeners. -#define GTEST_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (size_t i = 0; i < listeners_.size(); i++) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} -// This defines a member that forwards the call to all listeners in reverse -// order. 
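// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] A minimal sketch of the
// forwarding pattern the GTEST_(REVERSE_)REPEATER_METHOD_ macros expand to:
// *Start events are forwarded in registration order, the matching *End events
// in reverse order, so listeners observe symmetric nesting (the first listener
// to see a Start is the last to see the corresponding End). DemoListener,
// ForwardStart and ForwardEnd are hypothetical names used only for this sketch.
#include <cstdio>
#include <vector>

struct DemoListener {
  const char* name;
  void OnStart() const { std::printf("%s: start\n", name); }
  void OnEnd() const   { std::printf("%s: end\n", name); }
};

static void ForwardStart(const std::vector<DemoListener>& listeners) {
  for (size_t i = 0; i < listeners.size(); i++)
    listeners[i].OnStart();
}

static void ForwardEnd(const std::vector<DemoListener>& listeners) {
  for (int i = static_cast<int>(listeners.size()) - 1; i >= 0; i--)
    listeners[i].OnEnd();
}

int main() {
  std::vector<DemoListener> listeners;
  const DemoListener printer = { "printer" };
  const DemoListener xml     = { "xml" };
  listeners.push_back(printer);
  listeners.push_back(xml);
  ForwardStart(listeners);  // printer: start, xml: start
  ForwardEnd(listeners);    // xml: end, printer: end
  return 0;
}
// ----------------------------------------------------------------------------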
-#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ -void TestEventRepeater::Name(const Type& parameter) { \ - if (forwarding_enabled_) { \ - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ - listeners_[i]->Name(parameter); \ - } \ - } \ -} - -GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) -GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) -GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) -GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) -GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) -GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) -GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) -GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) -GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) - -#undef GTEST_REPEATER_METHOD_ -#undef GTEST_REVERSE_REPEATER_METHOD_ - -void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (size_t i = 0; i < listeners_.size(); i++) { - listeners_[i]->OnTestIterationStart(unit_test, iteration); - } - } -} - -void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, - int iteration) { - if (forwarding_enabled_) { - for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { - listeners_[i]->OnTestIterationEnd(unit_test, iteration); - } - } -} - -// End TestEventRepeater - -// This class generates an XML output file. -class XmlUnitTestResultPrinter : public EmptyTestEventListener { - public: - explicit XmlUnitTestResultPrinter(const char* output_file); - - virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); - - private: - // Is c a whitespace character that is normalized to a space character - // when it appears in an XML attribute value? - static bool IsNormalizableWhitespace(char c) { - return c == 0x9 || c == 0xA || c == 0xD; - } - - // May c appear in a well-formed XML document? - static bool IsValidXmlCharacter(char c) { - return IsNormalizableWhitespace(c) || c >= 0x20; - } - - // Returns an XML-escaped copy of the input string str. If - // is_attribute is true, the text is meant to appear as an attribute - // value, and normalizable whitespace is preserved by replacing it - // with character references. - static String EscapeXml(const char* str, bool is_attribute); - - // Returns the given string with all characters invalid in XML removed. - static string RemoveInvalidXmlCharacters(const string& str); - - // Convenience wrapper around EscapeXml when str is an attribute value. - static String EscapeXmlAttribute(const char* str) { - return EscapeXml(str, true); - } - - // Convenience wrapper around EscapeXml when str is not an attribute value. - static String EscapeXmlText(const char* str) { return EscapeXml(str, false); } - - // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. - static void OutputXmlCDataSection(::std::ostream* stream, const char* data); - - // Streams an XML representation of a TestInfo object. - static void OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info); - - // Prints an XML representation of a TestCase object - static void PrintXmlTestCase(FILE* out, const TestCase& test_case); - - // Prints an XML summary of unit_test to output stream out. 
- static void PrintXmlUnitTest(FILE* out, const UnitTest& unit_test); - - // Produces a string representing the test properties in a result as space - // delimited XML attributes based on the property key="value" pairs. - // When the String is not empty, it includes a space at the beginning, - // to delimit this attribute from prior attributes. - static String TestPropertiesAsXmlAttributes(const TestResult& result); - - // The output file. - const String output_file_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); -}; - -// Creates a new XmlUnitTestResultPrinter. -XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) - : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); - } -} - -// Called after the unit test ends. -void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, - int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure. - // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } - PrintXmlUnitTest(xmlout, unit_test); - fclose(xmlout); -} - -// Returns an XML-escaped copy of the input string str. If is_attribute -// is true, the text is meant to appear as an attribute value, and -// normalizable whitespace is preserved by replacing it with character -// references. -// -// Invalid XML characters in str, if any, are stripped from the output. -// It is expected that most, if not all, of the text processed by this -// module will consist of ordinary English text. -// If this module is ever modified to produce version 1.1 XML output, -// most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable -// escaping scheme for invalid characters, rather than dropping them. -String XmlUnitTestResultPrinter::EscapeXml(const char* str, bool is_attribute) { - Message m; - - if (str != NULL) { - for (const char* src = str; *src; ++src) { - switch (*src) { - case '<': - m << "<"; - break; - case '>': - m << ">"; - break; - case '&': - m << "&"; - break; - case '\'': - if (is_attribute) - m << "'"; - else - m << '\''; - break; - case '"': - if (is_attribute) - m << """; - else - m << '"'; - break; - default: - if (IsValidXmlCharacter(*src)) { - if (is_attribute && IsNormalizableWhitespace(*src)) - m << String::Format("&#x%02X;", unsigned(*src)); - else - m << *src; - } - break; - } - } - } - - return m.GetString(); -} - -// Returns the given string with all characters invalid in XML removed. -// Currently invalid characters are dropped from the string. An -// alternative is to replace them with certain characters such as . or ?. 
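// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] The entity strings inside
// EscapeXml() above did not survive the capture of this diff; the mapping it
// applies is the standard XML one sketched here. escape_xml_demo is a
// hypothetical name used only for illustration.
#include <string>

static std::string escape_xml_demo(const std::string& in, bool is_attribute) {
  std::string out;
  for (std::string::size_type i = 0; i < in.size(); ++i) {
    switch (in[i]) {
      case '<':  out += "&lt;";   break;
      case '>':  out += "&gt;";   break;
      case '&':  out += "&amp;";  break;
      case '\'': out += is_attribute ? "&apos;" : "'";  break;
      case '"':  out += is_attribute ? "&quot;" : "\""; break;
      default:   out += in[i];    break;
    }
  }
  // e.g. escape_xml_demo("a<b & \"c\"", true) == "a&lt;b &amp; &quot;c&quot;"
  return out;
}
// ----------------------------------------------------------------------------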
-string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(const string& str) { - string output; - output.reserve(str.size()); - for (string::const_iterator it = str.begin(); it != str.end(); ++it) - if (IsValidXmlCharacter(*it)) - output.push_back(*it); - - return output; -} - -// The following routines generate an XML representation of a UnitTest -// object. -// -// This is how Google Test concepts map to the DTD: -// -// <-- corresponds to a UnitTest object -// <-- corresponds to a TestCase object -// <-- corresponds to a TestInfo object -// ... -// ... -// ... -// <-- individual assertion failures -// -// -// - -// Formats the given time in milliseconds as seconds. -std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) { - ::std::stringstream ss; - ss << ms/1000.0; - return ss.str(); -} - -// Streams an XML CDATA section, escaping invalid CDATA sequences as needed. -void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream, - const char* data) { - const char* segment = data; - *stream << ""); - if (next_segment != NULL) { - stream->write( - segment, static_cast(next_segment - segment)); - *stream << "]]>]]>"); - } else { - *stream << segment; - break; - } - } - *stream << "]]>"; -} - -// Prints an XML representation of a TestInfo object. -// TODO(wan): There is also value in printing properties with the plain printer. -void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, - const char* test_case_name, - const TestInfo& test_info) { - const TestResult& result = *test_info.result(); - *stream << " \n"; - *stream << " "; - const string location = internal::FormatCompilerIndependentFileLocation( - part.file_name(), part.line_number()); - const string message = location + "\n" + part.message(); - OutputXmlCDataSection(stream, - RemoveInvalidXmlCharacters(message).c_str()); - *stream << "\n"; - } - } - - if (failures == 0) - *stream << " />\n"; - else - *stream << " \n"; -} - -// Prints an XML representation of a TestCase object -void XmlUnitTestResultPrinter::PrintXmlTestCase(FILE* out, - const TestCase& test_case) { - fprintf(out, - " \n", - FormatTimeInMillisAsSeconds(test_case.elapsed_time()).c_str()); - for (int i = 0; i < test_case.total_test_count(); ++i) { - ::std::stringstream stream; - OutputXmlTestInfo(&stream, test_case.name(), *test_case.GetTestInfo(i)); - fprintf(out, "%s", StringStreamToString(&stream).c_str()); - } - fprintf(out, " \n"); -} - -// Prints an XML summary of unit_test to output stream out. -void XmlUnitTestResultPrinter::PrintXmlUnitTest(FILE* out, - const UnitTest& unit_test) { - fprintf(out, "\n"); - fprintf(out, - "\n"); - for (int i = 0; i < unit_test.total_test_case_count(); ++i) - PrintXmlTestCase(out, *unit_test.GetTestCase(i)); - fprintf(out, "\n"); -} - -// Produces a string representing the test properties in a result as space -// delimited XML attributes based on the property key="value" pairs. -String XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( - const TestResult& result) { - Message attributes; - for (int i = 0; i < result.test_property_count(); ++i) { - const TestProperty& property = result.GetTestProperty(i); - attributes << " " << property.key() << "=" - << "\"" << EscapeXmlAttribute(property.value()) << "\""; - } - return attributes.GetString(); -} - -// End XmlUnitTestResultPrinter - -#if GTEST_CAN_STREAM_RESULTS_ - -// Streams test results to the given port on the given host machine. 
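// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] The XML printer above is
// enabled with --gtest_output=xml:PATH (or the GTEST_OUTPUT environment
// variable). The body of OutputXmlCDataSection() was partly lost in this diff;
// the idea it implements is sketched below: a CDATA section cannot contain the
// literal terminator "]]>", so the data is split at each occurrence and the
// terminator is re-emitted, escaped, between two CDATA sections.
// write_cdata_demo is a hypothetical name used only for this sketch.
#include <cstring>
#include <ostream>

static void write_cdata_demo(std::ostream* stream, const char* data) {
  const char* segment = data;
  *stream << "<![CDATA[";
  for (;;) {
    const char* const next = std::strstr(segment, "]]>");
    if (next != NULL) {
      stream->write(segment, static_cast<std::streamsize>(next - segment));
      *stream << "]]>]]&gt;<![CDATA[";  // close, escape "]]>", reopen
      segment = next + std::strlen("]]>");
    } else {
      *stream << segment;
      break;
    }
  }
  *stream << "]]>";
}
// ----------------------------------------------------------------------------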
-class StreamingListener : public EmptyTestEventListener { - public: - // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); - - StreamingListener(const string& host, const string& port) - : sockfd_(-1), host_name_(host), port_num_(port) { - MakeConnection(); - Send("gtest_streaming_protocol_version=1.0\n"); - } - - virtual ~StreamingListener() { - if (sockfd_ != -1) - CloseConnection(); - } - - void OnTestProgramStart(const UnitTest& /* unit_test */) { - Send("event=TestProgramStart\n"); - } - - void OnTestProgramEnd(const UnitTest& unit_test) { - // Note that Google Test current only report elapsed time for each - // test iteration, not for the entire test program. - Send(String::Format("event=TestProgramEnd&passed=%d\n", - unit_test.Passed())); - - // Notify the streaming server to stop. - CloseConnection(); - } - - void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { - Send(String::Format("event=TestIterationStart&iteration=%d\n", - iteration)); - } - - void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { - Send(String::Format("event=TestIterationEnd&passed=%d&elapsed_time=%sms\n", - unit_test.Passed(), - StreamableToString(unit_test.elapsed_time()).c_str())); - } - - void OnTestCaseStart(const TestCase& test_case) { - Send(String::Format("event=TestCaseStart&name=%s\n", test_case.name())); - } - - void OnTestCaseEnd(const TestCase& test_case) { - Send(String::Format("event=TestCaseEnd&passed=%d&elapsed_time=%sms\n", - test_case.Passed(), - StreamableToString(test_case.elapsed_time()).c_str())); - } - - void OnTestStart(const TestInfo& test_info) { - Send(String::Format("event=TestStart&name=%s\n", test_info.name())); - } - - void OnTestEnd(const TestInfo& test_info) { - Send(String::Format( - "event=TestEnd&passed=%d&elapsed_time=%sms\n", - (test_info.result())->Passed(), - StreamableToString((test_info.result())->elapsed_time()).c_str())); - } - - void OnTestPartResult(const TestPartResult& test_part_result) { - const char* file_name = test_part_result.file_name(); - if (file_name == NULL) - file_name = ""; - Send(String::Format("event=TestPartResult&file=%s&line=%d&message=", - UrlEncode(file_name).c_str(), - test_part_result.line_number())); - Send(UrlEncode(test_part_result.message()) + "\n"); - } - - private: - // Creates a client socket and connects to the server. - void MakeConnection(); - - // Closes the socket. - void CloseConnection() { - GTEST_CHECK_(sockfd_ != -1) - << "CloseConnection() can be called only when there is a connection."; - - close(sockfd_); - sockfd_ = -1; - } - - // Sends a string to the socket. - void Send(const string& message) { - GTEST_CHECK_(sockfd_ != -1) - << "Send() can be called only when there is a connection."; - - const int len = static_cast(message.length()); - if (write(sockfd_, message.c_str(), len) != len) { - GTEST_LOG_(WARNING) - << "stream_result_to: failed to stream to " - << host_name_ << ":" << port_num_; - } - } - - int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); -}; // class StreamingListener - -// Checks if str contains '=', '&', '%' or '\n' characters. If yes, -// replaces them by "%xx" where xx is their hexadecimal value. For -// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) -// in both time and space -- important as the input str may contain an -// arbitrarily long test failure message and stack trace. 
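// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] StreamingListener above
// writes one plain-text "key=value&..." line per test event over a TCP socket.
// It is enabled with --gtest_stream_result_to=HOST:PORT (consumed later in
// ConfigureStreamingOutput()); a trivial line-oriented server (for example,
// nc -l PORT) can capture the stream. The lines printed below simply mirror the
// Send() calls visible in the class and are illustration only.
#include <cstdio>

int main() {
  std::printf("gtest_streaming_protocol_version=1.0\n");
  std::printf("event=TestCaseStart&name=%s\n", "FooTest");
  std::printf("event=TestStart&name=%s\n", "DoesBar");
  std::printf("event=TestEnd&passed=%d&elapsed_time=%sms\n", 1, "12");
  std::printf("event=TestCaseEnd&passed=%d&elapsed_time=%sms\n", 1, "15");
  return 0;
}
// ----------------------------------------------------------------------------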
-string StreamingListener::UrlEncode(const char* str) { - string result; - result.reserve(strlen(str) + 1); - for (char ch = *str; ch != '\0'; ch = *++str) { - switch (ch) { - case '%': - case '=': - case '&': - case '\n': - result.append(String::Format("%%%02x", static_cast(ch))); - break; - default: - result.push_back(ch); - break; - } - } - return result; -} - -void StreamingListener::MakeConnection() { - GTEST_CHECK_(sockfd_ == -1) - << "MakeConnection() can't be called when there is already a connection."; - - addrinfo hints; - memset(&hints, 0, sizeof(hints)); - hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. - hints.ai_socktype = SOCK_STREAM; - addrinfo* servinfo = NULL; - - // Use the getaddrinfo() to get a linked list of IP addresses for - // the given host name. - const int error_num = getaddrinfo( - host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); - if (error_num != 0) { - GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " - << gai_strerror(error_num); - } - - // Loop through all the results and connect to the first we can. - for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; - cur_addr = cur_addr->ai_next) { - sockfd_ = socket( - cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); - if (sockfd_ != -1) { - // Connect the client socket to the server socket. - if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { - close(sockfd_); - sockfd_ = -1; - } - } - } - - freeaddrinfo(servinfo); // all done with this structure - - if (sockfd_ == -1) { - GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " - << host_name_ << ":" << port_num_; - } -} - -// End of class Streaming Listener -#endif // GTEST_CAN_STREAM_RESULTS__ - -// Class ScopedTrace - -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -// L < UnitTest::mutex_ -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); - - UnitTest::GetInstance()->PushGTestTrace(trace); -} - -// Pops the info pushed by the c'tor. -// L < UnitTest::mutex_ -ScopedTrace::~ScopedTrace() { - UnitTest::GetInstance()->PopGTestTrace(); -} - - -// class OsStackTraceGetter - -// Returns the current OS stack trace as a String. Parameters: -// -// max_depth - the maximum number of stack frames to be included -// in the trace. -// skip_count - the number of top frames to be skipped; doesn't count -// against max_depth. -// -// L < mutex_ -// We use "L < mutex_" to denote that the function may acquire mutex_. -String OsStackTraceGetter::CurrentStackTrace(int, int) { - return String(""); -} - -// L < mutex_ -void OsStackTraceGetter::UponLeavingGTest() { -} - -const char* const -OsStackTraceGetter::kElidedFramesMarker = - "... " GTEST_NAME_ " internal frames ..."; - -} // namespace internal - -// class TestEventListeners - -TestEventListeners::TestEventListeners() - : repeater_(new internal::TestEventRepeater()), - default_result_printer_(NULL), - default_xml_generator_(NULL) { -} - -TestEventListeners::~TestEventListeners() { delete repeater_; } - -// Returns the standard listener responsible for the default console -// output. Can be removed from the listeners list to shut down default -// console output. Note that removing this object from the listener list -// with Release transfers its ownership to the user. 
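// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] Typical, documented use of
// the TestEventListeners API defined here: Release() hands ownership of the
// default console printer back to the caller (which then deletes it), and
// Append() installs a custom listener that the UnitTest singleton owns from
// then on. MinimalPrinter is a hypothetical example class; the header path may
// differ in this source tree.
#include <cstdio>
#include "gtest/gtest.h"

class MinimalPrinter : public ::testing::EmptyTestEventListener {
  virtual void OnTestEnd(const ::testing::TestInfo& test_info) {
    std::printf("%s.%s %s\n", test_info.test_case_name(), test_info.name(),
                test_info.result()->Passed() ? "ok" : "FAILED");
  }
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  ::testing::TestEventListeners& listeners =
      ::testing::UnitTest::GetInstance()->listeners();
  delete listeners.Release(listeners.default_result_printer());
  listeners.Append(new MinimalPrinter);  // owned by UnitTest from here on
  return RUN_ALL_TESTS();
}
// ----------------------------------------------------------------------------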
-void TestEventListeners::Append(TestEventListener* listener) { - repeater_->Append(listener); -} - -// Removes the given event listener from the list and returns it. It then -// becomes the caller's responsibility to delete the listener. Returns -// NULL if the listener is not found in the list. -TestEventListener* TestEventListeners::Release(TestEventListener* listener) { - if (listener == default_result_printer_) - default_result_printer_ = NULL; - else if (listener == default_xml_generator_) - default_xml_generator_ = NULL; - return repeater_->Release(listener); -} - -// Returns repeater that broadcasts the TestEventListener events to all -// subscribers. -TestEventListener* TestEventListeners::repeater() { return repeater_; } - -// Sets the default_result_printer attribute to the provided listener. -// The listener is also added to the listener list and previous -// default_result_printer is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { - if (default_result_printer_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_result_printer_); - default_result_printer_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Sets the default_xml_generator attribute to the provided listener. The -// listener is also added to the listener list and previous -// default_xml_generator is removed from it and deleted. The listener can -// also be NULL in which case it will not be added to the list. Does -// nothing if the previous and the current listener objects are the same. -void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { - if (default_xml_generator_ != listener) { - // It is an error to pass this method a listener that is already in the - // list. - delete Release(default_xml_generator_); - default_xml_generator_ = listener; - if (listener != NULL) - Append(listener); - } -} - -// Controls whether events will be forwarded by the repeater to the -// listeners in the list. -bool TestEventListeners::EventForwardingEnabled() const { - return repeater_->forwarding_enabled(); -} - -void TestEventListeners::SuppressEventForwarding() { - repeater_->set_forwarding_enabled(false); -} - -// class UnitTest - -// Gets the singleton UnitTest object. The first time this method is -// called, a UnitTest object is constructed and returned. Consecutive -// calls will return the same object. -// -// We don't protect this under mutex_ as a user is not supposed to -// call this before main() starts, from which point on the return -// value will never change. -UnitTest * UnitTest::GetInstance() { - // When compiled with MSVC 7.1 in optimized mode, destroying the - // UnitTest object upon exiting the program messes up the exit code, - // causing successful tests to appear failed. We have to use a - // different implementation in this case to bypass the compiler bug. - // This implementation makes the compiler happy, at the cost of - // leaking the UnitTest object. - - // CodeGear C++Builder insists on a public destructor for the - // default implementation. Use this implementation to keep good OO - // design with private destructor. 
- -#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) - static UnitTest* const instance = new UnitTest; - return instance; -#else - static UnitTest instance; - return &instance; -#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) -} - -// Gets the number of successful test cases. -int UnitTest::successful_test_case_count() const { - return impl()->successful_test_case_count(); -} - -// Gets the number of failed test cases. -int UnitTest::failed_test_case_count() const { - return impl()->failed_test_case_count(); -} - -// Gets the number of all test cases. -int UnitTest::total_test_case_count() const { - return impl()->total_test_case_count(); -} - -// Gets the number of all test cases that contain at least one test -// that should run. -int UnitTest::test_case_to_run_count() const { - return impl()->test_case_to_run_count(); -} - -// Gets the number of successful tests. -int UnitTest::successful_test_count() const { - return impl()->successful_test_count(); -} - -// Gets the number of failed tests. -int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } - -// Gets the number of disabled tests. -int UnitTest::disabled_test_count() const { - return impl()->disabled_test_count(); -} - -// Gets the number of all tests. -int UnitTest::total_test_count() const { return impl()->total_test_count(); } - -// Gets the number of tests that should run. -int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } - -// Gets the elapsed time, in milliseconds. -internal::TimeInMillis UnitTest::elapsed_time() const { - return impl()->elapsed_time(); -} - -// Returns true iff the unit test passed (i.e. all test cases passed). -bool UnitTest::Passed() const { return impl()->Passed(); } - -// Returns true iff the unit test failed (i.e. some test case failed -// or something outside of all tests failed). -bool UnitTest::Failed() const { return impl()->Failed(); } - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -const TestCase* UnitTest::GetTestCase(int i) const { - return impl()->GetTestCase(i); -} - -// Gets the i-th test case among all the test cases. i can range from 0 to -// total_test_case_count() - 1. If i is not in that range, returns NULL. -TestCase* UnitTest::GetMutableTestCase(int i) { - return impl()->GetMutableTestCase(i); -} - -// Returns the list of event listeners that can be used to track events -// inside Google Test. -TestEventListeners& UnitTest::listeners() { - return *impl()->listeners(); -} - -// Registers and returns a global test environment. When a test -// program is run, all global test environments will be set-up in the -// order they were registered. After all tests in the program have -// finished, all global test environments will be torn-down in the -// *reverse* order they were registered. -// -// The UnitTest object takes ownership of the given environment. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -Environment* UnitTest::AddEnvironment(Environment* env) { - if (env == NULL) { - return NULL; - } - - impl_->environments().push_back(env); - return env; -} - -// Adds a TestPartResult to the current TestResult object. All Google Test -// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call -// this to report their results. The user code should use the -// assertion macros instead of calling this directly. 
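// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] AddEnvironment() above is
// normally reached through the documented ::testing::AddGlobalTestEnvironment()
// helper. Environments are set up in registration order before the first test
// and torn down in reverse order after the last one. DatabaseEnv is a
// hypothetical example; the header path may differ in this source tree.
#include "gtest/gtest.h"

class DatabaseEnv : public ::testing::Environment {
 public:
  virtual void SetUp()    { /* e.g. open a connection shared by every test */ }
  virtual void TearDown() { /* runs once, after the last test finishes */ }
};

// The UnitTest singleton takes ownership of the pointer; do not delete it.
static ::testing::Environment* const g_db_env =
    ::testing::AddGlobalTestEnvironment(new DatabaseEnv);
// ----------------------------------------------------------------------------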
-// L < mutex_ -void UnitTest::AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, - const internal::String& message, - const internal::String& os_stack_trace) { - Message msg; - msg << message; - - internal::MutexLock lock(&mutex_); - if (impl_->gtest_trace_stack().size() > 0) { - msg << "\n" << GTEST_NAME_ << " trace:"; - - for (int i = static_cast(impl_->gtest_trace_stack().size()); - i > 0; --i) { - const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1]; - msg << "\n" << internal::FormatFileLocation(trace.file, trace.line) - << " " << trace.message; - } - } - - if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) { - msg << internal::kStackTraceMarker << os_stack_trace; - } - - const TestPartResult result = - TestPartResult(result_type, file_name, line_number, - msg.GetString().c_str()); - impl_->GetTestPartResultReporterForCurrentThread()-> - ReportTestPartResult(result); - - if (result_type != TestPartResult::kSuccess) { - // gtest_break_on_failure takes precedence over - // gtest_throw_on_failure. This allows a user to set the latter - // in the code (perhaps in order to use Google Test assertions - // with another testing framework) and specify the former on the - // command line for debugging. - if (GTEST_FLAG(break_on_failure)) { -#if GTEST_OS_WINDOWS - // Using DebugBreak on Windows allows gtest to still break into a debugger - // when a failure happens and both the --gtest_break_on_failure and - // the --gtest_catch_exceptions flags are specified. - DebugBreak(); -#else - // Dereference NULL through a volatile pointer to prevent the compiler - // from removing. We use this rather than abort() or __builtin_trap() for - // portability: Symbian doesn't implement abort() well, and some debuggers - // don't correctly trap abort(). - *static_cast(NULL) = 1; -#endif // GTEST_OS_WINDOWS - } else if (GTEST_FLAG(throw_on_failure)) { -#if GTEST_HAS_EXCEPTIONS - throw GoogleTestFailureException(result); -#else - // We cannot call abort() as it generates a pop-up in debug mode - // that cannot be suppressed in VC 7.1 or below. - exit(1); -#endif - } - } -} - -// Creates and adds a property to the current TestResult. If a property matching -// the supplied value already exists, updates its value instead. -void UnitTest::RecordPropertyForCurrentTest(const char* key, - const char* value) { - const TestProperty test_property(key, value); - impl_->current_test_result()->RecordProperty(test_property); -} - -// Runs all tests in this UnitTest object and prints the result. -// Returns 0 if successful, or 1 otherwise. -// -// We don't protect this under mutex_, as we only support calling it -// from the main thread. -int UnitTest::Run() { - // Captures the value of GTEST_FLAG(catch_exceptions). This value will be - // used for the duration of the program. - impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); - -#if GTEST_HAS_SEH - const bool in_death_test_child_process = - internal::GTEST_FLAG(internal_run_death_test).length() > 0; - - // Either the user wants Google Test to catch exceptions thrown by the - // tests or this is executing in the context of death test child - // process. In either case the user does not want to see pop-up dialogs - // about crashes - they are expected. - if (impl()->catch_exceptions() || in_death_test_child_process) { - -# if !GTEST_OS_WINDOWS_MOBILE - // SetErrorMode doesn't exist on CE. 
- SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT | - SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX); -# endif // !GTEST_OS_WINDOWS_MOBILE - -# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE - // Death test children can be terminated with _abort(). On Windows, - // _abort() can show a dialog with a warning message. This forces the - // abort message to go to stderr instead. - _set_error_mode(_OUT_TO_STDERR); -# endif - -# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE - // In the debug version, Visual Studio pops up a separate dialog - // offering a choice to debug the aborted program. We need to suppress - // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement - // executed. Google Test will notify the user of any unexpected - // failure via stderr. - // - // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. - // Users of prior VC versions shall suffer the agony and pain of - // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the - // debug mode when compiled with VC 7.1 or lower. - if (!GTEST_FLAG(break_on_failure)) - _set_abort_behavior( - 0x0, // Clear the following flags: - _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. -# endif - - } -#endif // GTEST_HAS_SEH - - return internal::HandleExceptionsInMethodIfSupported( - impl(), - &internal::UnitTestImpl::RunAllTests, - "auxiliary test code (environments or event listeners)") ? 0 : 1; -} - -// Returns the working directory when the first TEST() or TEST_F() was -// executed. -const char* UnitTest::original_working_dir() const { - return impl_->original_working_dir_.c_str(); -} - -// Returns the TestCase object for the test that's currently running, -// or NULL if no test is running. -// L < mutex_ -const TestCase* UnitTest::current_test_case() const { - internal::MutexLock lock(&mutex_); - return impl_->current_test_case(); -} - -// Returns the TestInfo object for the test that's currently running, -// or NULL if no test is running. -// L < mutex_ -const TestInfo* UnitTest::current_test_info() const { - internal::MutexLock lock(&mutex_); - return impl_->current_test_info(); -} - -// Returns the random seed used at the start of the current test run. -int UnitTest::random_seed() const { return impl_->random_seed(); } - -#if GTEST_HAS_PARAM_TEST -// Returns ParameterizedTestCaseRegistry object used to keep track of -// value-parameterized tests and instantiate and register them. -// L < mutex_ -internal::ParameterizedTestCaseRegistry& - UnitTest::parameterized_test_registry() { - return impl_->parameterized_test_registry(); -} -#endif // GTEST_HAS_PARAM_TEST - -// Creates an empty UnitTest. -UnitTest::UnitTest() { - impl_ = new internal::UnitTestImpl(this); -} - -// Destructor of UnitTest. -UnitTest::~UnitTest() { - delete impl_; -} - -// Pushes a trace defined by SCOPED_TRACE() on to the per-thread -// Google Test trace stack. -// L < mutex_ -void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().push_back(trace); -} - -// Pops a trace from the per-thread Google Test trace stack. -// L < mutex_ -void UnitTest::PopGTestTrace() { - internal::MutexLock lock(&mutex_); - impl_->gtest_trace_stack().pop_back(); -} - -namespace internal { - -UnitTestImpl::UnitTestImpl(UnitTest* parent) - : parent_(parent), -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. 
-# pragma warning(disable:4355) // Temporarily disables warning 4355 - // (using this in initializer). - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -# pragma warning(pop) // Restores the warning state again. -#else - default_global_test_part_result_reporter_(this), - default_per_thread_test_part_result_reporter_(this), -#endif // _MSC_VER - global_test_part_result_repoter_( - &default_global_test_part_result_reporter_), - per_thread_test_part_result_reporter_( - &default_per_thread_test_part_result_reporter_), -#if GTEST_HAS_PARAM_TEST - parameterized_test_registry_(), - parameterized_tests_registered_(false), -#endif // GTEST_HAS_PARAM_TEST - last_death_test_case_(-1), - current_test_case_(NULL), - current_test_info_(NULL), - ad_hoc_test_result_(), - os_stack_trace_getter_(NULL), - post_flag_parse_init_performed_(false), - random_seed_(0), // Will be overridden by the flag before first use. - random_(0), // Will be reseeded before first use. - elapsed_time_(0), -#if GTEST_HAS_DEATH_TEST - internal_run_death_test_flag_(NULL), - death_test_factory_(new DefaultDeathTestFactory), -#endif - // Will be overridden by the flag before first use. - catch_exceptions_(false) { - listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter); -} - -UnitTestImpl::~UnitTestImpl() { - // Deletes every TestCase. - ForEach(test_cases_, internal::Delete); - - // Deletes every Environment. - ForEach(environments_, internal::Delete); - - delete os_stack_trace_getter_; -} - -#if GTEST_HAS_DEATH_TEST -// Disables event forwarding if the control is currently in a death test -// subprocess. Must not be called before InitGoogleTest. -void UnitTestImpl::SuppressTestEventsIfInSubprocess() { - if (internal_run_death_test_flag_.get() != NULL) - listeners()->SuppressEventForwarding(); -} -#endif // GTEST_HAS_DEATH_TEST - -// Initializes event listeners performing XML output as specified by -// UnitTestOptions. Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureXmlOutput() { - const String& output_format = UnitTestOptions::GetOutputFormat(); - if (output_format == "xml") { - listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( - UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); - } else if (output_format != "") { - printf("WARNING: unrecognized output format \"%s\" ignored.\n", - output_format.c_str()); - fflush(stdout); - } -} - -#if GTEST_CAN_STREAM_RESULTS_ -// Initializes event listeners for streaming test results in String form. -// Must not be called before InitGoogleTest. -void UnitTestImpl::ConfigureStreamingOutput() { - const string& target = GTEST_FLAG(stream_result_to); - if (!target.empty()) { - const size_t pos = target.find(':'); - if (pos != string::npos) { - listeners()->Append(new StreamingListener(target.substr(0, pos), - target.substr(pos+1))); - } else { - printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", - target.c_str()); - fflush(stdout); - } - } -} -#endif // GTEST_CAN_STREAM_RESULTS_ - -// Performs initialization dependent upon flag values obtained in -// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to -// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest -// this function is also called from RunAllTests. Since this function can be -// called more than once, it has to be idempotent. -void UnitTestImpl::PostFlagParsingInit() { - // Ensures that this function does not execute more than once. 
- if (!post_flag_parse_init_performed_) { - post_flag_parse_init_performed_ = true; - -#if GTEST_HAS_DEATH_TEST - InitDeathTestSubprocessControlInfo(); - SuppressTestEventsIfInSubprocess(); -#endif // GTEST_HAS_DEATH_TEST - - // Registers parameterized tests. This makes parameterized tests - // available to the UnitTest reflection API without running - // RUN_ALL_TESTS. - RegisterParameterizedTests(); - - // Configures listeners for XML output. This makes it possible for users - // to shut down the default XML output before invoking RUN_ALL_TESTS. - ConfigureXmlOutput(); - -#if GTEST_CAN_STREAM_RESULTS_ - // Configures listeners for streaming test results to the specified server. - ConfigureStreamingOutput(); -#endif // GTEST_CAN_STREAM_RESULTS_ - } -} - -// A predicate that checks the name of a TestCase against a known -// value. -// -// This is used for implementation of the UnitTest class only. We put -// it in the anonymous namespace to prevent polluting the outer -// namespace. -// -// TestCaseNameIs is copyable. -class TestCaseNameIs { - public: - // Constructor. - explicit TestCaseNameIs(const String& name) - : name_(name) {} - - // Returns true iff the name of test_case matches name_. - bool operator()(const TestCase* test_case) const { - return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; - } - - private: - String name_; -}; - -// Finds and returns a TestCase with the given name. If one doesn't -// exist, creates one and returns it. It's the CALLER'S -// RESPONSIBILITY to ensure that this function is only called WHEN THE -// TESTS ARE NOT SHUFFLED. -// -// Arguments: -// -// test_case_name: name of the test case -// type_param: the name of the test case's type parameter, or NULL if -// this is not a typed or a type-parameterized test case. -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, - const char* type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc) { - // Can we find a TestCase with the given name? - const std::vector::const_iterator test_case = - std::find_if(test_cases_.begin(), test_cases_.end(), - TestCaseNameIs(test_case_name)); - - if (test_case != test_cases_.end()) - return *test_case; - - // No. Let's create one. - TestCase* const new_test_case = - new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); - - // Is this a death test case? - if (internal::UnitTestOptions::MatchesFilter(String(test_case_name), - kDeathTestCaseFilter)) { - // Yes. Inserts the test case after the last death test case - // defined so far. This only works when the test cases haven't - // been shuffled. Otherwise we may end up running a death test - // after a non-death test. - ++last_death_test_case_; - test_cases_.insert(test_cases_.begin() + last_death_test_case_, - new_test_case); - } else { - // No. Appends to the end of the list. - test_cases_.push_back(new_test_case); - } - - test_case_indices_.push_back(static_cast(test_case_indices_.size())); - return new_test_case; -} - -// Helpers for setting up / tearing down the given environment. They -// are for use in the ForEach() function. -static void SetUpEnvironment(Environment* env) { env->SetUp(); } -static void TearDownEnvironment(Environment* env) { env->TearDown(); } - -// Runs all tests in this UnitTest object, prints the result, and -// returns true if all tests are successful. 
If any exception is -// thrown during a test, the test is considered to be failed, but the -// rest of the tests will still be run. -// -// When parameterized tests are enabled, it expands and registers -// parameterized tests first in RegisterParameterizedTests(). -// All other functions called from RunAllTests() may safely assume that -// parameterized tests are ready to be counted and run. -bool UnitTestImpl::RunAllTests() { - // Makes sure InitGoogleTest() was called. - if (!GTestIsInitialized()) { - printf("%s", - "\nThis test program did NOT call ::testing::InitGoogleTest " - "before calling RUN_ALL_TESTS(). Please fix it.\n"); - return false; - } - - // Do not run any test if the --help flag was specified. - if (g_help_flag) - return true; - - // Repeats the call to the post-flag parsing initialization in case the - // user didn't call InitGoogleTest. - PostFlagParsingInit(); - - // Even if sharding is not on, test runners may want to use the - // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding - // protocol. - internal::WriteToShardStatusFileIfNeeded(); - - // True iff we are in a subprocess for running a thread-safe-style - // death test. - bool in_subprocess_for_death_test = false; - -#if GTEST_HAS_DEATH_TEST - in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); -#endif // GTEST_HAS_DEATH_TEST - - const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, - in_subprocess_for_death_test); - - // Compares the full test names with the filter to decide which - // tests to run. - const bool has_tests_to_run = FilterTests(should_shard - ? HONOR_SHARDING_PROTOCOL - : IGNORE_SHARDING_PROTOCOL) > 0; - - // Lists the tests and exits if the --gtest_list_tests flag was specified. - if (GTEST_FLAG(list_tests)) { - // This must be called *after* FilterTests() has been called. - ListTestsMatchingFilter(); - return true; - } - - random_seed_ = GTEST_FLAG(shuffle) ? - GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; - - // True iff at least one test has failed. - bool failed = false; - - TestEventListener* repeater = listeners()->repeater(); - - repeater->OnTestProgramStart(*parent_); - - // How many times to repeat the tests? We don't want to repeat them - // when we are inside the subprocess of a death test. - const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); - // Repeats forever if the repeat count is negative. - const bool forever = repeat < 0; - for (int i = 0; forever || i != repeat; i++) { - // We want to preserve failures generated by ad-hoc test - // assertions executed before RUN_ALL_TESTS(). - ClearNonAdHocTestResult(); - - const TimeInMillis start = GetTimeInMillis(); - - // Shuffles test cases and tests if requested. - if (has_tests_to_run && GTEST_FLAG(shuffle)) { - random()->Reseed(random_seed_); - // This should be done before calling OnTestIterationStart(), - // such that a test event listener can see the actual test order - // in the event. - ShuffleTests(); - } - - // Tells the unit test event listeners that the tests are about to start. - repeater->OnTestIterationStart(*parent_, i); - - // Runs each test case if there is at least one test to run. - if (has_tests_to_run) { - // Sets up all environments beforehand. - repeater->OnEnvironmentsSetUpStart(*parent_); - ForEach(environments_, SetUpEnvironment); - repeater->OnEnvironmentsSetUpEnd(*parent_); - - // Runs the tests only if there was no fatal failure during global - // set-up. 
- if (!Test::HasFatalFailure()) { - for (int test_index = 0; test_index < total_test_case_count(); - test_index++) { - GetMutableTestCase(test_index)->Run(); - } - } - - // Tears down all environments in reverse order afterwards. - repeater->OnEnvironmentsTearDownStart(*parent_); - std::for_each(environments_.rbegin(), environments_.rend(), - TearDownEnvironment); - repeater->OnEnvironmentsTearDownEnd(*parent_); - } - - elapsed_time_ = GetTimeInMillis() - start; - - // Tells the unit test event listener that the tests have just finished. - repeater->OnTestIterationEnd(*parent_, i); - - // Gets the result and clears it. - if (!Passed()) { - failed = true; - } - - // Restores the original test order after the iteration. This - // allows the user to quickly repro a failure that happens in the - // N-th iteration without repeating the first (N - 1) iterations. - // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in - // case the user somehow changes the value of the flag somewhere - // (it's always safe to unshuffle the tests). - UnshuffleTests(); - - if (GTEST_FLAG(shuffle)) { - // Picks a new random seed for each iteration. - random_seed_ = GetNextRandomSeed(random_seed_); - } - } - - repeater->OnTestProgramEnd(*parent_); - - return !failed; -} - -// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file -// if the variable is present. If a file already exists at this location, this -// function will write over it. If the variable is present, but the file cannot -// be created, prints an error and exits. -void WriteToShardStatusFileIfNeeded() { - const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); - if (test_shard_file != NULL) { - FILE* const file = posix::FOpen(test_shard_file, "w"); - if (file == NULL) { - ColoredPrintf(COLOR_RED, - "Could not write to the test shard status file \"%s\" " - "specified by the %s environment variable.\n", - test_shard_file, kTestShardStatusFile); - fflush(stdout); - exit(EXIT_FAILURE); - } - fclose(file); - } -} - -// Checks whether sharding is enabled by examining the relevant -// environment variable values. If the variables are present, -// but inconsistent (i.e., shard_index >= total_shards), prints -// an error and exits. If in_subprocess_for_death_test, sharding is -// disabled because it must only be applied to the original test -// process. Otherwise, we could filter out death tests we intended to execute. 
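// ----------------------------------------------------------------------------
// [Editorial aside, not part of the deleted file] Sharding is driven by the
// GTEST_TOTAL_SHARDS / GTEST_SHARD_INDEX environment variables checked below;
// a test runner launches one copy of the binary per shard, for example
//   GTEST_TOTAL_SHARDS=3 GTEST_SHARD_INDEX=0 ./my_test   (and likewise 1, 2).
// Runnable tests are assigned round-robin, the same rule ShouldRunTestOnShard()
// applies below. Hypothetical demo only:
#include <cstdio>

int main() {
  const int total_shards = 3;
  for (int test_id = 0; test_id < 7; ++test_id) {
    const int shard = test_id % total_shards;  // as in ShouldRunTestOnShard()
    std::printf("test %d runs on shard %d\n", test_id, shard);
  }
  return 0;
}
// ----------------------------------------------------------------------------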
-bool ShouldShard(const char* total_shards_env, - const char* shard_index_env, - bool in_subprocess_for_death_test) { - if (in_subprocess_for_death_test) { - return false; - } - - const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); - const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); - - if (total_shards == -1 && shard_index == -1) { - return false; - } else if (total_shards == -1 && shard_index != -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestShardIndex << " = " << shard_index - << ", but have left " << kTestTotalShards << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (total_shards != -1 && shard_index == -1) { - const Message msg = Message() - << "Invalid environment variables: you have " - << kTestTotalShards << " = " << total_shards - << ", but have left " << kTestShardIndex << " unset.\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } else if (shard_index < 0 || shard_index >= total_shards) { - const Message msg = Message() - << "Invalid environment variables: we require 0 <= " - << kTestShardIndex << " < " << kTestTotalShards - << ", but you have " << kTestShardIndex << "=" << shard_index - << ", " << kTestTotalShards << "=" << total_shards << ".\n"; - ColoredPrintf(COLOR_RED, msg.GetString().c_str()); - fflush(stdout); - exit(EXIT_FAILURE); - } - - return total_shards > 1; -} - -// Parses the environment variable var as an Int32. If it is unset, -// returns default_val. If it is not an Int32, prints an error -// and aborts. -Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { - const char* str_val = posix::GetEnv(var); - if (str_val == NULL) { - return default_val; - } - - Int32 result; - if (!ParseInt32(Message() << "The value of environment variable " << var, - str_val, &result)) { - exit(EXIT_FAILURE); - } - return result; -} - -// Given the total number of shards, the shard index, and the test id, -// returns true iff the test should be run on this shard. The test id is -// some arbitrary but unique non-negative integer assigned to each test -// method. Assumes that 0 <= shard_index < total_shards. -bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { - return (test_id % total_shards) == shard_index; -} - -// Compares the name of each test with the user-specified filter to -// decide whether the test should be run, then records the result in -// each TestCase and TestInfo object. -// If shard_tests == true, further filters tests based on sharding -// variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. -int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { - const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestTotalShards, -1) : -1; - const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? - Int32FromEnvOrDie(kTestShardIndex, -1) : -1; - - // num_runnable_tests are the number of tests that will - // run across all shards (i.e., match filter and are not disabled). - // num_selected_tests are the number of tests to be run on - // this shard. 
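
// [Illustrative sketch -- not part of the original source] The filtering
// described above compares each full test name ("TestCaseName.TestName")
// against --gtest_filter patterns: '?' matches one character, '*' matches any
// substring, ':' separates patterns, and an optional '-' introduces negative
// patterns. Below is a minimal stand-in for the single-pattern match only;
// splitting on ':' and '-' is omitted for brevity.
#include <cstdio>

static bool PatternMatches(const char* pattern, const char* name) {
  switch (*pattern) {
    case '\0': return *name == '\0';
    case '?':  return *name != '\0' && PatternMatches(pattern + 1, name + 1);
    case '*':  return (*name != '\0' && PatternMatches(pattern, name + 1)) ||
                      PatternMatches(pattern + 1, name);
    default:   return *pattern == *name &&
                      PatternMatches(pattern + 1, name + 1);
  }
}

int main() {
  std::printf("%d\n", PatternMatches("FooTest.*", "FooTest.Bar"));          // 1
  std::printf("%d\n", PatternMatches("*DISABLED_*", "Foo.DISABLED_Baz"));   // 1
  std::printf("%d\n", PatternMatches("FooTest.?ar", "FooTest.Car"));        // 1
  std::printf("%d\n", PatternMatches("FooTest.*", "BarTest.Foo"));          // 0
  return 0;
}
// [end of sketch]
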
- int num_runnable_tests = 0; - int num_selected_tests = 0; - for (size_t i = 0; i < test_cases_.size(); i++) { - TestCase* const test_case = test_cases_[i]; - const String &test_case_name = test_case->name(); - test_case->set_should_run(false); - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - TestInfo* const test_info = test_case->test_info_list()[j]; - const String test_name(test_info->name()); - // A test is disabled if test case name or test name matches - // kDisableTestFilter. - const bool is_disabled = - internal::UnitTestOptions::MatchesFilter(test_case_name, - kDisableTestFilter) || - internal::UnitTestOptions::MatchesFilter(test_name, - kDisableTestFilter); - test_info->is_disabled_ = is_disabled; - - const bool matches_filter = - internal::UnitTestOptions::FilterMatchesTest(test_case_name, - test_name); - test_info->matches_filter_ = matches_filter; - - const bool is_runnable = - (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && - matches_filter; - - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); - - num_runnable_tests += is_runnable; - num_selected_tests += is_selected; - - test_info->should_run_ = is_selected; - test_case->set_should_run(test_case->should_run() || is_selected); - } - } - return num_selected_tests; -} - -// Prints the names of the tests matching the user-specified filter flag. -void UnitTestImpl::ListTestsMatchingFilter() { - for (size_t i = 0; i < test_cases_.size(); i++) { - const TestCase* const test_case = test_cases_[i]; - bool printed_test_case_name = false; - - for (size_t j = 0; j < test_case->test_info_list().size(); j++) { - const TestInfo* const test_info = - test_case->test_info_list()[j]; - if (test_info->matches_filter_) { - if (!printed_test_case_name) { - printed_test_case_name = true; - printf("%s.\n", test_case->name()); - } - printf(" %s\n", test_info->name()); - } - } - } - fflush(stdout); -} - -// Sets the OS stack trace getter. -// -// Does nothing if the input and the current OS stack trace getter are -// the same; otherwise, deletes the old getter and makes the input the -// current getter. -void UnitTestImpl::set_os_stack_trace_getter( - OsStackTraceGetterInterface* getter) { - if (os_stack_trace_getter_ != getter) { - delete os_stack_trace_getter_; - os_stack_trace_getter_ = getter; - } -} - -// Returns the current OS stack trace getter if it is not NULL; -// otherwise, creates an OsStackTraceGetter, makes it the current -// getter, and returns it. -OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { - if (os_stack_trace_getter_ == NULL) { - os_stack_trace_getter_ = new OsStackTraceGetter; - } - - return os_stack_trace_getter_; -} - -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. -TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; -} - -// Shuffles all test cases, and the tests within each test case, -// making sure that death tests are still run first. -void UnitTestImpl::ShuffleTests() { - // Shuffles the death test cases. - ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); - - // Shuffles the non-death test cases. - ShuffleRange(random(), last_death_test_case_ + 1, - static_cast(test_cases_.size()), &test_case_indices_); - - // Shuffles the tests inside each test case. 
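
// [Illustrative sketch -- not part of the original source] The shuffle /
// unshuffle pair above boils down to: permute an index vector with a seeded
// RNG (so --gtest_random_seed makes the order reproducible), then restore the
// identity permutation afterwards. A standalone illustration, not Google
// Test's ShuffleRange():
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <vector>

static void SeededShuffle(unsigned int seed, std::vector<int>* indices) {
  std::srand(seed);
  // Fisher-Yates: swap each position with a not-yet-fixed position.
  for (int i = static_cast<int>(indices->size()) - 1; i > 0; --i) {
    const int j = std::rand() % (i + 1);
    std::swap((*indices)[i], (*indices)[j]);
  }
}

int main() {
  std::vector<int> order;
  for (int i = 0; i < 5; ++i) order.push_back(i);
  SeededShuffle(12345, &order);                       // Same seed, same order.
  for (size_t i = 0; i < order.size(); ++i) std::printf("%d ", order[i]);
  std::printf("\n");
  // "Unshuffle": reset to the identity order, as UnshuffleTests() does.
  for (size_t i = 0; i < order.size(); ++i) order[i] = static_cast<int>(i);
  return 0;
}
// [end of sketch]
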
- for (size_t i = 0; i < test_cases_.size(); i++) { - test_cases_[i]->ShuffleTests(random()); - } -} - -// Restores the test cases and tests to their order before the first shuffle. -void UnitTestImpl::UnshuffleTests() { - for (size_t i = 0; i < test_cases_.size(); i++) { - // Unshuffles the tests in each test case. - test_cases_[i]->UnshuffleTests(); - // Resets the index of each test case. - test_case_indices_[i] = static_cast(i); - } -} - -// Returns the current OS stack trace as a String. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -String GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, - int skip_count) { - // We pass skip_count + 1 to skip this wrapper function in addition - // to what the user really wants to skip. - return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); -} - -// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to -// suppress unreachable code warnings. -namespace { -class ClassUniqueToAlwaysTrue {}; -} - -bool IsTrue(bool condition) { return condition; } - -bool AlwaysTrue() { -#if GTEST_HAS_EXCEPTIONS - // This condition is always false so AlwaysTrue() never actually throws, - // but it makes the compiler think that it may throw. - if (IsTrue(false)) - throw ClassUniqueToAlwaysTrue(); -#endif // GTEST_HAS_EXCEPTIONS - return true; -} - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -bool SkipPrefix(const char* prefix, const char** pstr) { - const size_t prefix_len = strlen(prefix); - if (strncmp(*pstr, prefix, prefix_len) == 0) { - *pstr += prefix_len; - return true; - } - return false; -} - -// Parses a string as a command line flag. The string should have -// the format "--flag=value". When def_optional is true, the "=value" -// part can be omitted. -// -// Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { - // str and flag must not be NULL. - if (str == NULL || flag == NULL) return NULL; - - // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. - const String flag_str = String::Format("--%s%s", GTEST_FLAG_PREFIX_, flag); - const size_t flag_len = flag_str.length(); - if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; - - // Skips the flag name. - const char* flag_end = str + flag_len; - - // When def_optional is true, it's OK to not have a "=value" part. - if (def_optional && (flag_end[0] == '\0')) { - return flag_end; - } - - // If def_optional is true and there are more characters after the - // flag name, or if def_optional is false, there must be a '=' after - // the flag name. - if (flag_end[0] != '=') return NULL; - - // Returns the string after "=". - return flag_end + 1; -} - -// Parses a string for a bool flag, in the form of either -// "--flag=value" or "--flag". -// -// In the former case, the value is taken as true as long as it does -// not start with '0', 'f', or 'F'. 
-// -// In the latter case, the value is taken as true. -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseBoolFlag(const char* str, const char* flag, bool* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, true); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Converts the string value to a bool. - *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); - return true; -} - -// Parses a string for an Int32 flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - return ParseInt32(Message() << "The value of flag --" << flag, - value_str, value); -} - -// Parses a string for a string flag, in the form of -// "--flag=value". -// -// On success, stores the value of the flag in *value, and returns -// true. On failure, returns false without changing *value. -bool ParseStringFlag(const char* str, const char* flag, String* value) { - // Gets the value of the flag as a string. - const char* const value_str = ParseFlagValue(str, flag, false); - - // Aborts if the parsing failed. - if (value_str == NULL) return false; - - // Sets *value to the value of the flag. - *value = value_str; - return true; -} - -// Determines whether a string has a prefix that Google Test uses for its -// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. -// If Google Test detects that a command line flag has its prefix but is not -// recognized, it will print its help message. Flags starting with -// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test -// internal flags and do not trigger the help message. -static bool HasGoogleTestFlagPrefix(const char* str) { - return (SkipPrefix("--", &str) || - SkipPrefix("-", &str) || - SkipPrefix("/", &str)) && - !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && - (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || - SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); -} - -// Prints a string containing code-encoded text. The following escape -// sequences can be used in the string to control the text color: -// -// @@ prints a single '@' character. -// @R changes the color to red. -// @G changes the color to green. -// @Y changes the color to yellow. -// @D changes to the default terminal text color. -// -// TODO(wan@google.com): Write tests for this once we add stdout -// capturing to Google Test. -static void PrintColorEncoded(const char* str) { - GTestColor color = COLOR_DEFAULT; // The current color. - - // Conceptually, we split the string into segments divided by escape - // sequences. Then we print one segment at a time. At the end of - // each iteration, the str pointer advances to the beginning of the - // next segment. 
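
// [Illustrative sketch -- not part of the original source] A simplified,
// POSIX-terminal-only illustration of the '@' colour encoding described
// above. The real implementation delegates to ColoredPrintf(), which also
// handles the Windows console and the --gtest_color flag; here the escapes
// are mapped straight to ANSI codes.
#include <cstdio>

static void PrintAtEncoded(const char* str) {
  for (const char* p = str; *p != '\0'; ++p) {
    if (*p != '@') { std::putchar(*p); continue; }
    ++p;                                    // Look at the character after '@'.
    switch (*p) {
      case '@':  std::putchar('@');          break;  // "@@" -> literal '@'.
      case 'R':  std::printf("\033[0;31m");  break;  // red
      case 'G':  std::printf("\033[0;32m");  break;  // green
      case 'Y':  std::printf("\033[0;33m");  break;  // yellow
      case 'D':  std::printf("\033[m");      break;  // default colour
      case '\0': return;                             // trailing '@': stop
      default:   break;                              // unknown escape: drop it
    }
  }
  std::printf("\033[m");  // Always restore the default colour at the end.
}

int main() {
  PrintAtEncoded("@GPASS@D  ordinary text  @RFAIL@D\n");
  return 0;
}
// [end of sketch]
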
- for (;;) { - const char* p = strchr(str, '@'); - if (p == NULL) { - ColoredPrintf(color, "%s", str); - return; - } - - ColoredPrintf(color, "%s", String(str, p - str).c_str()); - - const char ch = p[1]; - str = p + 2; - if (ch == '@') { - ColoredPrintf(color, "@"); - } else if (ch == 'D') { - color = COLOR_DEFAULT; - } else if (ch == 'R') { - color = COLOR_RED; - } else if (ch == 'G') { - color = COLOR_GREEN; - } else if (ch == 'Y') { - color = COLOR_YELLOW; - } else { - --str; - } - } -} - -static const char kColorEncodedHelpMessage[] = -"This program contains tests written using " GTEST_NAME_ ". You can use the\n" -"following command line flags to control its behavior:\n" -"\n" -"Test Selection:\n" -" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" -" List the names of all tests instead of running them. The name of\n" -" TEST(Foo, Bar) is \"Foo.Bar\".\n" -" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" - "[@G-@YNEGATIVE_PATTERNS]@D\n" -" Run only the tests whose name matches one of the positive patterns but\n" -" none of the negative patterns. '?' matches any single character; '*'\n" -" matches any substring; ':' separates two patterns.\n" -" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" -" Run all disabled tests too.\n" -"\n" -"Test Execution:\n" -" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" -" Run the tests repeatedly; use a negative count to repeat forever.\n" -" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" -" Randomize tests' orders on every iteration.\n" -" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" -" Random number seed to use for shuffling test orders (between 1 and\n" -" 99999, or 0 to use a seed based on the current time).\n" -"\n" -"Test Output:\n" -" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" -" Enable/disable colored output. The default is @Gauto@D.\n" -" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" -" Don't print the elapsed time of each test.\n" -" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" - GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" -" Generate an XML report in the given directory or with the given file\n" -" name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" -#if GTEST_CAN_STREAM_RESULTS_ -" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" -" Stream test results to the given server.\n" -#endif // GTEST_CAN_STREAM_RESULTS_ -"\n" -"Assertion Behavior:\n" -#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" -" Set the default death test style.\n" -#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS -" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" -" Turn assertion failures into debugger break-points.\n" -" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" -" Turn assertion failures into C++ exceptions.\n" -" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" -" Do not report exceptions as test failures. Instead, allow them\n" -" to crash the program or throw a pop-up (on Windows).\n" -"\n" -"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " - "the corresponding\n" -"environment variable of a flag (all letters in upper-case). For example, to\n" -"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ - "color=no@D or set\n" -"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" -"\n" -"For more information, please read the " GTEST_NAME_ " documentation at\n" -"@G" GTEST_PROJECT_URL_ "@D. 
If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test. The type parameter CharType can be
-// instantiated to either char or wchar_t.
-template <typename CharType>
-void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
-  for (int i = 1; i < *argc; i++) {
-    const String arg_string = StreamableToString(argv[i]);
-    const char* const arg = arg_string.c_str();
-
-    using internal::ParseBoolFlag;
-    using internal::ParseInt32Flag;
-    using internal::ParseStringFlag;
-
-    // Do we see a Google Test flag?
-    if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
-                      &GTEST_FLAG(also_run_disabled_tests)) ||
-        ParseBoolFlag(arg, kBreakOnFailureFlag,
-                      &GTEST_FLAG(break_on_failure)) ||
-        ParseBoolFlag(arg, kCatchExceptionsFlag,
-                      &GTEST_FLAG(catch_exceptions)) ||
-        ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-        ParseStringFlag(arg, kDeathTestStyleFlag,
-                        &GTEST_FLAG(death_test_style)) ||
-        ParseBoolFlag(arg, kDeathTestUseFork,
-                      &GTEST_FLAG(death_test_use_fork)) ||
-        ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-        ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                        &GTEST_FLAG(internal_run_death_test)) ||
-        ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-        ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-        ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-        ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-        ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-        ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-        ParseInt32Flag(arg, kStackTraceDepthFlag,
-                       &GTEST_FLAG(stack_trace_depth)) ||
-        ParseStringFlag(arg, kStreamResultToFlag,
-                        &GTEST_FLAG(stream_result_to)) ||
-        ParseBoolFlag(arg, kThrowOnFailureFlag,
-                      &GTEST_FLAG(throw_on_failure))
-        ) {
-      // Yes. Shift the remainder of the argv list left by one. Note
-      // that argv has (*argc + 1) elements, the last one always being
-      // NULL. The following loop moves the trailing NULL element as
-      // well.
-      for (int j = i; j != *argc; j++) {
-        argv[j] = argv[j + 1];
-      }
-
-      // Decrements the argument count.
-      (*argc)--;
-
-      // We also need to decrement the iterator as we just removed
-      // an element.
-      i--;
-    } else if (arg_string == "--help" || arg_string == "-h" ||
-               arg_string == "-?" || arg_string == "/?" ||
-               HasGoogleTestFlagPrefix(arg)) {
-      // Both help flag and unrecognized Google Test flags (excluding
-      // internal ones) trigger help display.
-      g_help_flag = true;
-    }
-  }
-
-  if (g_help_flag) {
-    // We print the help here instead of in RUN_ALL_TESTS(), as the
-    // latter may not be called at all if the user is using Google
-    // Test with another testing framework.
-    PrintColorEncoded(kColorEncodedHelpMessage);
-  }
-}
-
-// Parses the command line for Google Test flags, without initializing
-// other parts of Google Test.
-void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
-  ParseGoogleTestFlagsOnlyImpl(argc, argv);
-}
-
-// The internal implementation of InitGoogleTest().
-//
-// The type parameter CharType can be instantiated to either char or
-// wchar_t.
-template <typename CharType>
-void InitGoogleTestImpl(int* argc, CharType** argv) {
-  g_init_gtest_count++;
-
-  // We don't want to run the initialization code twice.
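
// [Illustrative sketch -- not part of the original source] The net effect of
// the parsing loop above: every recognised --gtest_* argument is consumed,
// the rest of argv (including the trailing NULL) is shifted left, and *argc
// shrinks, leaving the program's own flags untouched. A standalone
// demonstration of the same shifting idiom with a made-up "--demo_" prefix
// (not a Google Test flag):
#include <cstdio>
#include <cstring>

static void ConsumeDemoFlags(int* argc, char** argv) {
  for (int i = 1; i < *argc; ++i) {
    if (std::strncmp(argv[i], "--demo_", 7) == 0) {
      // argv has (*argc + 1) elements; the last one is NULL and is moved too.
      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];
      --(*argc);
      --i;  // Re-examine the element that just slid into position i.
    }
  }
}

int main() {
  char prog[] = "./a.out", eat[] = "--demo_repeat=3", keep[] = "--verbose";
  char* argv[] = { prog, eat, keep, NULL };
  int argc = 3;
  ConsumeDemoFlags(&argc, argv);
  for (int i = 0; i < argc; ++i) std::printf("argv[%d] = %s\n", i, argv[i]);
  // Prints "./a.out" and "--verbose"; the --demo_ flag has been removed.
  return 0;
}
// [end of sketch]
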
- if (g_init_gtest_count != 1) return; - - if (*argc <= 0) return; - - internal::g_executable_path = internal::StreamableToString(argv[0]); - -#if GTEST_HAS_DEATH_TEST - - g_argvs.clear(); - for (int i = 0; i != *argc; i++) { - g_argvs.push_back(StreamableToString(argv[i])); - } - -#endif // GTEST_HAS_DEATH_TEST - - ParseGoogleTestFlagsOnly(argc, argv); - GetUnitTestImpl()->PostFlagParsingInit(); -} - -} // namespace internal - -// Initializes Google Test. This must be called before calling -// RUN_ALL_TESTS(). In particular, it parses a command line for the -// flags that Google Test recognizes. Whenever a Google Test flag is -// seen, it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Test flag variables are -// updated. -// -// Calling the function for the second time has no user-visible effect. -void InitGoogleTest(int* argc, char** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -void InitGoogleTest(int* argc, wchar_t** argv) { - internal::InitGoogleTestImpl(argc, argv); -} - -} // namespace testing -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) -// -// This file implements death tests. - - -#if GTEST_HAS_DEATH_TEST - -# if GTEST_OS_MAC -# include -# endif // GTEST_OS_MAC - -# include -# include -# include -# include - -# if GTEST_OS_WINDOWS -# include -# else -# include -# include -# endif // GTEST_OS_WINDOWS - -#endif // GTEST_HAS_DEATH_TEST - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -// Constants. - -// The default death test style. 
-static const char kDefaultDeathTestStyle[] = "fast"; - -GTEST_DEFINE_string_( - death_test_style, - internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), - "Indicates how to run a death test in a forked child process: " - "\"threadsafe\" (child process re-executes the test binary " - "from the beginning, running only the specific death test) or " - "\"fast\" (child process runs the death test immediately " - "after forking)."); - -GTEST_DEFINE_bool_( - death_test_use_fork, - internal::BoolFromGTestEnv("death_test_use_fork", false), - "Instructs to use fork()/_exit() instead of clone() in death tests. " - "Ignored and always uses fork() on POSIX systems where clone() is not " - "implemented. Useful when running under valgrind or similar tools if " - "those do not support clone(). Valgrind 3.3.1 will just fail if " - "it sees an unsupported combination of clone() flags. " - "It is not recommended to use this flag w/o valgrind though it will " - "work in 99% of the cases. Once valgrind is fixed, this flag will " - "most likely be removed."); - -namespace internal { -GTEST_DEFINE_string_( - internal_run_death_test, "", - "Indicates the file, line number, temporal index of " - "the single death test to run, and a file descriptor to " - "which a success code may be sent, all separated by " - "colons. This flag is specified if and only if the current " - "process is a sub-process launched for running a thread-safe " - "death test. FOR INTERNAL USE ONLY."); -} // namespace internal - -#if GTEST_HAS_DEATH_TEST - -// ExitedWithCode constructor. -ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { -} - -// ExitedWithCode function-call operator. -bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS - - return exit_status == exit_code_; - -# else - - return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; - -# endif // GTEST_OS_WINDOWS -} - -# if !GTEST_OS_WINDOWS -// KilledBySignal constructor. -KilledBySignal::KilledBySignal(int signum) : signum_(signum) { -} - -// KilledBySignal function-call operator. -bool KilledBySignal::operator()(int exit_status) const { - return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; -} -# endif // !GTEST_OS_WINDOWS - -namespace internal { - -// Utilities needed for death tests. - -// Generates a textual description of a given exit code, in the format -// specified by wait(2). -static String ExitSummary(int exit_code) { - Message m; - -# if GTEST_OS_WINDOWS - - m << "Exited with exit status " << exit_code; - -# else - - if (WIFEXITED(exit_code)) { - m << "Exited with exit status " << WEXITSTATUS(exit_code); - } else if (WIFSIGNALED(exit_code)) { - m << "Terminated by signal " << WTERMSIG(exit_code); - } -# ifdef WCOREDUMP - if (WCOREDUMP(exit_code)) { - m << " (core dumped)"; - } -# endif -# endif // GTEST_OS_WINDOWS - - return m.GetString(); -} - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -bool ExitedUnsuccessfully(int exit_status) { - return !ExitedWithCode(0)(exit_status); -} - -# if !GTEST_OS_WINDOWS -// Generates a textual failure message when a death test finds more than -// one thread running, or cannot determine the number of threads, prior -// to executing the given statement. It is the responsibility of the -// caller not to pass a thread_count of 1. 
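
// [Illustrative sketch -- not part of the original source] Typical use of the
// ExitedWithCode and KilledBySignal predicates defined above from user test
// code, via the documented public macros; the header path and test names are
// illustrative:
#include <csignal>
#include <cstdlib>
#include "gtest/gtest.h"

static void ExitCleanly()   { std::exit(2); }
static void CrashWithSegv() { std::raise(SIGSEGV); }

TEST(MyDeathTest, Examples) {
  // Passes if the statement terminates with exit status 2; the empty regular
  // expression places no constraint on the captured stderr output.
  EXPECT_EXIT(ExitCleanly(), ::testing::ExitedWithCode(2), "");
#if !GTEST_OS_WINDOWS
  // KilledBySignal is only available on POSIX systems (see the #if above).
  EXPECT_EXIT(CrashWithSegv(), ::testing::KilledBySignal(SIGSEGV), "");
#endif
  // EXPECT_DEATH is shorthand for "died with a non-zero status or a signal,
  // and stderr matches the given regular expression".
  EXPECT_DEATH(CrashWithSegv(), "");
}
// [end of sketch]
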
-static String DeathTestThreadWarning(size_t thread_count) { - Message msg; - msg << "Death tests use fork(), which is unsafe particularly" - << " in a threaded context. For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) - msg << "couldn't detect the number of threads."; - else - msg << "detected " << thread_count << " threads."; - return msg.GetString(); -} -# endif // !GTEST_OS_WINDOWS - -// Flag characters for reporting a death test that did not die. -static const char kDeathTestLived = 'L'; -static const char kDeathTestReturned = 'R'; -static const char kDeathTestThrew = 'T'; -static const char kDeathTestInternalError = 'I'; - -// An enumeration describing all of the possible ways that a death test can -// conclude. DIED means that the process died while executing the test -// code; LIVED means that process lived beyond the end of the test code; -// RETURNED means that the test statement attempted to execute a return -// statement, which is not allowed; THREW means that the test statement -// returned control by throwing an exception. IN_PROGRESS means the test -// has not yet concluded. -// TODO(vladl@google.com): Unify names and possibly values for -// AbortReason, DeathTestOutcome, and flag characters above. -enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; - -// Routine for aborting the program which is safe to call from an -// exec-style death test child process, in which case the error -// message is propagated back to the parent process. Otherwise, the -// message is simply printed to stderr. In either case, the program -// then exits with status 1. -void DeathTestAbort(const String& message) { - // On a POSIX system, this function may be called from a threadsafe-style - // death test child process, which operates on a very small stack. Use - // the heap for any additional non-minuscule memory requirements. - const InternalRunDeathTestFlag* const flag = - GetUnitTestImpl()->internal_run_death_test_flag(); - if (flag != NULL) { - FILE* parent = posix::FDOpen(flag->write_fd(), "w"); - fputc(kDeathTestInternalError, parent); - fprintf(parent, "%s", message.c_str()); - fflush(parent); - _exit(1); - } else { - fprintf(stderr, "%s", message.c_str()); - fflush(stderr); - posix::Abort(); - } -} - -// A replacement for CHECK that calls DeathTestAbort if the assertion -// fails. -# define GTEST_DEATH_TEST_CHECK_(expression) \ - do { \ - if (!::testing::internal::IsTrue(expression)) { \ - DeathTestAbort(::testing::internal::String::Format( \ - "CHECK failed: File %s, line %d: %s", \ - __FILE__, __LINE__, #expression)); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for -// evaluating any system call that fulfills two conditions: it must return -// -1 on failure, and set errno to EINTR when it is interrupted and -// should be tried again. The macro expands to a loop that repeatedly -// evaluates the expression as long as it evaluates to -1 and sets -// errno to EINTR. If the expression evaluates to -1 but errno is -// something other than EINTR, DeathTestAbort is called. 
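
// [Illustrative sketch -- not part of the original source] The macro described
// above wraps the classic POSIX EINTR-retry idiom. A standalone, non-macro
// version of the same pattern applied to read(); the function name is
// illustrative:
#include <cerrno>
#include <unistd.h>

static ssize_t ReadRetryingOnEintr(int fd, void* buffer, size_t count) {
  ssize_t result;
  do {
    result = read(fd, buffer, count);        // May be interrupted by a signal.
  } while (result == -1 && errno == EINTR);  // Retry only for EINTR.
  return result;  // -1 with any other errno is a real error for the caller.
}
// [end of sketch]
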
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ - do { \ - int gtest_retval; \ - do { \ - gtest_retval = (expression); \ - } while (gtest_retval == -1 && errno == EINTR); \ - if (gtest_retval == -1) { \ - DeathTestAbort(::testing::internal::String::Format( \ - "CHECK failed: File %s, line %d: %s != -1", \ - __FILE__, __LINE__, #expression)); \ - } \ - } while (::testing::internal::AlwaysFalse()) - -// Returns the message describing the last system error in errno. -String GetLastErrnoDescription() { - return String(errno == 0 ? "" : posix::StrError(errno)); -} - -// This is called from a death test parent process to read a failure -// message from the death test child process and log it with the FATAL -// severity. On Windows, the message is read from a pipe handle. On other -// platforms, it is read from a file descriptor. -static void FailFromInternalError(int fd) { - Message error; - char buffer[256]; - int num_read; - - do { - while ((num_read = posix::Read(fd, buffer, 255)) > 0) { - buffer[num_read] = '\0'; - error << buffer; - } - } while (num_read == -1 && errno == EINTR); - - if (num_read == 0) { - GTEST_LOG_(FATAL) << error.GetString(); - } else { - const int last_error = errno; - GTEST_LOG_(FATAL) << "Error while reading death test internal: " - << GetLastErrnoDescription() << " [" << last_error << "]"; - } -} - -// Death test constructor. Increments the running death test count -// for the current test. -DeathTest::DeathTest() { - TestInfo* const info = GetUnitTestImpl()->current_test_info(); - if (info == NULL) { - DeathTestAbort("Cannot run a death test outside of a TEST or " - "TEST_F construct"); - } -} - -// Creates and returns a death test by dispatching to the current -// death test factory. -bool DeathTest::Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) { - return GetUnitTestImpl()->death_test_factory()->Create( - statement, regex, file, line, test); -} - -const char* DeathTest::LastMessage() { - return last_death_test_message_.c_str(); -} - -void DeathTest::set_last_death_test_message(const String& message) { - last_death_test_message_ = message; -} - -String DeathTest::last_death_test_message_; - -// Provides cross platform implementation for some death functionality. -class DeathTestImpl : public DeathTest { - protected: - DeathTestImpl(const char* a_statement, const RE* a_regex) - : statement_(a_statement), - regex_(a_regex), - spawned_(false), - status_(-1), - outcome_(IN_PROGRESS), - read_fd_(-1), - write_fd_(-1) {} - - // read_fd_ is expected to be closed and cleared by a derived class. - ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); } - - void Abort(AbortReason reason); - virtual bool Passed(bool status_ok); - - const char* statement() const { return statement_; } - const RE* regex() const { return regex_; } - bool spawned() const { return spawned_; } - void set_spawned(bool is_spawned) { spawned_ = is_spawned; } - int status() const { return status_; } - void set_status(int a_status) { status_ = a_status; } - DeathTestOutcome outcome() const { return outcome_; } - void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; } - int read_fd() const { return read_fd_; } - void set_read_fd(int fd) { read_fd_ = fd; } - int write_fd() const { return write_fd_; } - void set_write_fd(int fd) { write_fd_ = fd; } - - // Called in the parent process only. Reads the result code of the death - // test child process via a pipe, interprets it to set the outcome_ - // member, and closes read_fd_. 
Outputs diagnostics and terminates in - // case of unexpected codes. - void ReadAndInterpretStatusByte(); - - private: - // The textual content of the code this object is testing. This class - // doesn't own this string and should not attempt to delete it. - const char* const statement_; - // The regular expression which test output must match. DeathTestImpl - // doesn't own this object and should not attempt to delete it. - const RE* const regex_; - // True if the death test child process has been successfully spawned. - bool spawned_; - // The exit status of the child process. - int status_; - // How the death test concluded. - DeathTestOutcome outcome_; - // Descriptor to the read end of the pipe to the child process. It is - // always -1 in the child process. The child keeps its write end of the - // pipe in write_fd_. - int read_fd_; - // Descriptor to the child's write end of the pipe to the parent process. - // It is always -1 in the parent process. The parent keeps its end of the - // pipe in read_fd_. - int write_fd_; -}; - -// Called in the parent process only. Reads the result code of the death -// test child process via a pipe, interprets it to set the outcome_ -// member, and closes read_fd_. Outputs diagnostics and terminates in -// case of unexpected codes. -void DeathTestImpl::ReadAndInterpretStatusByte() { - char flag; - int bytes_read; - - // The read() here blocks until data is available (signifying the - // failure of the death test) or until the pipe is closed (signifying - // its success), so it's okay to call this in the parent before - // the child process has exited. - do { - bytes_read = posix::Read(read_fd(), &flag, 1); - } while (bytes_read == -1 && errno == EINTR); - - if (bytes_read == 0) { - set_outcome(DIED); - } else if (bytes_read == 1) { - switch (flag) { - case kDeathTestReturned: - set_outcome(RETURNED); - break; - case kDeathTestThrew: - set_outcome(THREW); - break; - case kDeathTestLived: - set_outcome(LIVED); - break; - case kDeathTestInternalError: - FailFromInternalError(read_fd()); // Does not return. - break; - default: - GTEST_LOG_(FATAL) << "Death test child process reported " - << "unexpected status byte (" - << static_cast(flag) << ")"; - } - } else { - GTEST_LOG_(FATAL) << "Read from death test child process failed: " - << GetLastErrnoDescription(); - } - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); - set_read_fd(-1); -} - -// Signals that the death test code which should have exited, didn't. -// Should be called only in a death test child process. -// Writes a status byte to the child's status file descriptor, then -// calls _exit(1). -void DeathTestImpl::Abort(AbortReason reason) { - // The parent process considers the death test to be a failure if - // it finds any data in our pipe. So, here we write a single flag byte - // to the pipe, then exit. - const char status_ch = - reason == TEST_DID_NOT_DIE ? kDeathTestLived : - reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned; - - GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); - // We are leaking the descriptor here because on some platforms (i.e., - // when built as Windows DLL), destructors of global objects will still - // run after calling _exit(). On such systems, write_fd_ will be - // indirectly closed from the destructor of UnitTestImpl, causing double - // close if it is also closed here. On debug configurations, double close - // may assert. 
As there are no in-process buffers to flush here, we are - // relying on the OS to close the descriptor after the process terminates - // when the destructors are not run. - _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) -} - -// Returns an indented copy of stderr output for a death test. -// This makes distinguishing death test output lines from regular log lines -// much easier. -static ::std::string FormatDeathTestOutput(const ::std::string& output) { - ::std::string ret; - for (size_t at = 0; ; ) { - const size_t line_end = output.find('\n', at); - ret += "[ DEATH ] "; - if (line_end == ::std::string::npos) { - ret += output.substr(at); - break; - } - ret += output.substr(at, line_end + 1 - at); - at = line_end + 1; - } - return ret; -} - -// Assesses the success or failure of a death test, using both private -// members which have previously been set, and one argument: -// -// Private data members: -// outcome: An enumeration describing how the death test -// concluded: DIED, LIVED, THREW, or RETURNED. The death test -// fails in the latter three cases. -// status: The exit status of the child process. On *nix, it is in the -// in the format specified by wait(2). On Windows, this is the -// value supplied to the ExitProcess() API or a numeric code -// of the exception that terminated the program. -// regex: A regular expression object to be applied to -// the test's captured standard error output; the death test -// fails if it does not match. -// -// Argument: -// status_ok: true if exit_status is acceptable in the context of -// this particular death test, which fails if it is false -// -// Returns true iff all of the above conditions are met. Otherwise, the -// first failing condition, in the order given above, is the one that is -// reported. Also sets the last death test message string. -bool DeathTestImpl::Passed(bool status_ok) { - if (!spawned()) - return false; - - const String error_message = GetCapturedStderr(); - - bool success = false; - Message buffer; - - buffer << "Death test: " << statement() << "\n"; - switch (outcome()) { - case LIVED: - buffer << " Result: failed to die.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case THREW: - buffer << " Result: threw an exception.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case RETURNED: - buffer << " Result: illegal return in test statement.\n" - << " Error msg:\n" << FormatDeathTestOutput(error_message); - break; - case DIED: - if (status_ok) { - const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); - if (matched) { - success = true; - } else { - buffer << " Result: died but not with expected error.\n" - << " Expected: " << regex()->pattern() << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - } else { - buffer << " Result: died but not with expected exit code:\n" - << " " << ExitSummary(status()) << "\n" - << "Actual msg:\n" << FormatDeathTestOutput(error_message); - } - break; - case IN_PROGRESS: - default: - GTEST_LOG_(FATAL) - << "DeathTest::Passed somehow called before conclusion of test"; - } - - DeathTest::set_last_death_test_message(buffer.GetString()); - return success; -} - -# if GTEST_OS_WINDOWS -// WindowsDeathTest implements death tests on Windows. 
Due to the -// specifics of starting new processes on Windows, death tests there are -// always threadsafe, and Google Test considers the -// --gtest_death_test_style=fast setting to be equivalent to -// --gtest_death_test_style=threadsafe there. -// -// A few implementation notes: Like the Linux version, the Windows -// implementation uses pipes for child-to-parent communication. But due to -// the specifics of pipes on Windows, some extra steps are required: -// -// 1. The parent creates a communication pipe and stores handles to both -// ends of it. -// 2. The parent starts the child and provides it with the information -// necessary to acquire the handle to the write end of the pipe. -// 3. The child acquires the write end of the pipe and signals the parent -// using a Windows event. -// 4. Now the parent can release the write end of the pipe on its side. If -// this is done before step 3, the object's reference count goes down to -// 0 and it is destroyed, preventing the child from acquiring it. The -// parent now has to release it, or read operations on the read end of -// the pipe will not return when the child terminates. -// 5. The parent reads child's output through the pipe (outcome code and -// any possible error messages) from the pipe, and its stderr and then -// determines whether to fail the test. -// -// Note: to distinguish Win32 API calls from the local method and function -// calls, the former are explicitly resolved in the global namespace. -// -class WindowsDeathTest : public DeathTestImpl { - public: - WindowsDeathTest(const char* a_statement, - const RE* a_regex, - const char* file, - int line) - : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - virtual TestRole AssumeRole(); - - private: - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; - // Handle to the write end of the pipe to the child process. - AutoHandle write_handle_; - // Child process handle. - AutoHandle child_handle_; - // Event the child process uses to signal the parent that it has - // acquired the handle to the write end of the pipe. After seeing this - // event the parent can release its own handles to make sure its - // ReadFile() calls return when the child terminates. - AutoHandle event_handle_; -}; - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int WindowsDeathTest::Wait() { - if (!spawned()) - return 0; - - // Wait until the child either signals that it has acquired the write end - // of the pipe or it dies. - const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; - switch (::WaitForMultipleObjects(2, - wait_handles, - FALSE, // Waits for any of the handles. - INFINITE)) { - case WAIT_OBJECT_0: - case WAIT_OBJECT_0 + 1: - break; - default: - GTEST_DEATH_TEST_CHECK_(false); // Should not get here. - } - - // The child has acquired the write end of the pipe or exited. - // We release the handle on our side and continue. - write_handle_.Reset(); - event_handle_.Reset(); - - ReadAndInterpretStatusByte(); - - // Waits for the child process to exit if it haven't already. This - // returns immediately if the child has already exited, regardless of - // whether previous calls to WaitForMultipleObjects synchronized on this - // handle or not. 
- GTEST_DEATH_TEST_CHECK_( - WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), - INFINITE)); - DWORD status_code; - GTEST_DEATH_TEST_CHECK_( - ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); - child_handle_.Reset(); - set_status(static_cast(status_code)); - return status(); -} - -// The AssumeRole process for a Windows death test. It creates a child -// process with the same executable as the current process to run the -// death test. The child process is given the --gtest_filter and -// --gtest_internal_run_death_test flags such that it knows to run the -// current death test only. -DeathTest::TestRole WindowsDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - // ParseInternalRunDeathTestFlag() has performed all the necessary - // processing. - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - // WindowsDeathTest uses an anonymous pipe to communicate results of - // a death test. - SECURITY_ATTRIBUTES handles_are_inheritable = { - sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; - HANDLE read_handle, write_handle; - GTEST_DEATH_TEST_CHECK_( - ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, - 0) // Default buffer size. - != FALSE); - set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), - O_RDONLY)); - write_handle_.Reset(write_handle); - event_handle_.Reset(::CreateEvent( - &handles_are_inheritable, - TRUE, // The event will automatically reset to non-signaled state. - FALSE, // The initial state is non-signalled. - NULL)); // The even is unnamed. - GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); - const String filter_flag = String::Format("--%s%s=%s.%s", - GTEST_FLAG_PREFIX_, kFilterFlag, - info->test_case_name(), - info->name()); - const String internal_flag = String::Format( - "--%s%s=%s|%d|%d|%u|%Iu|%Iu", - GTEST_FLAG_PREFIX_, - kInternalRunDeathTestFlag, - file_, line_, - death_test_index, - static_cast(::GetCurrentProcessId()), - // size_t has the same with as pointers on both 32-bit and 64-bit - // Windows platforms. - // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. - reinterpret_cast(write_handle), - reinterpret_cast(event_handle_.Get())); - - char executable_path[_MAX_PATH + 1]; // NOLINT - GTEST_DEATH_TEST_CHECK_( - _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, - executable_path, - _MAX_PATH)); - - String command_line = String::Format("%s %s \"%s\"", - ::GetCommandLineA(), - filter_flag.c_str(), - internal_flag.c_str()); - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // Flush the log buffers since the log streams are shared with the child. - FlushInfoLog(); - - // The child process will share the standard handles with the parent. - STARTUPINFOA startup_info; - memset(&startup_info, 0, sizeof(STARTUPINFO)); - startup_info.dwFlags = STARTF_USESTDHANDLES; - startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); - startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); - startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); - - PROCESS_INFORMATION process_info; - GTEST_DEATH_TEST_CHECK_(::CreateProcessA( - executable_path, - const_cast(command_line.c_str()), - NULL, // Retuned process handle is not inheritable. - NULL, // Retuned thread handle is not inheritable. 
- TRUE, // Child inherits all inheritable handles (for write_handle_). - 0x0, // Default creation flags. - NULL, // Inherit the parent's environment. - UnitTest::GetInstance()->original_working_dir(), - &startup_info, - &process_info) != FALSE); - child_handle_.Reset(process_info.hProcess); - ::CloseHandle(process_info.hThread); - set_spawned(true); - return OVERSEE_TEST; -} -# else // We are not on Windows. - -// ForkingDeathTest provides implementations for most of the abstract -// methods of the DeathTest interface. Only the AssumeRole method is -// left undefined. -class ForkingDeathTest : public DeathTestImpl { - public: - ForkingDeathTest(const char* statement, const RE* regex); - - // All of these virtual functions are inherited from DeathTest. - virtual int Wait(); - - protected: - void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } - - private: - // PID of child process during death test; 0 in the child process itself. - pid_t child_pid_; -}; - -// Constructs a ForkingDeathTest. -ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) - : DeathTestImpl(a_statement, a_regex), - child_pid_(-1) {} - -// Waits for the child in a death test to exit, returning its exit -// status, or 0 if no child process exists. As a side effect, sets the -// outcome data member. -int ForkingDeathTest::Wait() { - if (!spawned()) - return 0; - - ReadAndInterpretStatusByte(); - - int status_value; - GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); - set_status(status_value); - return status_value; -} - -// A concrete death test class that forks, then immediately runs the test -// in the child process. -class NoExecDeathTest : public ForkingDeathTest { - public: - NoExecDeathTest(const char* a_statement, const RE* a_regex) : - ForkingDeathTest(a_statement, a_regex) { } - virtual TestRole AssumeRole(); -}; - -// The AssumeRole process for a fork-and-run death test. It implements a -// straightforward fork, with a simple pipe to transmit the status byte. -DeathTest::TestRole NoExecDeathTest::AssumeRole() { - const size_t thread_count = GetThreadCount(); - if (thread_count != 1) { - GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - - DeathTest::set_last_death_test_message(""); - CaptureStderr(); - // When we fork the process below, the log file buffers are copied, but the - // file descriptors are shared. We flush all log files here so that closing - // the file descriptors in the child process doesn't throw off the - // synchronization between descriptors and buffers in the parent process. - // This is as close to the fork as possible to avoid a race condition in case - // there are multiple threads running before the death test, and another - // thread writes to the log file. - FlushInfoLog(); - - const pid_t child_pid = fork(); - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - set_child_pid(child_pid); - if (child_pid == 0) { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); - set_write_fd(pipe_fd[1]); - // Redirects all logging to stderr in the child process to prevent - // concurrent writes to the log files. We capture stderr in the parent - // process and append the child process' output to a log. - LogToStderr(); - // Event forwarding to the listeners of event listener API mush be shut - // down in death test subprocesses. 
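
// [Illustrative sketch -- not part of the original source] The parent/child
// plumbing used by the "fast" style above, reduced to its essentials: make a
// pipe, fork, let the child keep the write end and the parent the read end,
// and treat "child wrote a byte" as "the guarded code failed to die". This
// standalone sketch is not the Google Test implementation; error handling is
// minimal.
#include <cstdio>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main() {
  int pipe_fd[2];
  if (pipe(pipe_fd) == -1) return 1;

  const pid_t child = fork();
  if (child == 0) {                       // Child: run the guarded statement.
    close(pipe_fd[0]);
    // ... run the statement that is expected to die here ...
    // If we are still alive afterwards, report it, just as Abort() above
    // writes kDeathTestLived before calling _exit(1).
    const char lived = 'L';
    write(pipe_fd[1], &lived, 1);
    _exit(1);
  }

  close(pipe_fd[1]);                      // Parent: keep only the read end.
  char flag;
  const ssize_t n = read(pipe_fd[0], &flag, 1);  // 0 bytes => child died first.
  int status = 0;
  waitpid(child, &status, 0);
  std::printf(n == 0 ? "child died (death test passes)\n"
                     : "child survived (death test fails)\n");
  close(pipe_fd[0]);
  return 0;
}
// [end of sketch]
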
- GetUnitTestImpl()->listeners()->SuppressEventForwarding(); - return EXECUTE_TEST; - } else { - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; - } -} - -// A concrete death test class that forks and re-executes the main -// program from the beginning, with command-line flags set that cause -// only this specific death test to be run. -class ExecDeathTest : public ForkingDeathTest { - public: - ExecDeathTest(const char* a_statement, const RE* a_regex, - const char* file, int line) : - ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } - virtual TestRole AssumeRole(); - private: - // The name of the file in which the death test is located. - const char* const file_; - // The line number on which the death test is located. - const int line_; -}; - -// Utility class for accumulating command-line arguments. -class Arguments { - public: - Arguments() { - args_.push_back(NULL); - } - - ~Arguments() { - for (std::vector::iterator i = args_.begin(); i != args_.end(); - ++i) { - free(*i); - } - } - void AddArgument(const char* argument) { - args_.insert(args_.end() - 1, posix::StrDup(argument)); - } - - template - void AddArguments(const ::std::vector& arguments) { - for (typename ::std::vector::const_iterator i = arguments.begin(); - i != arguments.end(); - ++i) { - args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); - } - } - char* const* Argv() { - return &args_[0]; - } - private: - std::vector args_; -}; - -// A struct that encompasses the arguments to the child process of a -// threadsafe-style death test process. -struct ExecDeathTestArgs { - char* const* argv; // Command-line arguments for the child's call to exec - int close_fd; // File descriptor to close; the read end of a pipe -}; - -# if GTEST_OS_MAC -inline char** GetEnviron() { - // When Google Test is built as a framework on MacOS X, the environ variable - // is unavailable. Apple's documentation (man environ) recommends using - // _NSGetEnviron() instead. - return *_NSGetEnviron(); -} -# else -// Some POSIX platforms expect you to declare environ. extern "C" makes -// it reside in the global namespace. -extern "C" char** environ; -inline char** GetEnviron() { return environ; } -# endif // GTEST_OS_MAC - -// The main function for a threadsafe-style death test child process. -// This function is called in a clone()-ed process and thus must avoid -// any potentially unsafe operations like malloc or libc functions. -static int ExecDeathTestChildMain(void* child_arg) { - ExecDeathTestArgs* const args = static_cast(child_arg); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); - - // We need to execute the test program in the same environment where - // it was originally invoked. Therefore we change to the original - // working directory first. - const char* const original_dir = - UnitTest::GetInstance()->original_working_dir(); - // We can safely call chdir() as it's a direct system call. - if (chdir(original_dir) != 0) { - DeathTestAbort(String::Format("chdir(\"%s\") failed: %s", - original_dir, - GetLastErrnoDescription().c_str())); - return EXIT_FAILURE; - } - - // We can safely call execve() as it's a direct system call. We - // cannot use execvp() as it's a libc function and thus potentially - // unsafe. Since execve() doesn't search the PATH, the user must - // invoke the test program via a valid path that contains at least - // one path separator. 
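
// [Illustrative sketch -- not part of the original source] What the Arguments
// helper above ultimately produces: heap-allocated C strings arranged as a
// NULL-terminated char* array, which is the shape execv()/execve() require.
// A minimal standalone equivalent; the class name is illustrative, and
// strdup() is POSIX:
#include <cstdlib>
#include <cstring>
#include <vector>

class ArgvBuilder {
 public:
  ArgvBuilder() { args_.push_back(NULL); }           // Keep the trailing NULL.
  ~ArgvBuilder() {
    for (size_t i = 0; i + 1 < args_.size(); ++i) std::free(args_[i]);
  }
  void Add(const char* arg) {
    args_.insert(args_.end() - 1, strdup(arg));      // Insert before the NULL.
  }
  char* const* Argv() { return &args_[0]; }  // e.g. execv(Argv()[0], Argv());
 private:
  std::vector<char*> args_;
};
// [end of sketch]
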
- execve(args->argv[0], args->argv, GetEnviron()); - DeathTestAbort(String::Format("execve(%s, ...) in %s failed: %s", - args->argv[0], - original_dir, - GetLastErrnoDescription().c_str())); - return EXIT_FAILURE; -} - -// Two utility routines that together determine the direction the stack -// grows. -// This could be accomplished more elegantly by a single recursive -// function, but we want to guard against the unlikely possibility of -// a smart compiler optimizing the recursion away. -// -// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining -// StackLowerThanAddress into StackGrowsDown, which then doesn't give -// correct answer. -bool StackLowerThanAddress(const void* ptr) GTEST_NO_INLINE_; -bool StackLowerThanAddress(const void* ptr) { - int dummy; - return &dummy < ptr; -} - -bool StackGrowsDown() { - int dummy; - return StackLowerThanAddress(&dummy); -} - -// A threadsafe implementation of fork(2) for threadsafe-style death tests -// that uses clone(2). It dies with an error message if anything goes -// wrong. -static pid_t ExecDeathTestFork(char* const* argv, int close_fd) { - ExecDeathTestArgs args = { argv, close_fd }; - pid_t child_pid = -1; - -# if GTEST_HAS_CLONE - const bool use_fork = GTEST_FLAG(death_test_use_fork); - - if (!use_fork) { - static const bool stack_grows_down = StackGrowsDown(); - const size_t stack_size = getpagesize(); - // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. - void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0); - GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); - void* const stack_top = - static_cast(stack) + (stack_grows_down ? stack_size : 0); - - child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); - - GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); - } -# else - const bool use_fork = true; -# endif // GTEST_HAS_CLONE - - if (use_fork && (child_pid = fork()) == 0) { - ExecDeathTestChildMain(&args); - _exit(0); - } - - GTEST_DEATH_TEST_CHECK_(child_pid != -1); - return child_pid; -} - -// The AssumeRole process for a fork-and-exec death test. It re-executes the -// main program from the beginning, setting the --gtest_filter -// and --gtest_internal_run_death_test flags to cause only the current -// death test to be re-run. 
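
// [Illustrative sketch -- not part of the original source] The two-function
// trick above, as a standalone program. The GCC/Clang noinline attribute
// plays the role of GTEST_NO_INLINE_ here; this is an illustration, not the
// portable macro:
#include <cstdio>

__attribute__((noinline)) static bool IsLowerThan(const void* ptr) {
  int local;                  // Lives in the callee's (deeper) stack frame.
  return &local < ptr;
}

int main() {
  int local;                  // Lives in the caller's stack frame.
  std::printf("stack grows %s\n", IsLowerThan(&local) ? "down" : "up");
  return 0;
}
// [end of sketch]
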
-DeathTest::TestRole ExecDeathTest::AssumeRole() { - const UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const TestInfo* const info = impl->current_test_info(); - const int death_test_index = info->result()->death_test_count(); - - if (flag != NULL) { - set_write_fd(flag->write_fd()); - return EXECUTE_TEST; - } - - int pipe_fd[2]; - GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); - // Clear the close-on-exec flag on the write end of the pipe, lest - // it be closed when the child process does an exec: - GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); - - const String filter_flag = - String::Format("--%s%s=%s.%s", - GTEST_FLAG_PREFIX_, kFilterFlag, - info->test_case_name(), info->name()); - const String internal_flag = - String::Format("--%s%s=%s|%d|%d|%d", - GTEST_FLAG_PREFIX_, kInternalRunDeathTestFlag, - file_, line_, death_test_index, pipe_fd[1]); - Arguments args; - args.AddArguments(GetArgvs()); - args.AddArgument(filter_flag.c_str()); - args.AddArgument(internal_flag.c_str()); - - DeathTest::set_last_death_test_message(""); - - CaptureStderr(); - // See the comment in NoExecDeathTest::AssumeRole for why the next line - // is necessary. - FlushInfoLog(); - - const pid_t child_pid = ExecDeathTestFork(args.Argv(), pipe_fd[0]); - GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); - set_child_pid(child_pid); - set_read_fd(pipe_fd[0]); - set_spawned(true); - return OVERSEE_TEST; -} - -# endif // !GTEST_OS_WINDOWS - -// Creates a concrete DeathTest-derived class that depends on the -// --gtest_death_test_style flag, and sets the pointer pointed to -// by the "test" argument to its address. If the test should be -// skipped, sets that pointer to NULL. Returns true, unless the -// flag is set to an invalid value. -bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, - const char* file, int line, - DeathTest** test) { - UnitTestImpl* const impl = GetUnitTestImpl(); - const InternalRunDeathTestFlag* const flag = - impl->internal_run_death_test_flag(); - const int death_test_index = impl->current_test_info() - ->increment_death_test_count(); - - if (flag != NULL) { - if (death_test_index > flag->index()) { - DeathTest::set_last_death_test_message(String::Format( - "Death test count (%d) somehow exceeded expected maximum (%d)", - death_test_index, flag->index())); - return false; - } - - if (!(flag->file() == file && flag->line() == line && - flag->index() == death_test_index)) { - *test = NULL; - return true; - } - } - -# if GTEST_OS_WINDOWS - - if (GTEST_FLAG(death_test_style) == "threadsafe" || - GTEST_FLAG(death_test_style) == "fast") { - *test = new WindowsDeathTest(statement, regex, file, line); - } - -# else - - if (GTEST_FLAG(death_test_style) == "threadsafe") { - *test = new ExecDeathTest(statement, regex, file, line); - } else if (GTEST_FLAG(death_test_style) == "fast") { - *test = new NoExecDeathTest(statement, regex); - } - -# endif // GTEST_OS_WINDOWS - - else { // NOLINT - this is more readable than unbalanced brackets inside #if. - DeathTest::set_last_death_test_message(String::Format( - "Unknown death test style \"%s\" encountered", - GTEST_FLAG(death_test_style).c_str())); - return false; - } - - return true; -} - -// Splits a given string on a given delimiter, populating a given -// vector with the fields. GTEST_HAS_DEATH_TEST implies that we have -// ::std::string, so we can use it here. 
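-// For instance, SplitString("foo.cc|42|0|5", '|', &fields) leaves
-// {"foo.cc", "42", "0", "5"} in *fields, and an empty input yields {""}.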
-static void SplitString(const ::std::string& str, char delimiter, - ::std::vector< ::std::string>* dest) { - ::std::vector< ::std::string> parsed; - ::std::string::size_type pos = 0; - while (::testing::internal::AlwaysTrue()) { - const ::std::string::size_type colon = str.find(delimiter, pos); - if (colon == ::std::string::npos) { - parsed.push_back(str.substr(pos)); - break; - } else { - parsed.push_back(str.substr(pos, colon - pos)); - pos = colon + 1; - } - } - dest->swap(parsed); -} - -# if GTEST_OS_WINDOWS -// Recreates the pipe and event handles from the provided parameters, -// signals the event, and returns a file descriptor wrapped around the pipe -// handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, - size_t write_handle_as_size_t, - size_t event_handle_as_size_t) { - AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, - FALSE, // Non-inheritable. - parent_process_id)); - if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { - DeathTestAbort(String::Format("Unable to open parent process %u", - parent_process_id)); - } - - // TODO(vladl@google.com): Replace the following check with a - // compile-time assertion when available. - GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); - - const HANDLE write_handle = - reinterpret_cast(write_handle_as_size_t); - HANDLE dup_write_handle; - - // The newly initialized handle is accessible only in in the parent - // process. To obtain one accessible within the child, we need to use - // DuplicateHandle. - if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, - ::GetCurrentProcess(), &dup_write_handle, - 0x0, // Requested privileges ignored since - // DUPLICATE_SAME_ACCESS is used. - FALSE, // Request non-inheritable handler. - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort(String::Format( - "Unable to duplicate the pipe handle %Iu from the parent process %u", - write_handle_as_size_t, parent_process_id)); - } - - const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); - HANDLE dup_event_handle; - - if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, - ::GetCurrentProcess(), &dup_event_handle, - 0x0, - FALSE, - DUPLICATE_SAME_ACCESS)) { - DeathTestAbort(String::Format( - "Unable to duplicate the event handle %Iu from the parent process %u", - event_handle_as_size_t, parent_process_id)); - } - - const int write_fd = - ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); - if (write_fd == -1) { - DeathTestAbort(String::Format( - "Unable to convert pipe handle %Iu to a file descriptor", - write_handle_as_size_t)); - } - - // Signals the parent that the write end of the pipe has been acquired - // so the parent can release its own write end. - ::SetEvent(dup_event_handle); - - return write_fd; -} -# endif // GTEST_OS_WINDOWS - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { - if (GTEST_FLAG(internal_run_death_test) == "") return NULL; - - // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we - // can use it here. 
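-  // On POSIX systems the flag value carries four '|'-separated fields, e.g.
-  // "foo.cc|42|0|5" (file, line, death-test index, write fd); on Windows it
-  // carries six, the last three identifying the parent process and the
-  // duplicated pipe and event handles.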
- int line = -1; - int index = -1; - ::std::vector< ::std::string> fields; - SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); - int write_fd = -1; - -# if GTEST_OS_WINDOWS - - unsigned int parent_process_id = 0; - size_t write_handle_as_size_t = 0; - size_t event_handle_as_size_t = 0; - - if (fields.size() != 6 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &parent_process_id) - || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) - || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { - DeathTestAbort(String::Format( - "Bad --gtest_internal_run_death_test flag: %s", - GTEST_FLAG(internal_run_death_test).c_str())); - } - write_fd = GetStatusFileDescriptor(parent_process_id, - write_handle_as_size_t, - event_handle_as_size_t); -# else - - if (fields.size() != 4 - || !ParseNaturalNumber(fields[1], &line) - || !ParseNaturalNumber(fields[2], &index) - || !ParseNaturalNumber(fields[3], &write_fd)) { - DeathTestAbort(String::Format( - "Bad --gtest_internal_run_death_test flag: %s", - GTEST_FLAG(internal_run_death_test).c_str())); - } - -# endif // GTEST_OS_WINDOWS - - return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); -} - -} // namespace internal - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) - - -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include -#elif GTEST_OS_WINDOWS -# include -# include -#elif GTEST_OS_SYMBIAN || GTEST_OS_NACL -// Symbian OpenC and NaCl have PATH_MAX in sys/syslimits.h -# include -#else -# include -# include // Some Linux distributions define PATH_MAX here. 
-#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_MAX_ _MAX_PATH -#elif defined(PATH_MAX) -# define GTEST_PATH_MAX_ PATH_MAX -#elif defined(_XOPEN_PATH_MAX) -# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX -#else -# define GTEST_PATH_MAX_ _POSIX_PATH_MAX -#endif // GTEST_OS_WINDOWS - - -namespace testing { -namespace internal { - -#if GTEST_OS_WINDOWS -// On Windows, '\\' is the standard path separator, but many tools and the -// Windows API also accept '/' as an alternate path separator. Unless otherwise -// noted, a file path can contain either kind of path separators, or a mixture -// of them. -const char kPathSeparator = '\\'; -const char kAlternatePathSeparator = '/'; -const char kPathSeparatorString[] = "\\"; -const char kAlternatePathSeparatorString[] = "/"; -# if GTEST_OS_WINDOWS_MOBILE -// Windows CE doesn't have a current directory. You should not use -// the current directory in tests on Windows CE, but this at least -// provides a reasonable fallback. -const char kCurrentDirectoryString[] = "\\"; -// Windows CE doesn't define INVALID_FILE_ATTRIBUTES -const DWORD kInvalidFileAttributes = 0xffffffff; -# else -const char kCurrentDirectoryString[] = ".\\"; -# endif // GTEST_OS_WINDOWS_MOBILE -#else -const char kPathSeparator = '/'; -const char kPathSeparatorString[] = "/"; -const char kCurrentDirectoryString[] = "./"; -#endif // GTEST_OS_WINDOWS - -// Returns whether the given character is a valid path separator. -static bool IsPathSeparator(char c) { -#if GTEST_HAS_ALT_PATH_SEP_ - return (c == kPathSeparator) || (c == kAlternatePathSeparator); -#else - return c == kPathSeparator; -#endif -} - -// Returns the current working directory, or "" if unsuccessful. -FilePath FilePath::GetCurrentDir() { -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE doesn't have a current directory, so we just return - // something reasonable. - return FilePath(kCurrentDirectoryString); -#elif GTEST_OS_WINDOWS - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#else - char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; - return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns a copy of the FilePath with the case-insensitive extension removed. -// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns -// FilePath("dir/file"). If a case-insensitive extension is not -// found, returns a copy of the original FilePath. -FilePath FilePath::RemoveExtension(const char* extension) const { - String dot_extension(String::Format(".%s", extension)); - if (pathname_.EndsWithCaseInsensitive(dot_extension.c_str())) { - return FilePath(String(pathname_.c_str(), pathname_.length() - 4)); - } - return *this; -} - -// Returns a pointer to the last occurence of a valid path separator in -// the FilePath. On Windows, for example, both '/' and '\' are valid path -// separators. Returns NULL if no path separator was found. -const char* FilePath::FindLastPathSeparator() const { - const char* const last_sep = strrchr(c_str(), kPathSeparator); -#if GTEST_HAS_ALT_PATH_SEP_ - const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); - // Comparing two pointers of which only one is NULL is undefined. - if (last_alt_sep != NULL && - (last_sep == NULL || last_alt_sep > last_sep)) { - return last_alt_sep; - } -#endif - return last_sep; -} - -// Returns a copy of the FilePath with the directory part removed. 
-// Example: FilePath("path/to/file").RemoveDirectoryName() returns -// FilePath("file"). If there is no directory part ("just_a_file"), it returns -// the FilePath unmodified. If there is no file part ("just_a_dir/") it -// returns an empty FilePath (""). -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveDirectoryName() const { - const char* const last_sep = FindLastPathSeparator(); - return last_sep ? FilePath(String(last_sep + 1)) : *this; -} - -// RemoveFileName returns the directory path with the filename removed. -// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". -// If the FilePath is "a_file" or "/a_file", RemoveFileName returns -// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does -// not have a file, like "just/a/dir/", it returns the FilePath unmodified. -// On Windows platform, '\' is the path separator, otherwise it is '/'. -FilePath FilePath::RemoveFileName() const { - const char* const last_sep = FindLastPathSeparator(); - String dir; - if (last_sep) { - dir = String(c_str(), last_sep + 1 - c_str()); - } else { - dir = kCurrentDirectoryString; - } - return FilePath(dir); -} - -// Helper functions for naming files in a directory for xml output. - -// Given directory = "dir", base_name = "test", number = 0, -// extension = "xml", returns "dir/test.xml". If number is greater -// than zero (e.g., 12), returns "dir/test_12.xml". -// On Windows platform, uses \ as the separator rather than /. -FilePath FilePath::MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension) { - String file; - if (number == 0) { - file = String::Format("%s.%s", base_name.c_str(), extension); - } else { - file = String::Format("%s_%d.%s", base_name.c_str(), number, extension); - } - return ConcatPaths(directory, FilePath(file)); -} - -// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". -// On Windows, uses \ as the separator rather than /. -FilePath FilePath::ConcatPaths(const FilePath& directory, - const FilePath& relative_path) { - if (directory.IsEmpty()) - return relative_path; - const FilePath dir(directory.RemoveTrailingPathSeparator()); - return FilePath(String::Format("%s%c%s", dir.c_str(), kPathSeparator, - relative_path.c_str())); -} - -// Returns true if pathname describes something findable in the file-system, -// either a file, directory, or whatever. -bool FilePath::FileOrDirectoryExists() const { -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - return attributes != kInvalidFileAttributes; -#else - posix::StatStruct file_stat; - return posix::Stat(pathname_.c_str(), &file_stat) == 0; -#endif // GTEST_OS_WINDOWS_MOBILE -} - -// Returns true if pathname describes a directory in the file-system -// that exists. -bool FilePath::DirectoryExists() const { - bool result = false; -#if GTEST_OS_WINDOWS - // Don't strip off trailing separator if path is a root directory on - // Windows (like "C:\\"). - const FilePath& path(IsRootDirectory() ? 
*this : - RemoveTrailingPathSeparator()); -#else - const FilePath& path(*this); -#endif - -#if GTEST_OS_WINDOWS_MOBILE - LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); - const DWORD attributes = GetFileAttributes(unicode); - delete [] unicode; - if ((attributes != kInvalidFileAttributes) && - (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - result = true; - } -#else - posix::StatStruct file_stat; - result = posix::Stat(path.c_str(), &file_stat) == 0 && - posix::IsDir(file_stat); -#endif // GTEST_OS_WINDOWS_MOBILE - - return result; -} - -// Returns true if pathname describes a root directory. (Windows has one -// root directory per disk drive.) -bool FilePath::IsRootDirectory() const { -#if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like - // \\server\share can be a root directory, although it cannot be the - // current directory. Handle this properly. - return pathname_.length() == 3 && IsAbsolutePath(); -#else - return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); -#endif -} - -// Returns true if pathname describes an absolute path. -bool FilePath::IsAbsolutePath() const { - const char* const name = pathname_.c_str(); -#if GTEST_OS_WINDOWS - return pathname_.length() >= 3 && - ((name[0] >= 'a' && name[0] <= 'z') || - (name[0] >= 'A' && name[0] <= 'Z')) && - name[1] == ':' && - IsPathSeparator(name[2]); -#else - return IsPathSeparator(name[0]); -#endif -} - -// Returns a pathname for a file that does not currently exist. The pathname -// will be directory/base_name.extension or -// directory/base_name_.extension if directory/base_name.extension -// already exists. The number will be incremented until a pathname is found -// that does not already exist. -// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. -// There could be a race condition if two or more processes are calling this -// function at the same time -- they could both pick the same filename. -FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension) { - FilePath full_pathname; - int number = 0; - do { - full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); - } while (full_pathname.FileOrDirectoryExists()); - return full_pathname; -} - -// Returns true if FilePath ends with a path separator, which indicates that -// it is intended to represent a directory. Returns false otherwise. -// This does NOT check that a directory (or file) actually exists. -bool FilePath::IsDirectory() const { - return !pathname_.empty() && - IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); -} - -// Create directories so that path exists. Returns true if successful or if -// the directories already exist; returns false if unable to create directories -// for any reason. -bool FilePath::CreateDirectoriesRecursively() const { - if (!this->IsDirectory()) { - return false; - } - - if (pathname_.length() == 0 || this->DirectoryExists()) { - return true; - } - - const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); - return parent.CreateDirectoriesRecursively() && this->CreateFolder(); -} - -// Create the directory so that path exists. Returns true if successful or -// if the directory already exists; returns false if unable to create the -// directory for any reason, including if the parent directory does not -// exist. Not named "CreateDirectory" because that's a macro on Windows. 
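-// For example, FilePath("out/logs").CreateFolder() fails when "out" does not
-// already exist, whereas FilePath("out/logs/").CreateDirectoriesRecursively()
-// (above) creates the missing parents first.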
-bool FilePath::CreateFolder() const { -#if GTEST_OS_WINDOWS_MOBILE - FilePath removed_sep(this->RemoveTrailingPathSeparator()); - LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); - int result = CreateDirectory(unicode, NULL) ? 0 : -1; - delete [] unicode; -#elif GTEST_OS_WINDOWS - int result = _mkdir(pathname_.c_str()); -#else - int result = mkdir(pathname_.c_str(), 0777); -#endif // GTEST_OS_WINDOWS_MOBILE - - if (result == -1) { - return this->DirectoryExists(); // An error is OK if the directory exists. - } - return true; // No error. -} - -// If input name has a trailing separator character, remove it and return the -// name, otherwise return the name string unmodified. -// On Windows platform, uses \ as the separator, other platforms use /. -FilePath FilePath::RemoveTrailingPathSeparator() const { - return IsDirectory() - ? FilePath(String(pathname_.c_str(), pathname_.length() - 1)) - : *this; -} - -// Removes any redundant separators that might be in the pathname. -// For example, "bar///foo" becomes "bar/foo". Does not eliminate other -// redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). -void FilePath::Normalize() { - if (pathname_.c_str() == NULL) { - pathname_ = ""; - return; - } - const char* src = pathname_.c_str(); - char* const dest = new char[pathname_.length() + 1]; - char* dest_ptr = dest; - memset(dest_ptr, 0, pathname_.length() + 1); - - while (*src != '\0') { - *dest_ptr = *src; - if (!IsPathSeparator(*src)) { - src++; - } else { -#if GTEST_HAS_ALT_PATH_SEP_ - if (*dest_ptr == kAlternatePathSeparator) { - *dest_ptr = kPathSeparator; - } -#endif - while (IsPathSeparator(*src)) - src++; - } - dest_ptr++; - } - *dest_ptr = '\0'; - pathname_ = dest; - delete[] dest; -} - -} // namespace internal -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-// -// Author: wan@google.com (Zhanyong Wan) - - -#include -#include -#include -#include - -#if GTEST_OS_WINDOWS_MOBILE -# include // For TerminateProcess() -#elif GTEST_OS_WINDOWS -# include -# include -#else -# include -#endif // GTEST_OS_WINDOWS_MOBILE - -#if GTEST_OS_MAC -# include -# include -# include -#endif // GTEST_OS_MAC - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { -namespace internal { - -#if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC and C++Builder do not provide a definition of STDERR_FILENO. -const int kStdOutFileno = 1; -const int kStdErrFileno = 2; -#else -const int kStdOutFileno = STDOUT_FILENO; -const int kStdErrFileno = STDERR_FILENO; -#endif // _MSC_VER - -#if GTEST_OS_MAC - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -size_t GetThreadCount() { - const task_t task = mach_task_self(); - mach_msg_type_number_t thread_count; - thread_act_array_t thread_list; - const kern_return_t status = task_threads(task, &thread_list, &thread_count); - if (status == KERN_SUCCESS) { - // task_threads allocates resources in thread_list and we need to free them - // to avoid leaks. - vm_deallocate(task, - reinterpret_cast(thread_list), - sizeof(thread_t) * thread_count); - return static_cast(thread_count); - } else { - return 0; - } -} - -#else - -size_t GetThreadCount() { - // There's no portable way to detect the number of threads, so we just - // return 0 to indicate that we cannot detect it. - return 0; -} - -#endif // GTEST_OS_MAC - -#if GTEST_USES_POSIX_RE - -// Implements RE. Currently only needed for death tests. - -RE::~RE() { - if (is_valid_) { - // regfree'ing an invalid regex might crash because the content - // of the regex is undefined. Since the regex's are essentially - // the same, one cannot be valid (or invalid) without the other - // being so too. - regfree(&partial_regex_); - regfree(&full_regex_); - } - free(const_cast(pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.full_regex_, str, 1, &match, 0) == 0; -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - if (!re.is_valid_) return false; - - regmatch_t match; - return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = posix::StrDup(regex); - - // Reserves enough bytes to hold the regular expression used for a - // full match. - const size_t full_regex_len = strlen(regex) + 10; - char* const full_pattern = new char[full_regex_len]; - - snprintf(full_pattern, full_regex_len, "^(%s)$", regex); - is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; - // We want to call regcomp(&partial_regex_, ...) even if the - // previous expression returns false. Otherwise partial_regex_ may - // not be properly initialized can may cause trouble when it's - // freed. - // - // Some implementation of POSIX regex (e.g. 
on at least some - // versions of Cygwin) doesn't accept the empty string as a valid - // regex. We change it to an equivalent form "()" to be safe. - if (is_valid_) { - const char* const partial_regex = (*regex == '\0') ? "()" : regex; - is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; - } - EXPECT_TRUE(is_valid_) - << "Regular expression \"" << regex - << "\" is not a valid POSIX Extended regular expression."; - - delete[] full_pattern; -} - -#elif GTEST_USES_SIMPLE_RE - -// Returns true iff ch appears anywhere in str (excluding the -// terminating '\0' character). -bool IsInSet(char ch, const char* str) { - return ch != '\0' && strchr(str, ch) != NULL; -} - -// Returns true iff ch belongs to the given classification. Unlike -// similar functions in , these aren't affected by the -// current locale. -bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } -bool IsAsciiPunct(char ch) { - return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); -} -bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } -bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } -bool IsAsciiWordChar(char ch) { - return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || - ('0' <= ch && ch <= '9') || ch == '_'; -} - -// Returns true iff "\\c" is a supported escape sequence. -bool IsValidEscape(char c) { - return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); -} - -// Returns true iff the given atom (specified by escaped and pattern) -// matches ch. The result is undefined if the atom is invalid. -bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { - if (escaped) { // "\\p" where p is pattern_char. - switch (pattern_char) { - case 'd': return IsAsciiDigit(ch); - case 'D': return !IsAsciiDigit(ch); - case 'f': return ch == '\f'; - case 'n': return ch == '\n'; - case 'r': return ch == '\r'; - case 's': return IsAsciiWhiteSpace(ch); - case 'S': return !IsAsciiWhiteSpace(ch); - case 't': return ch == '\t'; - case 'v': return ch == '\v'; - case 'w': return IsAsciiWordChar(ch); - case 'W': return !IsAsciiWordChar(ch); - } - return IsAsciiPunct(pattern_char) && pattern_char == ch; - } - - return (pattern_char == '.' && ch != '\n') || pattern_char == ch; -} - -// Helper function used by ValidateRegex() to format error messages. -String FormatRegexSyntaxError(const char* regex, int index) { - return (Message() << "Syntax error at index " << index - << " in simple regular expression \"" << regex << "\": ").GetString(); -} - -// Generates non-fatal failures and returns false if regex is invalid; -// otherwise returns true. -bool ValidateRegex(const char* regex) { - if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the - // assertion failures to match where the regex is used in user - // code. - ADD_FAILURE() << "NULL is not a valid simple regular expression."; - return false; - } - - bool is_valid = true; - - // True iff ?, *, or + can follow the previous atom. - bool prev_repeatable = false; - for (int i = 0; regex[i]; i++) { - if (regex[i] == '\\') { // An escape sequence - i++; - if (regex[i] == '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "'\\' cannot appear at the end."; - return false; - } - - if (!IsValidEscape(regex[i])) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) - << "invalid escape sequence \"\\" << regex[i] << "\"."; - is_valid = false; - } - prev_repeatable = true; - } else { // Not an escape sequence. 
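-      // A plain character: check that '^', '$', repeats and unsupported
-      // metacharacters appear only where the simple-regex grammar allows.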
- const char ch = regex[i]; - - if (ch == '^' && i > 0) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'^' can only appear at the beginning."; - is_valid = false; - } else if (ch == '$' && regex[i + 1] != '\0') { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'$' can only appear at the end."; - is_valid = false; - } else if (IsInSet(ch, "()[]{}|")) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' is unsupported."; - is_valid = false; - } else if (IsRepeat(ch) && !prev_repeatable) { - ADD_FAILURE() << FormatRegexSyntaxError(regex, i) - << "'" << ch << "' can only follow a repeatable token."; - is_valid = false; - } - - prev_repeatable = !IsInSet(ch, "^$?*+"); - } - } - - return is_valid; -} - -// Matches a repeated regex atom followed by a valid simple regular -// expression. The regex atom is defined as c if escaped is false, -// or \c otherwise. repeat is the repetition meta character (?, *, -// or +). The behavior is undefined if str contains too many -// characters to be indexable by size_t, in which case the test will -// probably time out anyway. We are fine with this limitation as -// std::string has it too. -bool MatchRepetitionAndRegexAtHead( - bool escaped, char c, char repeat, const char* regex, - const char* str) { - const size_t min_count = (repeat == '+') ? 1 : 0; - const size_t max_count = (repeat == '?') ? 1 : - static_cast(-1) - 1; - // We cannot call numeric_limits::max() as it conflicts with the - // max() macro on Windows. - - for (size_t i = 0; i <= max_count; ++i) { - // We know that the atom matches each of the first i characters in str. - if (i >= min_count && MatchRegexAtHead(regex, str + i)) { - // We have enough matches at the head, and the tail matches too. - // Since we only care about *whether* the pattern matches str - // (as opposed to *how* it matches), there is no need to find a - // greedy match. - return true; - } - if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) - return false; - } - return false; -} - -// Returns true iff regex matches a prefix of str. regex must be a -// valid simple regular expression and not start with "^", or the -// result is undefined. -bool MatchRegexAtHead(const char* regex, const char* str) { - if (*regex == '\0') // An empty regex matches a prefix of anything. - return true; - - // "$" only matches the end of a string. Note that regex being - // valid guarantees that there's nothing after "$" in it. - if (*regex == '$') - return *str == '\0'; - - // Is the first thing in regex an escape sequence? - const bool escaped = *regex == '\\'; - if (escaped) - ++regex; - if (IsRepeat(regex[1])) { - // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so - // here's an indirect recursion. It terminates as the regex gets - // shorter in each recursion. - return MatchRepetitionAndRegexAtHead( - escaped, regex[0], regex[1], regex + 2, str); - } else { - // regex isn't empty, isn't "$", and doesn't start with a - // repetition. We match the first atom of regex with the first - // character of str and recurse. - return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && - MatchRegexAtHead(regex + 1, str + 1); - } -} - -// Returns true iff regex matches any substring of str. regex must be -// a valid simple regular expression, or the result is undefined. -// -// The algorithm is recursive, but the recursion depth doesn't exceed -// the regex length, so we won't need to worry about running out of -// stack space normally. 
In rare cases the time complexity can be -// exponential with respect to the regex length + the string length, -// but usually it's must faster (often close to linear). -bool MatchRegexAnywhere(const char* regex, const char* str) { - if (regex == NULL || str == NULL) - return false; - - if (*regex == '^') - return MatchRegexAtHead(regex + 1, str); - - // A successful match can be anywhere in str. - do { - if (MatchRegexAtHead(regex, str)) - return true; - } while (*str++ != '\0'); - return false; -} - -// Implements the RE class. - -RE::~RE() { - free(const_cast(pattern_)); - free(const_cast(full_pattern_)); -} - -// Returns true iff regular expression re matches the entire str. -bool RE::FullMatch(const char* str, const RE& re) { - return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); -} - -// Returns true iff regular expression re matches a substring of str -// (including str itself). -bool RE::PartialMatch(const char* str, const RE& re) { - return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); -} - -// Initializes an RE from its string representation. -void RE::Init(const char* regex) { - pattern_ = full_pattern_ = NULL; - if (regex != NULL) { - pattern_ = posix::StrDup(regex); - } - - is_valid_ = ValidateRegex(regex); - if (!is_valid_) { - // No need to calculate the full pattern when the regex is invalid. - return; - } - - const size_t len = strlen(regex); - // Reserves enough bytes to hold the regular expression used for a - // full match: we need space to prepend a '^', append a '$', and - // terminate the string with '\0'. - char* buffer = static_cast(malloc(len + 3)); - full_pattern_ = buffer; - - if (*regex != '^') - *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. - - // We don't use snprintf or strncpy, as they trigger a warning when - // compiled with VC++ 8.0. - memcpy(buffer, regex, len); - buffer += len; - - if (len == 0 || regex[len - 1] != '$') - *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. - - *buffer = '\0'; -} - -#endif // GTEST_USES_POSIX_RE - -const char kUnknownFile[] = "unknown file"; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { - const char* const file_name = file == NULL ? kUnknownFile : file; - - if (line < 0) { - return String::Format("%s:", file_name).c_str(); - } -#ifdef _MSC_VER - return String::Format("%s(%d):", file_name, line).c_str(); -#else - return String::Format("%s:%d:", file_name, line).c_str(); -#endif // _MSC_VER -} - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -// Note that FormatCompilerIndependentFileLocation() does NOT append colon -// to the file location it produces, unlike FormatFileLocation(). -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( - const char* file, int line) { - const char* const file_name = file == NULL ? kUnknownFile : file; - - if (line < 0) - return file_name; - else - return String::Format("%s:%d", file_name, line).c_str(); -} - - -GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) - : severity_(severity) { - const char* const marker = - severity == GTEST_INFO ? "[ INFO ]" : - severity == GTEST_WARNING ? "[WARNING]" : - severity == GTEST_ERROR ? 
"[ ERROR ]" : "[ FATAL ]"; - GetStream() << ::std::endl << marker << " " - << FormatFileLocation(file, line).c_str() << ": "; -} - -// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. -GTestLog::~GTestLog() { - GetStream() << ::std::endl; - if (severity_ == GTEST_FATAL) { - fflush(stderr); - posix::Abort(); - } -} -// Disable Microsoft deprecation warnings for POSIX functions called from -// this class (creat, dup, dup2, and close) -#ifdef _MSC_VER -# pragma warning(push) -# pragma warning(disable: 4996) -#endif // _MSC_VER - -#if GTEST_HAS_STREAM_REDIRECTION - -// Object that captures an output stream (stdout/stderr). -class CapturedStream { - public: - // The ctor redirects the stream to a temporary file. - CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { - -# if GTEST_OS_WINDOWS - char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT - char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT - - ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); - const UINT success = ::GetTempFileNameA(temp_dir_path, - "gtest_redir", - 0, // Generate unique file name. - temp_file_path); - GTEST_CHECK_(success != 0) - << "Unable to create a temporary file in " << temp_dir_path; - const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); - GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " - << temp_file_path; - filename_ = temp_file_path; -# else - // There's no guarantee that a test has write access to the - // current directory, so we create the temporary file in the /tmp - // directory instead. - char name_template[] = "/tmp/captured_stream.XXXXXX"; - const int captured_fd = mkstemp(name_template); - filename_ = name_template; -# endif // GTEST_OS_WINDOWS - fflush(NULL); - dup2(captured_fd, fd_); - close(captured_fd); - } - - ~CapturedStream() { - remove(filename_.c_str()); - } - - String GetCapturedString() { - if (uncaptured_fd_ != -1) { - // Restores the original stream. - fflush(NULL); - dup2(uncaptured_fd_, fd_); - close(uncaptured_fd_); - uncaptured_fd_ = -1; - } - - FILE* const file = posix::FOpen(filename_.c_str(), "r"); - const String content = ReadEntireFile(file); - posix::FClose(file); - return content; - } - - private: - // Reads the entire content of a file as a String. - static String ReadEntireFile(FILE* file); - - // Returns the size (in bytes) of a file. - static size_t GetFileSize(FILE* file); - - const int fd_; // A stream to capture. - int uncaptured_fd_; - // Name of the temporary file holding the stderr output. - ::std::string filename_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); -}; - -// Returns the size (in bytes) of a file. -size_t CapturedStream::GetFileSize(FILE* file) { - fseek(file, 0, SEEK_END); - return static_cast(ftell(file)); -} - -// Reads the entire content of a file as a string. -String CapturedStream::ReadEntireFile(FILE* file) { - const size_t file_size = GetFileSize(file); - char* const buffer = new char[file_size]; - - size_t bytes_last_read = 0; // # of bytes read in the last fread() - size_t bytes_read = 0; // # of bytes read so far - - fseek(file, 0, SEEK_SET); - - // Keeps reading the file until we cannot read further or the - // pre-determined file size is reached. 
- do { - bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); - bytes_read += bytes_last_read; - } while (bytes_last_read > 0 && bytes_read < file_size); - - const String content(buffer, bytes_read); - delete[] buffer; - - return content; -} - -# ifdef _MSC_VER -# pragma warning(pop) -# endif // _MSC_VER - -static CapturedStream* g_captured_stderr = NULL; -static CapturedStream* g_captured_stdout = NULL; - -// Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { - if (*stream != NULL) { - GTEST_LOG_(FATAL) << "Only one " << stream_name - << " capturer can exist at a time."; - } - *stream = new CapturedStream(fd); -} - -// Stops capturing the output stream and returns the captured string. -String GetCapturedStream(CapturedStream** captured_stream) { - const String content = (*captured_stream)->GetCapturedString(); - - delete *captured_stream; - *captured_stream = NULL; - - return content; -} - -// Starts capturing stdout. -void CaptureStdout() { - CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); -} - -// Starts capturing stderr. -void CaptureStderr() { - CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); -} - -// Stops capturing stdout and returns the captured string. -String GetCapturedStdout() { return GetCapturedStream(&g_captured_stdout); } - -// Stops capturing stderr and returns the captured string. -String GetCapturedStderr() { return GetCapturedStream(&g_captured_stderr); } - -#endif // GTEST_HAS_STREAM_REDIRECTION - -#if GTEST_HAS_DEATH_TEST - -// A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector g_argvs; - -// Returns the command line as a vector of strings. -const ::std::vector& GetArgvs() { return g_argvs; } - -#endif // GTEST_HAS_DEATH_TEST - -#if GTEST_OS_WINDOWS_MOBILE -namespace posix { -void Abort() { - DebugBreak(); - TerminateProcess(GetCurrentProcess(), 1); -} -} // namespace posix -#endif // GTEST_OS_WINDOWS_MOBILE - -// Returns the name of the environment variable corresponding to the -// given flag. For example, FlagToEnvVar("foo") will return -// "GTEST_FOO" in the open-source version. -static String FlagToEnvVar(const char* flag) { - const String full_flag = - (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); - - Message env_var; - for (size_t i = 0; i != full_flag.length(); i++) { - env_var << ToUpper(full_flag.c_str()[i]); - } - - return env_var.GetString(); -} - -// Parses 'str' for a 32-bit signed integer. If successful, writes -// the result to *value and returns true; otherwise leaves *value -// unchanged and returns false. -bool ParseInt32(const Message& src_text, const char* str, Int32* value) { - // Parses the environment variable as a decimal integer. - char* end = NULL; - const long long_value = strtol(str, &end, 10); // NOLINT - - // Has strtol() consumed all characters in the string? - if (*end != '\0') { - // No - an invalid character was encountered. - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value \"" << str << "\".\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - // Is the parsed value in the range of an Int32? - const Int32 result = static_cast(long_value); - if (long_value == LONG_MAX || long_value == LONG_MIN || - // The parsed value overflows as a long. (strtol() returns - // LONG_MAX or LONG_MIN when the input overflows.) 
- result != long_value - // The parsed value overflows as an Int32. - ) { - Message msg; - msg << "WARNING: " << src_text - << " is expected to be a 32-bit integer, but actually" - << " has value " << str << ", which overflows.\n"; - printf("%s", msg.GetString().c_str()); - fflush(stdout); - return false; - } - - *value = result; - return true; -} - -// Reads and returns the Boolean environment variable corresponding to -// the given flag; if it's not set, returns default_value. -// -// The value is considered true iff it's not "0". -bool BoolFromGTestEnv(const char* flag, bool default_value) { - const String env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - return string_value == NULL ? - default_value : strcmp(string_value, "0") != 0; -} - -// Reads and returns a 32-bit integer stored in the environment -// variable corresponding to the given flag; if it isn't set or -// doesn't represent a valid 32-bit integer, returns default_value. -Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { - const String env_var = FlagToEnvVar(flag); - const char* const string_value = posix::GetEnv(env_var.c_str()); - if (string_value == NULL) { - // The environment variable is not set. - return default_value; - } - - Int32 result = default_value; - if (!ParseInt32(Message() << "Environment variable " << env_var, - string_value, &result)) { - printf("The default value %s is used.\n", - (Message() << default_value).GetString().c_str()); - fflush(stdout); - return default_value; - } - - return result; -} - -// Reads and returns the string environment variable corresponding to -// the given flag; if it's not set, returns default_value. -const char* StringFromGTestEnv(const char* flag, const char* default_value) { - const String env_var = FlagToEnvVar(flag); - const char* const value = posix::GetEnv(env_var.c_str()); - return value == NULL ? default_value : value; -} - -} // namespace internal -} // namespace testing -// Copyright 2007, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// It uses the << operator when possible, and prints the bytes in the -// object otherwise. A user can override its behavior for a class -// type Foo by defining either operator<<(::std::ostream&, const Foo&) -// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that -// defines Foo. - -#include -#include -#include // NOLINT -#include - -namespace testing { - -namespace { - -using ::std::ostream; - -#if GTEST_OS_WINDOWS_MOBILE // Windows CE does not define _snprintf_s. -# define snprintf _snprintf -#elif _MSC_VER >= 1400 // VC 8.0 and later deprecate snprintf and _snprintf. -# define snprintf _snprintf_s -#elif _MSC_VER -# define snprintf _snprintf -#endif // GTEST_OS_WINDOWS_MOBILE - -// Prints a segment of bytes in the given object. -void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, - size_t count, ostream* os) { - char text[5] = ""; - for (size_t i = 0; i != count; i++) { - const size_t j = start + i; - if (i != 0) { - // Organizes the bytes into groups of 2 for easy parsing by - // human. - if ((j % 2) == 0) - *os << ' '; - else - *os << '-'; - } - snprintf(text, sizeof(text), "%02X", obj_bytes[j]); - *os << text; - } -} - -// Prints the bytes in the given value to the given ostream. -void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, - ostream* os) { - // Tells the user how big the object is. - *os << count << "-byte object <"; - - const size_t kThreshold = 132; - const size_t kChunkSize = 64; - // If the object size is bigger than kThreshold, we'll have to omit - // some details by printing only the first and the last kChunkSize - // bytes. - // TODO(wan): let the user control the threshold using a flag. - if (count < kThreshold) { - PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); - } else { - PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); - *os << " ... "; - // Rounds up to 2-byte boundary. - const size_t resume_pos = (count - kChunkSize + 1)/2*2; - PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); - } - *os << ">"; -} - -} // namespace - -namespace internal2 { - -// Delegates to PrintBytesInObjectToImpl() to print the bytes in the -// given object. The delegation simplifies the implementation, which -// uses the << operator and thus is easier done outside of the -// ::testing::internal namespace, which contains a << operator that -// sometimes conflicts with the one in STL. 
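-// For a 4-byte object with bytes 0xDE 0xAD 0xBE 0xEF the output reads
-// "4-byte object <DE-AD BE-EF>"; objects of 132 bytes or more are elided in
-// the middle, keeping roughly the first and last 64 bytes.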
-void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, - ostream* os) { - PrintBytesInObjectToImpl(obj_bytes, count, os); -} - -} // namespace internal2 - -namespace internal { - -// Depending on the value of a char (or wchar_t), we print it in one -// of three formats: -// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or -// - as a special escape sequence (e.g. '\r', '\n'). -enum CharFormat { - kAsIs, - kHexEscape, - kSpecialEscape -}; - -// Returns true if c is a printable ASCII character. We test the -// value of c directly instead of calling isprint(), which is buggy on -// Windows Mobile. -inline bool IsPrintableAscii(wchar_t c) { - return 0x20 <= c && c <= 0x7E; -} - -// Prints a wide or narrow char c as a character literal without the -// quotes, escaping it when necessary; returns how c was formatted. -// The template argument UnsignedChar is the unsigned version of Char, -// which is the type of c. -template -static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { - switch (static_cast(c)) { - case L'\0': - *os << "\\0"; - break; - case L'\'': - *os << "\\'"; - break; - case L'\\': - *os << "\\\\"; - break; - case L'\a': - *os << "\\a"; - break; - case L'\b': - *os << "\\b"; - break; - case L'\f': - *os << "\\f"; - break; - case L'\n': - *os << "\\n"; - break; - case L'\r': - *os << "\\r"; - break; - case L'\t': - *os << "\\t"; - break; - case L'\v': - *os << "\\v"; - break; - default: - if (IsPrintableAscii(c)) { - *os << static_cast(c); - return kAsIs; - } else { - *os << String::Format("\\x%X", static_cast(c)); - return kHexEscape; - } - } - return kSpecialEscape; -} - -// Prints a char c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsWideStringLiteralTo(wchar_t c, ostream* os) { - switch (c) { - case L'\'': - *os << "'"; - return kAsIs; - case L'"': - *os << "\\\""; - return kSpecialEscape; - default: - return PrintAsCharLiteralTo(c, os); - } -} - -// Prints a char c as if it's part of a string literal, escaping it when -// necessary; returns how c was formatted. -static CharFormat PrintAsNarrowStringLiteralTo(char c, ostream* os) { - return PrintAsWideStringLiteralTo(static_cast(c), os); -} - -// Prints a wide or narrow character c and its code. '\0' is printed -// as "'\\0'", other unprintable characters are also properly escaped -// using the standard C++ escape sequence. The template argument -// UnsignedChar is the unsigned version of Char, which is the type of c. -template -void PrintCharAndCodeTo(Char c, ostream* os) { - // First, print c as a literal in the most readable form we can find. - *os << ((sizeof(c) > 1) ? "L'" : "'"); - const CharFormat format = PrintAsCharLiteralTo(c, os); - *os << "'"; - - // To aid user debugging, we also print c's code in decimal, unless - // it's 0 (in which case c was printed as '\\0', making the code - // obvious). - if (c == 0) - return; - *os << " (" << String::Format("%d", c).c_str(); - - // For more convenience, we print c's code again in hexidecimal, - // unless c was already printed in the form '\x##' or the code is in - // [1, 9]. - if (format == kHexEscape || (1 <= c && c <= 9)) { - // Do nothing. 
- } else { - *os << String::Format(", 0x%X", - static_cast(c)).c_str(); - } - *os << ")"; -} - -void PrintTo(unsigned char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} -void PrintTo(signed char c, ::std::ostream* os) { - PrintCharAndCodeTo(c, os); -} - -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its code. L'\0' is printed as "L'\\0'". -void PrintTo(wchar_t wc, ostream* os) { - PrintCharAndCodeTo(wc, os); -} - -// Prints the given array of characters to the ostream. -// The array starts at *begin, the length is len, it may include '\0' characters -// and may not be null-terminated. -static void PrintCharsAsStringTo(const char* begin, size_t len, ostream* os) { - *os << "\""; - bool is_previous_hex = false; - for (size_t index = 0; index < len; ++index) { - const char cur = begin[index]; - if (is_previous_hex && IsXDigit(cur)) { - // Previous character is of '\x..' form and this character can be - // interpreted as another hexadecimal digit in its number. Break string to - // disambiguate. - *os << "\" \""; - } - is_previous_hex = PrintAsNarrowStringLiteralTo(cur, os) == kHexEscape; - } - *os << "\""; -} - -// Prints a (const) char array of 'len' elements, starting at address 'begin'. -void UniversalPrintArray(const char* begin, size_t len, ostream* os) { - PrintCharsAsStringTo(begin, len, os); -} - -// Prints the given array of wide characters to the ostream. -// The array starts at *begin, the length is len, it may include L'\0' -// characters and may not be null-terminated. -static void PrintWideCharsAsStringTo(const wchar_t* begin, size_t len, - ostream* os) { - *os << "L\""; - bool is_previous_hex = false; - for (size_t index = 0; index < len; ++index) { - const wchar_t cur = begin[index]; - if (is_previous_hex && isascii(cur) && IsXDigit(static_cast(cur))) { - // Previous character is of '\x..' form and this character can be - // interpreted as another hexadecimal digit in its number. Break string to - // disambiguate. - *os << "\" L\""; - } - is_previous_hex = PrintAsWideStringLiteralTo(cur, os) == kHexEscape; - } - *os << "\""; -} - -// Prints the given C string to the ostream. -void PrintTo(const char* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, strlen(s), os); - } -} - -// MSVC compiler can be configured to define whar_t as a typedef -// of unsigned short. Defining an overload for const wchar_t* in that case -// would cause pointers to unsigned shorts be printed as wide strings, -// possibly accessing more memory than intended and causing invalid -// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when -// wchar_t is implemented as a native type. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Prints the given wide C string to the ostream. -void PrintTo(const wchar_t* s, ostream* os) { - if (s == NULL) { - *os << "NULL"; - } else { - *os << ImplicitCast_(s) << " pointing to "; - PrintWideCharsAsStringTo(s, wcslen(s), os); - } -} -#endif // wchar_t is native - -// Prints a ::string object. -#if GTEST_HAS_GLOBAL_STRING -void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); -} - -// Prints a ::wstring object. 
-#if GTEST_HAS_GLOBAL_WSTRING -void PrintWideStringTo(const ::wstring& s, ostream* os) { - PrintWideCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -void PrintWideStringTo(const ::std::wstring& s, ostream* os) { - PrintWideCharsAsStringTo(s.data(), s.size(), os); -} -#endif // GTEST_HAS_STD_WSTRING - -} // namespace internal - -} // namespace testing -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) - - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 -#undef GTEST_IMPLEMENTATION_ - -namespace testing { - -using internal::GetUnitTestImpl; - -// Gets the summary of the failure message by omitting the stack trace -// in it. -internal::String TestPartResult::ExtractSummary(const char* message) { - const char* const stack_trace = strstr(message, internal::kStackTraceMarker); - return stack_trace == NULL ? internal::String(message) : - internal::String(message, stack_trace - message); -} - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { - return os - << result.file_name() << ":" << result.line_number() << ": " - << (result.type() == TestPartResult::kSuccess ? "Success" : - result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : - "Non-fatal failure") << ":\n" - << result.message() << std::endl; -} - -// Appends a TestPartResult to the array. -void TestPartResultArray::Append(const TestPartResult& result) { - array_.push_back(result); -} - -// Returns the TestPartResult at the given index (0-based). 
-const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { - if (index < 0 || index >= size()) { - printf("\nInvalid index (%d) into TestPartResultArray.\n", index); - internal::posix::Abort(); - } - - return array_[index]; -} - -// Returns the number of TestPartResult objects in the array. -int TestPartResultArray::size() const { - return static_cast(array_.size()); -} - -namespace internal { - -HasNewFatalFailureHelper::HasNewFatalFailureHelper() - : has_new_fatal_failure_(false), - original_reporter_(GetUnitTestImpl()-> - GetTestPartResultReporterForCurrentThread()) { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); -} - -HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { - GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( - original_reporter_); -} - -void HasNewFatalFailureHelper::ReportTestPartResult( - const TestPartResult& result) { - if (result.fatally_failed()) - has_new_fatal_failure_ = true; - original_reporter_->ReportTestPartResult(result); -} - -} // namespace internal - -} // namespace testing -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - - -namespace testing { -namespace internal { - -#if GTEST_HAS_TYPED_TEST_P - -// Skips to the first non-space char in str. Returns an empty string if str -// contains only whitespace characters. -static const char* SkipSpaces(const char* str) { - while (IsSpace(*str)) - str++; - return str; -} - -// Verifies that registered_tests match the test names in -// defined_test_names_; returns registered_tests if successful, or -// aborts the program otherwise. -const char* TypedTestCasePState::VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests) { - typedef ::std::set::const_iterator DefinedTestIter; - registered_ = true; - - // Skip initial whitespace in registered_tests since some - // preprocessors prefix stringizied literals with whitespace. 
- registered_tests = SkipSpaces(registered_tests); - - Message errors; - ::std::set tests; - for (const char* names = registered_tests; names != NULL; - names = SkipComma(names)) { - const String name = GetPrefixUntilComma(names); - if (tests.count(name) != 0) { - errors << "Test " << name << " is listed more than once.\n"; - continue; - } - - bool found = false; - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (name == *it) { - found = true; - break; - } - } - - if (found) { - tests.insert(name); - } else { - errors << "No test named " << name - << " can be found in this test case.\n"; - } - } - - for (DefinedTestIter it = defined_test_names_.begin(); - it != defined_test_names_.end(); - ++it) { - if (tests.count(*it) == 0) { - errors << "You forgot to list test " << *it << ".\n"; - } - } - - const String& errors_str = errors.GetString(); - if (errors_str != "") { - fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), - errors_str.c_str()); - fflush(stderr); - posix::Abort(); - } - - return registered_tests; -} - -#endif // GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing diff --git a/kokkos/kokkos/TPL/gtest/gtest.h b/kokkos/kokkos/TPL/gtest/gtest.h deleted file mode 100644 index 5fc6f9e..0000000 --- a/kokkos/kokkos/TPL/gtest/gtest.h +++ /dev/null @@ -1,19537 +0,0 @@ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the public API for Google Test. It should be -// included by any test program that uses Google Test. -// -// IMPORTANT NOTE: Due to limitation of the C++ language, we have to -// leave some internal implementation details in this header file. -// They are clearly marked by comments like this: -// -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
-// -// Such code is NOT meant to be used by a user directly, and is subject -// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user -// program! -// -// Acknowledgment: Google Test borrowed the idea of automatic test -// registration from Barthelemy Dagenais' (barthelemy@prologique.com) -// easyUnit framework. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_H_ - -#include -#include - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares functions and macros used internally by -// Google Test. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan) -// -// Low-level types and utilities for porting Google Test to various -// platforms. They are subject to change without notice. DO NOT USE -// THEM IN USER CODE. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -// The user can define the following macros in the build script to -// control Google Test's behavior. If the user doesn't define a macro -// in this list, Google Test will define it. -// -// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) -// is/isn't available. -// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions -// are enabled. -// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). -// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). -// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular -// expressions are/aren't available. -// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that -// is/isn't available. -// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't -// enabled. -// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that -// std::wstring does/doesn't work (Google Test can -// be used where std::wstring is unavailable). -// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple -// is/isn't available. -// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the -// compiler supports Microsoft's "Structured -// Exception Handling". -// GTEST_HAS_STREAM_REDIRECTION -// - Define it to 1/0 to indicate whether the -// platform supports I/O stream redirection using -// dup() and dup2(). -// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google -// Test's own tr1 tuple implementation should be -// used. Unused when the user sets -// GTEST_HAS_TR1_TUPLE to 0. -// GTEST_LINKED_AS_SHARED_LIBRARY -// - Define to 1 when compiling tests that use -// Google Test as a shared library (known as -// DLL on Windows). -// GTEST_CREATE_SHARED_LIBRARY -// - Define to 1 when compiling Google Test itself -// as a shared library. 
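As an editorial illustration of the configuration knobs listed above (this sketch is not part of the deleted sources; the chosen values, the test name, and the include path are assumptions), a translation unit can pin the documented macros before gtest.h is pulled in, which is equivalent to passing -D flags on the compile line:

#define GTEST_HAS_PTHREAD 0      // assumption: build without pthreads support
#define GTEST_HAS_TR1_TUPLE 0    // assumption: disable tuple-dependent features

#include "gtest/gtest.h"         // include path assumed for the sketch

// gtest-port.h honors the overrides instead of auto-detecting the features.
TEST(PortConfigSketch, CompilesWithOverrides) {
  SUCCEED();
}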
- -// This header defines the following utilities: -// -// Macros indicating the current platform (defined to 1 if compiled on -// the given platform; otherwise undefined): -// GTEST_OS_AIX - IBM AIX -// GTEST_OS_CYGWIN - Cygwin -// GTEST_OS_HPUX - HP-UX -// GTEST_OS_LINUX - Linux -// GTEST_OS_LINUX_ANDROID - Google Android -// GTEST_OS_MAC - Mac OS X -// GTEST_OS_NACL - Google Native Client (NaCl) -// GTEST_OS_SOLARIS - Sun Solaris -// GTEST_OS_SYMBIAN - Symbian -// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) -// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop -// GTEST_OS_WINDOWS_MINGW - MinGW -// GTEST_OS_WINDOWS_MOBILE - Windows Mobile -// GTEST_OS_ZOS - z/OS -// -// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the -// most stable support. Since core members of the Google Test project -// don't have access to other platforms, support for them may be less -// stable. If you notice any problems on your platform, please notify -// googletestframework@googlegroups.com (patches for fixing them are -// even more welcome!). -// -// Note that it is possible that none of the GTEST_OS_* macros are defined. -// -// Macros indicating available Google Test features (defined to 1 if -// the corresponding feature is supported; otherwise undefined): -// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized -// tests) -// GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests -// GTEST_HAS_TYPED_TEST - typed tests -// GTEST_HAS_TYPED_TEST_P - type-parameterized tests -// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with -// GTEST_HAS_POSIX_RE (see above) which users can -// define themselves. -// GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. -// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). -// -// Macros for basic C++ coding: -// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. -// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a -// variable don't have to be used. -// GTEST_DISALLOW_ASSIGN_ - disables operator=. -// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. -// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. -// -// Synchronization: -// Mutex, MutexLock, ThreadLocal, GetThreadCount() -// - synchronization primitives. -// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above -// synchronization primitives have real implementations -// and Google Test is thread-safe; or 0 otherwise. -// -// Template meta programming: -// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. -// IteratorTraits - partial implementation of std::iterator_traits, which -// is not available in libCstd when compiled with Sun C++. -// -// Smart pointers: -// scoped_ptr - as in TR2. -// -// Regular expressions: -// RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// -// Logging: -// GTEST_LOG_() - logs messages at the specified severity level. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. -// -// Stdout and stderr capturing: -// CaptureStdout() - starts capturing stdout. -// GetCapturedStdout() - stops capturing stdout and returns the captured -// string. -// CaptureStderr() - starts capturing stderr. 
-// GetCapturedStderr() - stops capturing stderr and returns the captured -// string. -// -// Integer types: -// TypeWithSize - maps an integer to a int type. -// Int32, UInt32, Int64, UInt64, TimeInMillis -// - integers of known sizes. -// BiggestInt - the biggest signed integer type. -// -// Command-line utilities: -// GTEST_FLAG() - references a flag. -// GTEST_DECLARE_*() - declares a flag. -// GTEST_DEFINE_*() - defines a flag. -// GetArgvs() - returns the command line as a vector of strings. -// -// Environment variable utilities: -// GetEnv() - gets the value of an environment variable. -// BoolFromGTestEnv() - parses a bool environment variable. -// Int32FromGTestEnv() - parses an Int32 environment variable. -// StringFromGTestEnv() - parses a string environment variable. - -#include // for isspace, etc -#include // for ptrdiff_t -#include -#include -#include -#ifndef _WIN32_WCE -# include -# include -#endif // !_WIN32_WCE - -#include // NOLINT -#include // NOLINT -#include // NOLINT - -#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" -#define GTEST_FLAG_PREFIX_ "gtest_" -#define GTEST_FLAG_PREFIX_DASH_ "gtest-" -#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" -#define GTEST_NAME_ "Google Test" -#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" - -// Determines the version of gcc that is used to compile this. -#ifdef __GNUC__ -// 40302 means version 4.3.2. -# define GTEST_GCC_VER_ \ - (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) -#endif // __GNUC__ - -// Determines the platform on which Google Test is compiled. -#ifdef __CYGWIN__ -# define GTEST_OS_CYGWIN 1 -#elif defined __SYMBIAN32__ -# define GTEST_OS_SYMBIAN 1 -#elif defined _WIN32 -# define GTEST_OS_WINDOWS 1 -# ifdef _WIN32_WCE -# define GTEST_OS_WINDOWS_MOBILE 1 -# elif defined(__MINGW__) || defined(__MINGW32__) -# define GTEST_OS_WINDOWS_MINGW 1 -# else -# define GTEST_OS_WINDOWS_DESKTOP 1 -# endif // _WIN32_WCE -#elif defined __APPLE__ -# define GTEST_OS_MAC 1 -#elif defined __linux__ -# define GTEST_OS_LINUX 1 -# ifdef ANDROID -# define GTEST_OS_LINUX_ANDROID 1 -# endif // ANDROID -#elif defined __MVS__ -# define GTEST_OS_ZOS 1 -#elif defined(__sun) && defined(__SVR4) -# define GTEST_OS_SOLARIS 1 -#elif defined(_AIX) -# define GTEST_OS_AIX 1 -#elif defined(__hpux) -# define GTEST_OS_HPUX 1 -#elif defined __native_client__ -# define GTEST_OS_NACL 1 -#endif // __CYGWIN__ - -// Brings in definitions for functions used in the testing::internal::posix -// namespace (read, write, close, chdir, isatty, stat). We do not currently -// use them on Windows Mobile. -#if !GTEST_OS_WINDOWS -// This assumes that non-Windows OSes provide unistd.h. For OSes where this -// is not the case, we need to include headers that provide the functions -// mentioned above. -# include -# if !GTEST_OS_NACL -// TODO(vladl@google.com): Remove this condition when Native Client SDK adds -// strings.h (tracked in -// http://code.google.com/p/nativeclient/issues/detail?id=1175). -# include // Native Client doesn't provide strings.h. -# endif -#elif !GTEST_OS_WINDOWS_MOBILE -# include -# include -#endif - -// Defines this to true iff Google Test can use POSIX regular expressions. -#ifndef GTEST_HAS_POSIX_RE -# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) -#endif - -#if GTEST_HAS_POSIX_RE - -// On some platforms, needs someone to define size_t, and -// won't compile otherwise. We can #include it here as we already -// included , which is guaranteed to define size_t through -// . 
-# include // NOLINT - -# define GTEST_USES_POSIX_RE 1 - -#elif GTEST_OS_WINDOWS - -// is not available on Windows. Use our own simple regex -// implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#else - -// may not be available on this platform. Use our own -// simple regex implementation instead. -# define GTEST_USES_SIMPLE_RE 1 - -#endif // GTEST_HAS_POSIX_RE - -#ifndef GTEST_HAS_EXCEPTIONS -// The user didn't tell us whether exceptions are enabled, so we need -// to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS -// macro to enable exceptions, so we'll do the same. -// Assumes that exceptions are enabled by default. -# ifndef _HAS_EXCEPTIONS -# define _HAS_EXCEPTIONS 1 -# endif // _HAS_EXCEPTIONS -# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS -# elif defined(__GNUC__) && __EXCEPTIONS -// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__SUNPRO_CC) -// Sun Pro CC supports exceptions. However, there is no compile-time way of -// detecting whether they are enabled or not. Therefore, we assume that -// they are enabled unless the user tells us otherwise. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__IBMCPP__) && __EXCEPTIONS -// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. -# define GTEST_HAS_EXCEPTIONS 1 -# elif defined(__HP_aCC) -// Exception handling is in effect by default in HP aCC compiler. It has to -// be turned of by +noeh compiler option if desired. -# define GTEST_HAS_EXCEPTIONS 1 -# else -// For other compilers, we assume exceptions are disabled to be -// conservative. -# define GTEST_HAS_EXCEPTIONS 0 -# endif // defined(_MSC_VER) || defined(__BORLANDC__) -#endif // GTEST_HAS_EXCEPTIONS - -#if !defined(GTEST_HAS_STD_STRING) -// Even though we don't use this macro any longer, we keep it in case -// some clients still depend on it. -# define GTEST_HAS_STD_STRING 1 -#elif !GTEST_HAS_STD_STRING -// The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." -#endif // !defined(GTEST_HAS_STD_STRING) - -#ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - -# define GTEST_HAS_GLOBAL_STRING 0 - -#endif // GTEST_HAS_GLOBAL_STRING - -#ifndef GTEST_HAS_STD_WSTRING -// The user didn't tell us whether ::std::wstring is available, so we need -// to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring -// is available. - -// Cygwin 1.7 and below doesn't support ::std::wstring. -// Solaris' libc++ doesn't support it either. Android has -// no support for it at least as recent as Froyo (2.2). -# define GTEST_HAS_STD_WSTRING \ - (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) - -#endif // GTEST_HAS_STD_WSTRING - -#ifndef GTEST_HAS_GLOBAL_WSTRING -// The user didn't tell us whether ::wstring is available, so we need -// to figure it out. -# define GTEST_HAS_GLOBAL_WSTRING \ - (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) -#endif // GTEST_HAS_GLOBAL_WSTRING - -// Determines whether RTTI is available. -#ifndef GTEST_HAS_RTTI -// The user didn't tell us whether RTTI is enabled, so we need to -// figure it out. - -# ifdef _MSC_VER - -# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. 
-# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) - -# ifdef __GXX_RTTI -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif // __GXX_RTTI - -// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if -// both the typeid and dynamic_cast features are present. -# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) - -# ifdef __RTTI_ALL__ -# define GTEST_HAS_RTTI 1 -# else -# define GTEST_HAS_RTTI 0 -# endif - -# else - -// For all other compilers, we assume RTTI is enabled. -# define GTEST_HAS_RTTI 1 - -# endif // _MSC_VER - -#endif // GTEST_HAS_RTTI - -// It's this header's responsibility to #include when RTTI -// is enabled. -#if GTEST_HAS_RTTI -# include -#endif - -// Determines whether Google Test can use the pthreads library. -#ifndef GTEST_HAS_PTHREAD -// The user didn't tell us explicitly, so we assume pthreads support is -// available on Linux and Mac. -// -// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 -// to your compiler flags. -# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX) -#endif // GTEST_HAS_PTHREAD - -#if GTEST_HAS_PTHREAD -// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is -// true. -# include // NOLINT - -// For timespec and nanosleep, used below. -# include // NOLINT -#endif - -// Determines whether Google Test can use tr1/tuple. You can define -// this macro to 0 to prevent Google Test from using tuple (any -// feature depending on tuple with be disabled in this mode). -#ifndef GTEST_HAS_TR1_TUPLE -// The user didn't tell us not to do it, so we assume it's OK. -# define GTEST_HAS_TR1_TUPLE 1 -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether Google Test's own tr1 tuple implementation -// should be used. -#ifndef GTEST_USE_OWN_TR1_TUPLE -// The user didn't tell us, so we need to figure it out. - -// We use our own TR1 tuple if we aren't sure the user has an -// implementation of it already. At this time, GCC 4.0.0+ and MSVC -// 2010 are the only mainstream compilers that come with a TR1 tuple -// implementation. NVIDIA's CUDA NVCC compiler pretends to be GCC by -// defining __GNUC__ and friends, but cannot compile GCC's tuple -// implementation. MSVC 2008 (9.0) provides TR1 tuple in a 323 MB -// Feature Pack download, which we cannot assume the user has. -# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000)) \ - || _MSC_VER >= 1600 -# define GTEST_USE_OWN_TR1_TUPLE 0 -# else -# define GTEST_USE_OWN_TR1_TUPLE 1 -# endif - -#endif // GTEST_USE_OWN_TR1_TUPLE - -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tr1/tuple. -#if GTEST_HAS_TR1_TUPLE - -# if GTEST_USE_OWN_TR1_TUPLE -// This file was GENERATED by a script. DO NOT EDIT BY HAND!!! - -// Copyright 2009 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. 
-// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Implements a subset of TR1 tuple needed by Google Test and Google Mock. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ - -#include // For ::std::pair. - -// The compiler used in Symbian has a bug that prevents us from declaring the -// tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be -// private as public. -// Sun Studio versions < 12 also have the above bug. -#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: -#else -# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ - template friend class tuple; \ - private: -#endif - -// GTEST_n_TUPLE_(T) is the type of an n-tuple. -#define GTEST_0_TUPLE_(T) tuple<> -#define GTEST_1_TUPLE_(T) tuple -#define GTEST_2_TUPLE_(T) tuple -#define GTEST_3_TUPLE_(T) tuple -#define GTEST_4_TUPLE_(T) tuple -#define GTEST_5_TUPLE_(T) tuple -#define GTEST_6_TUPLE_(T) tuple -#define GTEST_7_TUPLE_(T) tuple -#define GTEST_8_TUPLE_(T) tuple -#define GTEST_9_TUPLE_(T) tuple -#define GTEST_10_TUPLE_(T) tuple - -// GTEST_n_TYPENAMES_(T) declares a list of n typenames. -#define GTEST_0_TYPENAMES_(T) -#define GTEST_1_TYPENAMES_(T) typename T##0 -#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 -#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 -#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3 -#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4 -#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5 -#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6 -#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 -#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8 -#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ - typename T##3, typename T##4, typename T##5, typename T##6, \ - typename T##7, typename T##8, typename T##9 - -// In theory, defining stuff in the ::std namespace is undefined -// behavior. 
We can do this as we are playing the role of a standard -// library vendor. -namespace std { -namespace tr1 { - -template -class tuple; - -// Anything in namespace gtest_internal is Google Test's INTERNAL -// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. -namespace gtest_internal { - -// ByRef::type is T if T is a reference; otherwise it's const T&. -template -struct ByRef { typedef const T& type; }; // NOLINT -template -struct ByRef { typedef T& type; }; // NOLINT - -// A handy wrapper for ByRef. -#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type - -// AddRef::type is T if T is a reference; otherwise it's T&. This -// is the same as tr1::add_reference::type. -template -struct AddRef { typedef T& type; }; // NOLINT -template -struct AddRef { typedef T& type; }; // NOLINT - -// A handy wrapper for AddRef. -#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type - -// A helper for implementing get(). -template class Get; - -// A helper for implementing tuple_element. kIndexValid is true -// iff k < the number of fields in tuple type T. -template -struct TupleElement; - -template -struct TupleElement { typedef T0 type; }; - -template -struct TupleElement { typedef T1 type; }; - -template -struct TupleElement { typedef T2 type; }; - -template -struct TupleElement { typedef T3 type; }; - -template -struct TupleElement { typedef T4 type; }; - -template -struct TupleElement { typedef T5 type; }; - -template -struct TupleElement { typedef T6 type; }; - -template -struct TupleElement { typedef T7 type; }; - -template -struct TupleElement { typedef T8 type; }; - -template -struct TupleElement { typedef T9 type; }; - -} // namespace gtest_internal - -template <> -class tuple<> { - public: - tuple() {} - tuple(const tuple& /* t */) {} - tuple& operator=(const tuple& /* t */) { return *this; } -}; - -template -class GTEST_1_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} - - tuple(const tuple& t) : f0_(t.f0_) {} - - template - tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_1_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { - f0_ = t.f0_; - return *this; - } - - T0 f0_; -}; - -template -class GTEST_2_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), - f1_(f1) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_) {} - - template - tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} - template - tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_2_TUPLE_(U)& t) { - return CopyFrom(t); - } - template - tuple& operator=(const ::std::pair& p) { - f0_ = p.first; - f1_ = p.second; - return *this; - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - return *this; - } - - T0 f0_; - T1 f1_; -}; - -template -class GTEST_3_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} - - 
tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - template - tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_3_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; -}; - -template -class GTEST_4_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} - - template - tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_4_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; -}; - -template -class GTEST_5_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, - GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_) {} - - template - tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_5_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; -}; - -template -class GTEST_6_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_) {} - - template - tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_6_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; -}; - -template -class GTEST_7_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, 
GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - template - tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_7_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; -}; - -template -class GTEST_8_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, - GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - template - tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_8_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; -}; - -template -class GTEST_9_TUPLE_(T) { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), - f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - template - tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_9_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; -}; - -template -class tuple { - public: - template friend class gtest_internal::Get; - - tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), - f9_() {} - - explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, - 
GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, - GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, - GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), - f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} - - tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), - f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} - - template - tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), - f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), - f9_(t.f9_) {} - - tuple& operator=(const tuple& t) { return CopyFrom(t); } - - template - tuple& operator=(const GTEST_10_TUPLE_(U)& t) { - return CopyFrom(t); - } - - GTEST_DECLARE_TUPLE_AS_FRIEND_ - - template - tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { - f0_ = t.f0_; - f1_ = t.f1_; - f2_ = t.f2_; - f3_ = t.f3_; - f4_ = t.f4_; - f5_ = t.f5_; - f6_ = t.f6_; - f7_ = t.f7_; - f8_ = t.f8_; - f9_ = t.f9_; - return *this; - } - - T0 f0_; - T1 f1_; - T2 f2_; - T3 f3_; - T4 f4_; - T5 f5_; - T6 f6_; - T7 f7_; - T8 f8_; - T9 f9_; -}; - -// 6.1.3.2 Tuple creation functions. - -// Known limitations: we don't support passing an -// std::tr1::reference_wrapper to make_tuple(). And we don't -// implement tie(). - -inline tuple<> make_tuple() { return tuple<>(); } - -template -inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { - return GTEST_1_TUPLE_(T)(f0); -} - -template -inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { - return GTEST_2_TUPLE_(T)(f0, f1); -} - -template -inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { - return GTEST_3_TUPLE_(T)(f0, f1, f2); -} - -template -inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3) { - return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); -} - -template -inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4) { - return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); -} - -template -inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5) { - return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); -} - -template -inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6) { - return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); -} - -template -inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { - return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); -} - -template -inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8) { - return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); -} - -template -inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, - const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, - const T8& f8, const T9& f9) { - return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); -} - -// 6.1.3.3 Tuple helper classes. 
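Before the helper classes below, a brief usage sketch of the tuple subset implemented above (not taken from the deleted sources; the function name and the values are invented, and get<>()/tuple_size are the helpers defined next):

#include <string>

void TupleUsageSketch() {
  // make_tuple() (defined above) deduces tuple<int, double, ::std::string>.
  ::std::tr1::tuple<int, double, ::std::string> t =
      ::std::tr1::make_tuple(42, 2.5, ::std::string("label"));

  const int i = ::std::tr1::get<0>(t);            // 42
  const double d = ::std::tr1::get<1>(t);         // 2.5
  const ::std::string s = ::std::tr1::get<2>(t);  // "label"

  // tuple_size<>::value is a compile-time constant: 3 for this type.
  const int n = ::std::tr1::tuple_size<
      ::std::tr1::tuple<int, double, ::std::string> >::value;

  (void)i; (void)d; (void)s; (void)n;  // silence unused-variable warnings
}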
- -template struct tuple_size; - -template -struct tuple_size { static const int value = 0; }; - -template -struct tuple_size { static const int value = 1; }; - -template -struct tuple_size { static const int value = 2; }; - -template -struct tuple_size { static const int value = 3; }; - -template -struct tuple_size { static const int value = 4; }; - -template -struct tuple_size { static const int value = 5; }; - -template -struct tuple_size { static const int value = 6; }; - -template -struct tuple_size { static const int value = 7; }; - -template -struct tuple_size { static const int value = 8; }; - -template -struct tuple_size { static const int value = 9; }; - -template -struct tuple_size { static const int value = 10; }; - -template -struct tuple_element { - typedef typename gtest_internal::TupleElement< - k < (tuple_size::value), k, Tuple>::type type; -}; - -#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type - -// 6.1.3.4 Element access. - -namespace gtest_internal { - -template <> -class Get<0> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - Field(Tuple& t) { return t.f0_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) - ConstField(const Tuple& t) { return t.f0_; } -}; - -template <> -class Get<1> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - Field(Tuple& t) { return t.f1_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) - ConstField(const Tuple& t) { return t.f1_; } -}; - -template <> -class Get<2> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - Field(Tuple& t) { return t.f2_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) - ConstField(const Tuple& t) { return t.f2_; } -}; - -template <> -class Get<3> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - Field(Tuple& t) { return t.f3_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) - ConstField(const Tuple& t) { return t.f3_; } -}; - -template <> -class Get<4> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - Field(Tuple& t) { return t.f4_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) - ConstField(const Tuple& t) { return t.f4_; } -}; - -template <> -class Get<5> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - Field(Tuple& t) { return t.f5_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) - ConstField(const Tuple& t) { return t.f5_; } -}; - -template <> -class Get<6> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - Field(Tuple& t) { return t.f6_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) - ConstField(const Tuple& t) { return t.f6_; } -}; - -template <> -class Get<7> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - Field(Tuple& t) { return t.f7_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) - ConstField(const Tuple& t) { return t.f7_; } -}; - -template <> -class Get<8> { - public: - template - static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - Field(Tuple& t) { return t.f8_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) - ConstField(const Tuple& t) { return t.f8_; } -}; - -template <> -class Get<9> { - public: - template - static 
GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - Field(Tuple& t) { return t.f9_; } // NOLINT - - template - static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) - ConstField(const Tuple& t) { return t.f9_; } -}; - -} // namespace gtest_internal - -template -GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::Field(t); -} - -template -GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) -get(const GTEST_10_TUPLE_(T)& t) { - return gtest_internal::Get::ConstField(t); -} - -// 6.1.3.5 Relational operators - -// We only implement == and !=, as we don't have a need for the rest yet. - -namespace gtest_internal { - -// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the -// first k fields of t1 equals the first k fields of t2. -// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if -// k1 != k2. -template -struct SameSizeTuplePrefixComparator; - -template <> -struct SameSizeTuplePrefixComparator<0, 0> { - template - static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { - return true; - } -}; - -template -struct SameSizeTuplePrefixComparator { - template - static bool Eq(const Tuple1& t1, const Tuple2& t2) { - return SameSizeTuplePrefixComparator::Eq(t1, t2) && - ::std::tr1::get(t1) == ::std::tr1::get(t2); - } -}; - -} // namespace gtest_internal - -template -inline bool operator==(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { - return gtest_internal::SameSizeTuplePrefixComparator< - tuple_size::value, - tuple_size::value>::Eq(t, u); -} - -template -inline bool operator!=(const GTEST_10_TUPLE_(T)& t, - const GTEST_10_TUPLE_(U)& u) { return !(t == u); } - -// 6.1.4 Pairs. -// Unimplemented. - -} // namespace tr1 -} // namespace std - -#undef GTEST_0_TUPLE_ -#undef GTEST_1_TUPLE_ -#undef GTEST_2_TUPLE_ -#undef GTEST_3_TUPLE_ -#undef GTEST_4_TUPLE_ -#undef GTEST_5_TUPLE_ -#undef GTEST_6_TUPLE_ -#undef GTEST_7_TUPLE_ -#undef GTEST_8_TUPLE_ -#undef GTEST_9_TUPLE_ -#undef GTEST_10_TUPLE_ - -#undef GTEST_0_TYPENAMES_ -#undef GTEST_1_TYPENAMES_ -#undef GTEST_2_TYPENAMES_ -#undef GTEST_3_TYPENAMES_ -#undef GTEST_4_TYPENAMES_ -#undef GTEST_5_TYPENAMES_ -#undef GTEST_6_TYPENAMES_ -#undef GTEST_7_TYPENAMES_ -#undef GTEST_8_TYPENAMES_ -#undef GTEST_9_TYPENAMES_ -#undef GTEST_10_TYPENAMES_ - -#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ -#undef GTEST_BY_REF_ -#undef GTEST_ADD_REF_ -#undef GTEST_TUPLE_ELEMENT_ - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ -# elif GTEST_OS_SYMBIAN - -// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to -// use STLport's tuple implementation, which unfortunately doesn't -// work as the copy of STLport distributed with Symbian is incomplete. -// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to -// use its own tuple implementation. -# ifdef BOOST_HAS_TR1_TUPLE -# undef BOOST_HAS_TR1_TUPLE -# endif // BOOST_HAS_TR1_TUPLE - -// This prevents , which defines -// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . -# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED -# include - -# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) -// GCC 4.0+ implements tr1/tuple in the header. This does -// not conform to the TR1 spec, which requires the header to be . - -# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -// Until version 4.3.2, gcc has a bug that causes , -// which is #included by , to not compile when RTTI is -// disabled. _TR1_FUNCTIONAL is the header guard for -// . 
Hence the following #define is a hack to prevent -// from being included. -# define _TR1_FUNCTIONAL 1 -# include -# undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. -# else -# include // NOLINT -# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 - -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. -# include // NOLINT -# endif // GTEST_USE_OWN_TR1_TUPLE - -#endif // GTEST_HAS_TR1_TUPLE - -// Determines whether clone(2) is supported. -// Usually it will only be available on Linux, excluding -// Linux on the Itanium architecture. -// Also see http://linux.die.net/man/2/clone. -#ifndef GTEST_HAS_CLONE -// The user didn't tell us, so we need to figure it out. - -# if GTEST_OS_LINUX && !defined(__ia64__) -# define GTEST_HAS_CLONE 1 -# else -# define GTEST_HAS_CLONE 0 -# endif // GTEST_OS_LINUX && !defined(__ia64__) - -#endif // GTEST_HAS_CLONE - -// Determines whether to support stream redirection. This is used to test -// output correctness and to implement death tests. -#ifndef GTEST_HAS_STREAM_REDIRECTION -// By default, we assume that stream redirection is supported on all -// platforms except known mobile ones. -# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN -# define GTEST_HAS_STREAM_REDIRECTION 0 -# else -# define GTEST_HAS_STREAM_REDIRECTION 1 -# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN -#endif // GTEST_HAS_STREAM_REDIRECTION - -// Determines whether to support death tests. -// Google Test does not support death tests for VC 7.1 and earlier as -// abort() in a VC 7.1 application compiled as GUI in debug config -// pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ - GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX) -# define GTEST_HAS_DEATH_TEST 1 -# include // NOLINT -#endif - -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - -// Determines whether to support type-driven tests. - -// Typed tests need and variadic macros, which GCC, VC++ 8.0, -// Sun Pro CC, IBM Visual Age, and HP aCC support. -#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ - defined(__IBMCPP__) || defined(__HP_aCC) -# define GTEST_HAS_TYPED_TEST 1 -# define GTEST_HAS_TYPED_TEST_P 1 -#endif - -// Determines whether to support Combine(). This only makes sense when -// value-parameterized tests are enabled. The implementation doesn't -// work on Sun Studio since it doesn't understand templated conversion -// operators. -#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) -# define GTEST_HAS_COMBINE 1 -#endif - -// Determines whether the system compiler uses UTF-16 for encoding wide strings. -#define GTEST_WIDE_STRING_USES_UTF16_ \ - (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) - -// Determines whether test results can be streamed to a socket. -#if GTEST_OS_LINUX -# define GTEST_CAN_STREAM_RESULTS_ 1 -#endif - -// Defines some utility macros. - -// The GNU compiler emits a warning if nested "if" statements are followed by -// an "else" statement and braces are not used to explicitly disambiguate the -// "else" binding. 
This leads to problems with code like: -// -// if (gate) -// ASSERT_*(condition) << "Some message"; -// -// The "switch (0) case 0:" idiom is used to suppress this. -#ifdef __INTEL_COMPILER -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ -#else -# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT -#endif - -// Use this annotation at the end of a struct/class definition to -// prevent the compiler from optimizing away instances that are never -// used. This is useful when all interesting logic happens inside the -// c'tor and / or d'tor. Example: -// -// struct Foo { -// Foo() { ... } -// } GTEST_ATTRIBUTE_UNUSED_; -// -// Also use it after a variable or parameter declaration to tell the -// compiler the variable/parameter does not have to be used. -#if defined(__GNUC__) && !defined(COMPILER_ICC) -# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) -#else -# define GTEST_ATTRIBUTE_UNUSED_ -#endif - -// A macro to disallow operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) - -// A macro to disallow copy constructor and operator= -// This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ - GTEST_DISALLOW_ASSIGN_(type) - -// Tell the compiler to warn about unused return values for functions declared -// with this macro. The macro should be used on function declarations -// following the argument list: -// -// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; -#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) -# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) -#else -# define GTEST_MUST_USE_RESULT_ -#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC - -// Determine whether the compiler supports Microsoft's Structured Exception -// Handling. This is supported by several Windows compilers but generally -// does not exist on any other system. -#ifndef GTEST_HAS_SEH -// The user didn't tell us, so we need to figure it out. - -# if defined(_MSC_VER) || defined(__BORLANDC__) -// These two compilers are known to support SEH. -# define GTEST_HAS_SEH 1 -# else -// Assume no SEH. -# define GTEST_HAS_SEH 0 -# endif - -#endif // GTEST_HAS_SEH - -#ifdef _MSC_VER - -# if GTEST_LINKED_AS_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllimport) -# elif GTEST_CREATE_SHARED_LIBRARY -# define GTEST_API_ __declspec(dllexport) -# endif - -#endif // _MSC_VER - -#ifndef GTEST_API_ -# define GTEST_API_ -#endif - -#ifdef __GNUC__ -// Ask the compiler to never inline a given function. -# define GTEST_NO_INLINE_ __attribute__((noinline)) -#else -# define GTEST_NO_INLINE_ -#endif - -namespace testing { - -class Message; - -namespace internal { - -class String; - -// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time -// expression is true. For example, you could use it to verify the -// size of a static array: -// -// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, -// content_type_names_incorrect_size); -// -// or to make sure a struct is smaller than a certain size: -// -// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); -// -// The second argument to the macro is the name of the variable. If -// the expression is false, most compilers will issue a warning/error -// containing the name of the variable. 
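// A short illustration (hypothetical example, not part of the original
// header) of what a use of the macro expands to, given the definition
// just below:
//
//   GTEST_COMPILE_ASSERT_(sizeof(int) >= 2, int_is_at_least_two_bytes);
//
// becomes, roughly,
//
//   typedef ::testing::internal::CompileAssert<true>
//       int_is_at_least_two_bytes[1];
//
// whereas a false condition produces an array type of size -1, which the
// compiler rejects with a diagnostic that names the typedef.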
- -template -struct CompileAssert { -}; - -#define GTEST_COMPILE_ASSERT_(expr, msg) \ - typedef ::testing::internal::CompileAssert<(bool(expr))> \ - msg[bool(expr) ? 1 : -1] - -// Implementation details of GTEST_COMPILE_ASSERT_: -// -// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 -// elements (and thus is invalid) when the expression is false. -// -// - The simpler definition -// -// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] -// -// does not work, as gcc supports variable-length arrays whose sizes -// are determined at run-time (this is gcc's extension and not part -// of the C++ standard). As a result, gcc fails to reject the -// following code with the simple definition: -// -// int foo; -// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is -// // not a compile-time constant. -// -// - By using the type CompileAssert<(bool(expr))>, we ensures that -// expr is a compile-time constant. (Template arguments must be -// determined at compile-time.) -// -// - The outter parentheses in CompileAssert<(bool(expr))> are necessary -// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written -// -// CompileAssert -// -// instead, these compilers will refuse to compile -// -// GTEST_COMPILE_ASSERT_(5 > 0, some_message); -// -// (They seem to think the ">" in "5 > 0" marks the end of the -// template argument list.) -// -// - The array size is (bool(expr) ? 1 : -1), instead of simply -// -// ((expr) ? 1 : -1). -// -// This is to avoid running into a bug in MS VC 7.1, which -// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. - -// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. -// -// This template is declared, but intentionally undefined. -template -struct StaticAssertTypeEqHelper; - -template -struct StaticAssertTypeEqHelper {}; - -#if GTEST_HAS_GLOBAL_STRING -typedef ::string string; -#else -typedef ::std::string string; -#endif // GTEST_HAS_GLOBAL_STRING - -#if GTEST_HAS_GLOBAL_WSTRING -typedef ::wstring wstring; -#elif GTEST_HAS_STD_WSTRING -typedef ::std::wstring wstring; -#endif // GTEST_HAS_GLOBAL_WSTRING - -// A helper for suppressing warnings on constant condition. It just -// returns 'condition'. -GTEST_API_ bool IsTrue(bool condition); - -// Defines scoped_ptr. - -// This implementation of scoped_ptr is PARTIAL - it only contains -// enough stuff to satisfy Google Test's need. -template -class scoped_ptr { - public: - typedef T element_type; - - explicit scoped_ptr(T* p = NULL) : ptr_(p) {} - ~scoped_ptr() { reset(); } - - T& operator*() const { return *ptr_; } - T* operator->() const { return ptr_; } - T* get() const { return ptr_; } - - T* release() { - T* const ptr = ptr_; - ptr_ = NULL; - return ptr; - } - - void reset(T* p = NULL) { - if (p != ptr_) { - if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. - delete ptr_; - } - ptr_ = p; - } - } - private: - T* ptr_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); -}; - -// Defines RE. - -// A simple C++ wrapper for . It uses the POSIX Extended -// Regular Expression syntax. -class GTEST_API_ RE { - public: - // A copy constructor is required by the Standard to initialize object - // references from r-values. - RE(const RE& other) { Init(other.pattern()); } - - // Constructs an RE from a string. 
- RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT - -#if GTEST_HAS_GLOBAL_STRING - - RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT - -#endif // GTEST_HAS_GLOBAL_STRING - - RE(const char* regex) { Init(regex); } // NOLINT - ~RE(); - - // Returns the string representation of the regex. - const char* pattern() const { return pattern_; } - - // FullMatch(str, re) returns true iff regular expression re matches - // the entire str. - // PartialMatch(str, re) returns true iff regular expression re - // matches a substring of str (including str itself). - // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work - // when str contains NUL characters. - static bool FullMatch(const ::std::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::std::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#if GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const ::string& str, const RE& re) { - return FullMatch(str.c_str(), re); - } - static bool PartialMatch(const ::string& str, const RE& re) { - return PartialMatch(str.c_str(), re); - } - -#endif // GTEST_HAS_GLOBAL_STRING - - static bool FullMatch(const char* str, const RE& re); - static bool PartialMatch(const char* str, const RE& re); - - private: - void Init(const char* regex); - - // We use a const char* instead of a string, as Google Test may be used - // where string is not available. We also do not use Google Test's own - // String type here, in order to simplify dependencies between the - // files. - const char* pattern_; - bool is_valid_; - -#if GTEST_USES_POSIX_RE - - regex_t full_regex_; // For FullMatch(). - regex_t partial_regex_; // For PartialMatch(). - -#else // GTEST_USES_SIMPLE_RE - - const char* full_pattern_; // For FullMatch(); - -#endif - - GTEST_DISALLOW_ASSIGN_(RE); -}; - -// Formats a source file path and a line number as they would appear -// in an error message from the compiler used to compile this code. -GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); - -// Formats a file location for compiler-independent XML output. -// Although this function is not platform dependent, we put it next to -// FormatFileLocation in order to contrast the two functions. -GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, - int line); - -// Defines logging utilities: -// GTEST_LOG_(severity) - logs messages at the specified severity level. The -// message itself is streamed into the macro. -// LogToStderr() - directs all log messages to stderr. -// FlushInfoLog() - flushes informational log messages. - -enum GTestLogSeverity { - GTEST_INFO, - GTEST_WARNING, - GTEST_ERROR, - GTEST_FATAL -}; - -// Formats log entry severity, provides a stream object for streaming the -// log message, and terminates the message with a newline when going out of -// scope. -class GTEST_API_ GTestLog { - public: - GTestLog(GTestLogSeverity severity, const char* file, int line); - - // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. - ~GTestLog(); - - ::std::ostream& GetStream() { return ::std::cerr; } - - private: - const GTestLogSeverity severity_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); -}; - -#define GTEST_LOG_(severity) \ - ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ - __FILE__, __LINE__).GetStream() - -inline void LogToStderr() {} -inline void FlushInfoLog() { fflush(NULL); } - -// INTERNAL IMPLEMENTATION - DO NOT USE. 
-// -// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition -// is not satisfied. -// Synopsys: -// GTEST_CHECK_(boolean_condition); -// or -// GTEST_CHECK_(boolean_condition) << "Additional message"; -// -// This checks the condition and if the condition is not satisfied -// it prints message about the condition violation, including the -// condition itself, plus additional message streamed into it, if any, -// and then it aborts the program. It aborts the program irrespective of -// whether it is built in the debug mode or not. -#define GTEST_CHECK_(condition) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::IsTrue(condition)) \ - ; \ - else \ - GTEST_LOG_(FATAL) << "Condition " #condition " failed. " - -// An all-mode assert to verify that the given POSIX-style function -// call returns 0 (indicating success). Known limitation: this -// doesn't expand to a balanced 'if' statement, so enclose the macro -// in {} if you need to use it as the only statement in an 'if' -// branch. -#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ - if (const int gtest_error = (posix_call)) \ - GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ - << gtest_error - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Use ImplicitCast_ as a safe version of static_cast for upcasting in -// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a -// const Foo*). When you use ImplicitCast_, the compiler checks that -// the cast is safe. Such explicit ImplicitCast_s are necessary in -// surprisingly many situations where C++ demands an exact type match -// instead of an argument type convertable to a target type. -// -// The syntax for using ImplicitCast_ is the same as for static_cast: -// -// ImplicitCast_(expr) -// -// ImplicitCast_ would have been part of the C++ standard library, -// but the proposal was submitted too late. It will probably make -// its way into the language in the future. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., implicit_cast). The internal -// namespace alone is not enough because the function can be found by ADL. -template -inline To ImplicitCast_(To x) { return x; } - -// When you upcast (that is, cast a pointer from type Foo to type -// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts -// always succeed. When you downcast (that is, cast a pointer from -// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because -// how do you know the pointer is really of type SubclassOfFoo? It -// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, -// when you downcast, you should use this macro. In debug mode, we -// use dynamic_cast<> to double-check the downcast is legal (we die -// if it's not). In normal mode, we do the efficient static_cast<> -// instead. Thus, it's important to test in debug mode to make sure -// the cast is legal! -// This is the only place in the code we should use dynamic_cast<>. -// In particular, you SHOULDN'T be using dynamic_cast<> in order to -// do RTTI (eg code like this: -// if (dynamic_cast(foo)) HandleASubclass1Object(foo); -// if (dynamic_cast(foo)) HandleASubclass2Object(foo); -// You should design the code some other way not to need this. -// -// This relatively ugly name is intentional. It prevents clashes with -// similar functions users may have (e.g., down_cast). The internal -// namespace alone is not enough because the function can be found by ADL. 
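// A brief usage sketch (hypothetical class names, not part of Google Test):
//
//   class Animal { public: virtual ~Animal() {} };
//   class Dog : public Animal { public: void Bark() {} };
//
//   Dog dog;
//   // Upcasting is always safe, so ImplicitCast_ suffices and the compiler
//   // verifies the conversion:
//   Animal* animal = ::testing::internal::ImplicitCast_<Animal*>(&dog);
//   // Downcasting requires DownCast_: when RTTI is available it
//   // double-checks the cast with dynamic_cast and dies on a mismatch;
//   // otherwise it falls back to a plain static_cast:
//   ::testing::internal::DownCast_<Dog*>(animal)->Bark();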
-template // use like this: DownCast_(foo); -inline To DownCast_(From* f) { // so we only accept pointers - // Ensures that To is a sub-type of From *. This test is here only - // for compile-time type checking, and has no overhead in an - // optimized build at run-time, as it will be optimized away - // completely. - if (false) { - const To to = NULL; - ::testing::internal::ImplicitCast_(to); - } - -#if GTEST_HAS_RTTI - // RTTI: debug mode only! - GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); -#endif - return static_cast(f); -} - -// Downcasts the pointer of type Base to Derived. -// Derived must be a subclass of Base. The parameter MUST -// point to a class of type Derived, not any subclass of it. -// When RTTI is available, the function performs a runtime -// check to enforce this. -template -Derived* CheckedDowncastToActualType(Base* base) { -#if GTEST_HAS_RTTI - GTEST_CHECK_(typeid(*base) == typeid(Derived)); - return dynamic_cast(base); // NOLINT -#else - return static_cast(base); // Poor man's downcast. -#endif -} - -#if GTEST_HAS_STREAM_REDIRECTION - -// Defines the stderr capturer: -// CaptureStdout - starts capturing stdout. -// GetCapturedStdout - stops capturing stdout and returns the captured string. -// CaptureStderr - starts capturing stderr. -// GetCapturedStderr - stops capturing stderr and returns the captured string. -// -GTEST_API_ void CaptureStdout(); -GTEST_API_ String GetCapturedStdout(); -GTEST_API_ void CaptureStderr(); -GTEST_API_ String GetCapturedStderr(); - -#endif // GTEST_HAS_STREAM_REDIRECTION - - -#if GTEST_HAS_DEATH_TEST - -// A copy of all command line arguments. Set by InitGoogleTest(). -extern ::std::vector g_argvs; - -// GTEST_HAS_DEATH_TEST implies we have ::std::string. -const ::std::vector& GetArgvs(); - -#endif // GTEST_HAS_DEATH_TEST - -// Defines synchronization primitives. - -#if GTEST_HAS_PTHREAD - -// Sleeps for (roughly) n milli-seconds. This function is only for -// testing Google Test's own constructs. Don't use it in user tests, -// either directly or indirectly. -inline void SleepMilliseconds(int n) { - const timespec time = { - 0, // 0 seconds. - n * 1000L * 1000L, // And n ms. - }; - nanosleep(&time, NULL); -} - -// Allows a controller thread to pause execution of newly created -// threads until notified. Instances of this class must be created -// and destroyed in the controller thread. -// -// This class is only for testing Google Test's own constructs. Do not -// use it in user tests, either directly or indirectly. -class Notification { - public: - Notification() : notified_(false) {} - - // Notifies all threads created with this notification to start. Must - // be called from the controller thread. - void Notify() { notified_ = true; } - - // Blocks until the controller thread notifies. Must be called from a test - // thread. - void WaitForNotification() { - while(!notified_) { - SleepMilliseconds(10); - } - } - - private: - volatile bool notified_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); -}; - -// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. -// Consequently, it cannot select a correct instantiation of ThreadWithParam -// in order to call its Run(). Introducing ThreadWithParamBase as a -// non-templated base class for ThreadWithParam allows us to bypass this -// problem. -class ThreadWithParamBase { - public: - virtual ~ThreadWithParamBase() {} - virtual void Run() = 0; -}; - -// pthread_create() accepts a pointer to a function type with the C linkage. 
-// According to the Standard (7.5/1), function types with different linkages -// are different even if they are otherwise identical. Some compilers (for -// example, SunStudio) treat them as different types. Since class methods -// cannot be defined with C-linkage we need to define a free C-function to -// pass into pthread_create(). -extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { - static_cast(thread)->Run(); - return NULL; -} - -// Helper class for testing Google Test's multi-threading constructs. -// To use it, write: -// -// void ThreadFunc(int param) { /* Do things with param */ } -// Notification thread_can_start; -// ... -// // The thread_can_start parameter is optional; you can supply NULL. -// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); -// thread_can_start.Notify(); -// -// These classes are only for testing Google Test's own constructs. Do -// not use them in user tests, either directly or indirectly. -template -class ThreadWithParam : public ThreadWithParamBase { - public: - typedef void (*UserThreadFunc)(T); - - ThreadWithParam( - UserThreadFunc func, T param, Notification* thread_can_start) - : func_(func), - param_(param), - thread_can_start_(thread_can_start), - finished_(false) { - ThreadWithParamBase* const base = this; - // The thread can be created only after all fields except thread_ - // have been initialized. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); - } - ~ThreadWithParam() { Join(); } - - void Join() { - if (!finished_) { - GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); - finished_ = true; - } - } - - virtual void Run() { - if (thread_can_start_ != NULL) - thread_can_start_->WaitForNotification(); - func_(param_); - } - - private: - const UserThreadFunc func_; // User-supplied thread function. - const T param_; // User-supplied parameter to the thread function. - // When non-NULL, used to block execution until the controller thread - // notifies. - Notification* const thread_can_start_; - bool finished_; // true iff we know that the thread function has finished. - pthread_t thread_; // The native thread object. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); -}; - -// MutexBase and Mutex implement mutex on pthreads-based platforms. They -// are used in conjunction with class MutexLock: -// -// Mutex mutex; -// ... -// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end -// // of the current scope. -// -// MutexBase implements behavior for both statically and dynamically -// allocated mutexes. Do not use MutexBase directly. Instead, write -// the following to define a static mutex: -// -// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); -// -// You can forward declare a static mutex like this: -// -// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); -// -// To create a dynamic mutex, just define an object of type Mutex. -class MutexBase { - public: - // Acquires this mutex. - void Lock() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); - owner_ = pthread_self(); - } - - // Releases this mutex. - void Unlock() { - // We don't protect writing to owner_ here, as it's the caller's - // responsibility to ensure that the current thread holds the - // mutex when this is called. - owner_ = 0; - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); - } - - // Does nothing if the current thread holds the mutex. Otherwise, crashes - // with high probability. 
- void AssertHeld() const { - GTEST_CHECK_(owner_ == pthread_self()) - << "The current thread is not holding the mutex @" << this; - } - - // A static mutex may be used before main() is entered. It may even - // be used before the dynamic initialization stage. Therefore we - // must be able to initialize a static mutex object at link time. - // This means MutexBase has to be a POD and its member variables - // have to be public. - public: - pthread_mutex_t mutex_; // The underlying pthread mutex. - pthread_t owner_; // The thread holding the mutex; 0 means no one holds it. -}; - -// Forward-declares a static mutex. -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::MutexBase mutex - -// Defines and statically (i.e. at link time) initializes a static mutex. -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, 0 } - -// The Mutex class can only be used for mutexes created at runtime. It -// shares its API with MutexBase otherwise. -class Mutex : public MutexBase { - public: - Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); - owner_ = 0; - } - ~Mutex() { - GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); -}; - -// We cannot name this class MutexLock as the ctor declaration would -// conflict with a macro named MutexLock, which is defined on some -// platforms. Hence the typedef trick below. -class GTestMutexLock { - public: - explicit GTestMutexLock(MutexBase* mutex) - : mutex_(mutex) { mutex_->Lock(); } - - ~GTestMutexLock() { mutex_->Unlock(); } - - private: - MutexBase* const mutex_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); -}; - -typedef GTestMutexLock MutexLock; - -// Helpers for ThreadLocal. - -// pthread_key_create() requires DeleteThreadLocalValue() to have -// C-linkage. Therefore it cannot be templatized to access -// ThreadLocal. Hence the need for class -// ThreadLocalValueHolderBase. -class ThreadLocalValueHolderBase { - public: - virtual ~ThreadLocalValueHolderBase() {} -}; - -// Called by pthread to delete thread-local data stored by -// pthread_setspecific(). -extern "C" inline void DeleteThreadLocalValue(void* value_holder) { - delete static_cast(value_holder); -} - -// Implements thread-local storage on pthreads-based systems. -// -// // Thread 1 -// ThreadLocal tl(100); // 100 is the default value for each thread. -// -// // Thread 2 -// tl.set(150); // Changes the value for thread 2 only. -// EXPECT_EQ(150, tl.get()); -// -// // Thread 1 -// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. -// tl.set(200); -// EXPECT_EQ(200, tl.get()); -// -// The template type argument T must have a public copy constructor. -// In addition, the default ThreadLocal constructor requires T to have -// a public default constructor. -// -// An object managed for a thread by a ThreadLocal instance is deleted -// when the thread exits. Or, if the ThreadLocal instance dies in -// that thread, when the ThreadLocal dies. It's the user's -// responsibility to ensure that all other threads using a ThreadLocal -// have exited when it dies, or the per-thread objects for those -// threads will not be deleted. -// -// Google Test only uses global ThreadLocal objects. That means they -// will die after main() has returned. Therefore, no per-thread -// object managed by Google Test will be leaked as long as all threads -// using Google Test have exited when main() returns. 
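// The ThreadLocal class below is built on POSIX thread-specific storage.
// A rough standalone sketch of the underlying pattern (illustrative only,
// using a plain int payload instead of the type-safe ValueHolder below):
//
//   pthread_key_t key;
//   pthread_key_create(&key, free);  // 'free' runs at thread exit on each
//                                    // thread's non-NULL value.
//   if (pthread_getspecific(key) == NULL) {
//     // Lazily create this thread's slot on first access.
//     pthread_setspecific(key, calloc(1, sizeof(int)));
//   }
//   int* my_value = static_cast<int*>(pthread_getspecific(key));
//   ...
//   pthread_key_delete(key);  // Releases the key only; it does not free
//                             // the values still held by other threads.
//
// ThreadLocal wraps this pattern behind a type-safe interface and
// registers DeleteThreadLocalValue() as the destructor callback.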
-template -class ThreadLocal { - public: - ThreadLocal() : key_(CreateKey()), - default_() {} - explicit ThreadLocal(const T& value) : key_(CreateKey()), - default_(value) {} - - ~ThreadLocal() { - // Destroys the managed object for the current thread, if any. - DeleteThreadLocalValue(pthread_getspecific(key_)); - - // Releases resources associated with the key. This will *not* - // delete managed objects for other threads. - GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); - } - - T* pointer() { return GetOrCreateValue(); } - const T* pointer() const { return GetOrCreateValue(); } - const T& get() const { return *pointer(); } - void set(const T& value) { *pointer() = value; } - - private: - // Holds a value of type T. - class ValueHolder : public ThreadLocalValueHolderBase { - public: - explicit ValueHolder(const T& value) : value_(value) {} - - T* pointer() { return &value_; } - - private: - T value_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); - }; - - static pthread_key_t CreateKey() { - pthread_key_t key; - // When a thread exits, DeleteThreadLocalValue() will be called on - // the object managed for that thread. - GTEST_CHECK_POSIX_SUCCESS_( - pthread_key_create(&key, &DeleteThreadLocalValue)); - return key; - } - - T* GetOrCreateValue() const { - ThreadLocalValueHolderBase* const holder = - static_cast(pthread_getspecific(key_)); - if (holder != NULL) { - return CheckedDowncastToActualType(holder)->pointer(); - } - - ValueHolder* const new_holder = new ValueHolder(default_); - ThreadLocalValueHolderBase* const holder_base = new_holder; - GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); - return new_holder->pointer(); - } - - // A key pthreads uses for looking up per-thread values. - const pthread_key_t key_; - const T default_; // The default value for each thread. - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); -}; - -# define GTEST_IS_THREADSAFE 1 - -#else // GTEST_HAS_PTHREAD - -// A dummy implementation of synchronization primitives (mutex, lock, -// and thread-local variable). Necessary for compiling Google Test where -// mutex is not supported - using Google Test in multiple threads is not -// supported on such platforms. - -class Mutex { - public: - Mutex() {} - void AssertHeld() const {} -}; - -# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ - extern ::testing::internal::Mutex mutex - -# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex - -class GTestMutexLock { - public: - explicit GTestMutexLock(Mutex*) {} // NOLINT -}; - -typedef GTestMutexLock MutexLock; - -template -class ThreadLocal { - public: - ThreadLocal() : value_() {} - explicit ThreadLocal(const T& value) : value_(value) {} - T* pointer() { return &value_; } - const T* pointer() const { return &value_; } - const T& get() const { return value_; } - void set(const T& value) { value_ = value; } - private: - T value_; -}; - -// The above synchronization primitives have dummy implementations. -// Therefore Google Test is not thread-safe. -# define GTEST_IS_THREADSAFE 0 - -#endif // GTEST_HAS_PTHREAD - -// Returns the number of threads running in the process, or 0 to indicate that -// we cannot detect it. -GTEST_API_ size_t GetThreadCount(); - -// Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian -// and the IBM XL C/C++ compiler try to instantiate a copy constructor -// for objects passed through ellipsis (...), failing for uncopyable -// objects. 
We define this to ensure that only POD is passed through -// ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_ELLIPSIS_NEEDS_POD_ 1 -#else -# define GTEST_CAN_COMPARE_NULL 1 -#endif - -// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between -// const T& and const T* in a function template. These compilers -// _can_ decide between class template specializations for T and T*, -// so a tr1::type_traits-like is_pointer works. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) -# define GTEST_NEEDS_IS_POINTER_ 1 -#endif - -template -struct bool_constant { - typedef bool_constant type; - static const bool value = bool_value; -}; -template const bool bool_constant::value; - -typedef bool_constant false_type; -typedef bool_constant true_type; - -template -struct is_pointer : public false_type {}; - -template -struct is_pointer : public true_type {}; - -template -struct IteratorTraits { - typedef typename Iterator::value_type value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -template -struct IteratorTraits { - typedef T value_type; -}; - -#if GTEST_OS_WINDOWS -# define GTEST_PATH_SEP_ "\\" -# define GTEST_HAS_ALT_PATH_SEP_ 1 -// The biggest signed integer type the compiler supports. -typedef __int64 BiggestInt; -#else -# define GTEST_PATH_SEP_ "/" -# define GTEST_HAS_ALT_PATH_SEP_ 0 -typedef long long BiggestInt; // NOLINT -#endif // GTEST_OS_WINDOWS - -// Utilities for char. - -// isspace(int ch) and friends accept an unsigned char or EOF. char -// may be signed, depending on the compiler (or compiler flags). -// Therefore we need to cast a char to unsigned char before calling -// isspace(), etc. - -inline bool IsAlpha(char ch) { - return isalpha(static_cast(ch)) != 0; -} -inline bool IsAlNum(char ch) { - return isalnum(static_cast(ch)) != 0; -} -inline bool IsDigit(char ch) { - return isdigit(static_cast(ch)) != 0; -} -inline bool IsLower(char ch) { - return islower(static_cast(ch)) != 0; -} -inline bool IsSpace(char ch) { - return isspace(static_cast(ch)) != 0; -} -inline bool IsUpper(char ch) { - return isupper(static_cast(ch)) != 0; -} -inline bool IsXDigit(char ch) { - return isxdigit(static_cast(ch)) != 0; -} - -inline char ToLower(char ch) { - return static_cast(tolower(static_cast(ch))); -} -inline char ToUpper(char ch) { - return static_cast(toupper(static_cast(ch))); -} - -// The testing::internal::posix namespace holds wrappers for common -// POSIX functions. These wrappers hide the differences between -// Windows/MSVC and POSIX systems. Since some compilers define these -// standard functions as macros, the wrapper cannot have the same name -// as the wrapped function. - -namespace posix { - -// Functions with a different name on Windows. 
- -#if GTEST_OS_WINDOWS - -typedef struct _stat StatStruct; - -# ifdef __BORLANDC__ -inline int IsATTY(int fd) { return isatty(fd); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -# else // !__BORLANDC__ -# if GTEST_OS_WINDOWS_MOBILE -inline int IsATTY(int /* fd */) { return 0; } -# else -inline int IsATTY(int fd) { return _isatty(fd); } -# endif // GTEST_OS_WINDOWS_MOBILE -inline int StrCaseCmp(const char* s1, const char* s2) { - return _stricmp(s1, s2); -} -inline char* StrDup(const char* src) { return _strdup(src); } -# endif // __BORLANDC__ - -# if GTEST_OS_WINDOWS_MOBILE -inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } -// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this -// time and thus not defined there. -# else -inline int FileNo(FILE* file) { return _fileno(file); } -inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } -inline int RmDir(const char* dir) { return _rmdir(dir); } -inline bool IsDir(const StatStruct& st) { - return (_S_IFDIR & st.st_mode) != 0; -} -# endif // GTEST_OS_WINDOWS_MOBILE - -#else - -typedef struct stat StatStruct; - -inline int FileNo(FILE* file) { return fileno(file); } -inline int IsATTY(int fd) { return isatty(fd); } -inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } -inline int StrCaseCmp(const char* s1, const char* s2) { - return strcasecmp(s1, s2); -} -inline char* StrDup(const char* src) { return strdup(src); } -inline int RmDir(const char* dir) { return rmdir(dir); } -inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } - -#endif // GTEST_OS_WINDOWS - -// Functions deprecated by MSVC 8.0. - -#ifdef _MSC_VER -// Temporarily disable warning 4996 (deprecated function). -# pragma warning(push) -# pragma warning(disable:4996) -#endif - -inline const char* StrNCpy(char* dest, const char* src, size_t n) { - return strncpy(dest, src, n); -} - -// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and -// StrError() aren't needed on Windows CE at this time and thus not -// defined there. - -#if !GTEST_OS_WINDOWS_MOBILE -inline int ChDir(const char* dir) { return chdir(dir); } -#endif -inline FILE* FOpen(const char* path, const char* mode) { - return fopen(path, mode); -} -#if !GTEST_OS_WINDOWS_MOBILE -inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { - return freopen(path, mode, stream); -} -inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } -#endif -inline int FClose(FILE* fp) { return fclose(fp); } -#if !GTEST_OS_WINDOWS_MOBILE -inline int Read(int fd, void* buf, unsigned int count) { - return static_cast(read(fd, buf, count)); -} -inline int Write(int fd, const void* buf, unsigned int count) { - return static_cast(write(fd, buf, count)); -} -inline int Close(int fd) { return close(fd); } -inline const char* StrError(int errnum) { return strerror(errnum); } -#endif -inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE - // We are on Windows CE, which has no environment variables. - return NULL; -#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) - // Environment variables which we programmatically clear will be set to the - // empty string rather than unset (NULL). Handle that case. - const char* const env = getenv(name); - return (env != NULL && env[0] != '\0') ? 
env : NULL; -#else - return getenv(name); -#endif -} - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - -#if GTEST_OS_WINDOWS_MOBILE -// Windows CE has no C library. The abort() function is used in -// several places in Google Test. This implementation provides a reasonable -// imitation of standard behaviour. -void Abort(); -#else -inline void Abort() { abort(); } -#endif // GTEST_OS_WINDOWS_MOBILE - -} // namespace posix - -// The maximum number a BiggestInt can represent. This definition -// works no matter BiggestInt is represented in one's complement or -// two's complement. -// -// We cannot rely on numeric_limits in STL, as __int64 and long long -// are not part of standard C++ and numeric_limits doesn't need to be -// defined for them. -const BiggestInt kMaxBiggestInt = - ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); - -// This template class serves as a compile-time function from size to -// type. It maps a size in bytes to a primitive type with that -// size. e.g. -// -// TypeWithSize<4>::UInt -// -// is typedef-ed to be unsigned int (unsigned integer made up of 4 -// bytes). -// -// Such functionality should belong to STL, but I cannot find it -// there. -// -// Google Test uses this class in the implementation of floating-point -// comparison. -// -// For now it only handles UInt (unsigned int) as that's all Google Test -// needs. Other types can be easily added in the future if need -// arises. -template -class TypeWithSize { - public: - // This prevents the user from using TypeWithSize with incorrect - // values of N. - typedef void UInt; -}; - -// The specialization for size 4. -template <> -class TypeWithSize<4> { - public: - // unsigned int has size 4 in both gcc and MSVC. - // - // As base/basictypes.h doesn't compile on Windows, we cannot use - // uint32, uint64, and etc here. - typedef int Int; - typedef unsigned int UInt; -}; - -// The specialization for size 8. -template <> -class TypeWithSize<8> { - public: - -#if GTEST_OS_WINDOWS - typedef __int64 Int; - typedef unsigned __int64 UInt; -#else - typedef long long Int; // NOLINT - typedef unsigned long long UInt; // NOLINT -#endif // GTEST_OS_WINDOWS -}; - -// Integer types of known sizes. -typedef TypeWithSize<4>::Int Int32; -typedef TypeWithSize<4>::UInt UInt32; -typedef TypeWithSize<8>::Int Int64; -typedef TypeWithSize<8>::UInt UInt64; -typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. - -// Utilities for command line flags and environment variables. - -// Macro for referencing flags. -#define GTEST_FLAG(name) FLAGS_gtest_##name - -// Macros for declaring flags. -#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) -#define GTEST_DECLARE_int32_(name) \ - GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ - GTEST_API_ extern ::testing::internal::String GTEST_FLAG(name) - -// Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ - GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ - GTEST_API_ ::testing::internal::String GTEST_FLAG(name) = (default_val) - -// Parses 'str' for a 32-bit signed integer. If successful, writes the result -// to *value and returns true; otherwise leaves *value unchanged and returns -// false. 
-// TODO(chandlerc): Find a better way to refactor flag and environment parsing -// out of both gtest-port.cc and gtest.cc to avoid exporting this utility -// function. -bool ParseInt32(const Message& src_text, const char* str, Int32* value); - -// Parses a bool/Int32/string from the environment variable -// corresponding to the given Google Test flag. -bool BoolFromGTestEnv(const char* flag, bool default_val); -GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -const char* StringFromGTestEnv(const char* flag, const char* default_val); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ - -#if GTEST_OS_LINUX -# include -# include -# include -# include -#endif // GTEST_OS_LINUX - -#include -#include -#include -#include -#include - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file declares the String class and functions used internally by -// Google Test. They are subject to change without notice. They should not used -// by code external to Google Test. -// -// This header file is #included by . -// It should not be #included by other files. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ - -#ifdef __BORLANDC__ -// string.h is not guaranteed to provide strcpy on C++ Builder. -# include -#endif - -#include - -#include - -namespace testing { -namespace internal { - -// String - a UTF-8 string class. -// -// For historic reasons, we don't use std::string. -// -// TODO(wan@google.com): replace this class with std::string or -// implement it in terms of the latter. -// -// Note that String can represent both NULL and the empty string, -// while std::string cannot represent NULL. -// -// NULL and the empty string are considered different. 
NULL is less -// than anything (including the empty string) except itself. -// -// This class only provides minimum functionality necessary for -// implementing Google Test. We do not intend to implement a full-fledged -// string class here. -// -// Since the purpose of this class is to provide a substitute for -// std::string on platforms where it cannot be used, we define a copy -// constructor and assignment operators such that we don't need -// conditional compilation in a lot of places. -// -// In order to make the representation efficient, the d'tor of String -// is not virtual. Therefore DO NOT INHERIT FROM String. -class GTEST_API_ String { - public: - // Static utility methods - - // Returns the input enclosed in double quotes if it's not NULL; - // otherwise returns "(null)". For example, "\"Hello\"" is returned - // for input "Hello". - // - // This is useful for printing a C string in the syntax of a literal. - // - // Known issue: escape sequences are not handled yet. - static String ShowCStringQuoted(const char* c_str); - - // Clones a 0-terminated C string, allocating memory using new. The - // caller is responsible for deleting the return value using - // delete[]. Returns the cloned string, or NULL if the input is - // NULL. - // - // This is different from strdup() in string.h, which allocates - // memory using malloc(). - static const char* CloneCString(const char* c_str); - -#if GTEST_OS_WINDOWS_MOBILE - // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be - // able to pass strings to Win32 APIs on CE we need to convert them - // to 'Unicode', UTF-16. - - // Creates a UTF-16 wide string from the given ANSI string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the wide string, or NULL if the - // input is NULL. - // - // The wide string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static LPCWSTR AnsiToUtf16(const char* c_str); - - // Creates an ANSI string from the given wide string, allocating - // memory using new. The caller is responsible for deleting the return - // value using delete[]. Returns the ANSI string, or NULL if the - // input is NULL. - // - // The returned string is created using the ANSI codepage (CP_ACP) to - // match the behaviour of the ANSI versions of Win32 calls and the - // C runtime. - static const char* Utf16ToAnsi(LPCWSTR utf16_str); -#endif - - // Compares two C strings. Returns true iff they have the same content. - // - // Unlike strcmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CStringEquals(const char* lhs, const char* rhs); - - // Converts a wide C string to a String using the UTF-8 encoding. - // NULL will be converted to "(null)". If an error occurred during - // the conversion, "(failed to convert from wide string)" is - // returned. - static String ShowWideCString(const wchar_t* wide_c_str); - - // Similar to ShowWideCString(), except that this function encloses - // the converted string in double quotes. - static String ShowWideCStringQuoted(const wchar_t* wide_c_str); - - // Compares two wide C strings. Returns true iff they have the same - // content. - // - // Unlike wcscmp(), this function can handle NULL argument(s). A - // NULL C string is considered different to any non-NULL C string, - // including the empty string. 
- static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); - - // Compares two C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike strcasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL C string, - // including the empty string. - static bool CaseInsensitiveCStringEquals(const char* lhs, - const char* rhs); - - // Compares two wide C strings, ignoring case. Returns true iff they - // have the same content. - // - // Unlike wcscasecmp(), this function can handle NULL argument(s). - // A NULL C string is considered different to any non-NULL wide C string, - // including the empty string. - // NB: The implementations on different platforms slightly differ. - // On windows, this method uses _wcsicmp which compares according to LC_CTYPE - // environment variable. On GNU platform this method uses wcscasecmp - // which compares according to LC_CTYPE category of the current locale. - // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the - // current locale. - static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, - const wchar_t* rhs); - - // Formats a list of arguments to a String, using the same format - // spec string as for printf. - // - // We do not use the StringPrintf class as it is not universally - // available. - // - // The result is limited to 4096 characters (including the tailing - // 0). If 4096 characters are not enough to format the input, - // "" is returned. - static String Format(const char* format, ...); - - // C'tors - - // The default c'tor constructs a NULL string. - String() : c_str_(NULL), length_(0) {} - - // Constructs a String by cloning a 0-terminated C string. - String(const char* a_c_str) { // NOLINT - if (a_c_str == NULL) { - c_str_ = NULL; - length_ = 0; - } else { - ConstructNonNull(a_c_str, strlen(a_c_str)); - } - } - - // Constructs a String by copying a given number of chars from a - // buffer. E.g. String("hello", 3) creates the string "hel", - // String("a\0bcd", 4) creates "a\0bc", String(NULL, 0) creates "", - // and String(NULL, 1) results in access violation. - String(const char* buffer, size_t a_length) { - ConstructNonNull(buffer, a_length); - } - - // The copy c'tor creates a new copy of the string. The two - // String objects do not share content. - String(const String& str) : c_str_(NULL), length_(0) { *this = str; } - - // D'tor. String is intended to be a final class, so the d'tor - // doesn't need to be virtual. - ~String() { delete[] c_str_; } - - // Allows a String to be implicitly converted to an ::std::string or - // ::string, and vice versa. Converting a String containing a NULL - // pointer to ::std::string or ::string is undefined behavior. - // Converting a ::std::string or ::string containing an embedded NUL - // character to a String will result in the prefix up to the first - // NUL character. - String(const ::std::string& str) { - ConstructNonNull(str.c_str(), str.length()); - } - - operator ::std::string() const { return ::std::string(c_str(), length()); } - -#if GTEST_HAS_GLOBAL_STRING - String(const ::string& str) { - ConstructNonNull(str.c_str(), str.length()); - } - - operator ::string() const { return ::string(c_str(), length()); } -#endif // GTEST_HAS_GLOBAL_STRING - - // Returns true iff this is an empty string (i.e. ""). - bool empty() const { return (c_str() != NULL) && (length() == 0); } - - // Compares this with another String. 
- // Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0 - // if this is greater than rhs. - int Compare(const String& rhs) const; - - // Returns true iff this String equals the given C string. A NULL - // string and a non-NULL string are considered not equal. - bool operator==(const char* a_c_str) const { return Compare(a_c_str) == 0; } - - // Returns true iff this String is less than the given String. A - // NULL string is considered less than "". - bool operator<(const String& rhs) const { return Compare(rhs) < 0; } - - // Returns true iff this String doesn't equal the given C string. A NULL - // string and a non-NULL string are considered not equal. - bool operator!=(const char* a_c_str) const { return !(*this == a_c_str); } - - // Returns true iff this String ends with the given suffix. *Any* - // String is considered to end with a NULL or empty suffix. - bool EndsWith(const char* suffix) const; - - // Returns true iff this String ends with the given suffix, not considering - // case. Any String is considered to end with a NULL or empty suffix. - bool EndsWithCaseInsensitive(const char* suffix) const; - - // Returns the length of the encapsulated string, or 0 if the - // string is NULL. - size_t length() const { return length_; } - - // Gets the 0-terminated C string this String object represents. - // The String object still owns the string. Therefore the caller - // should NOT delete the return value. - const char* c_str() const { return c_str_; } - - // Assigns a C string to this object. Self-assignment works. - const String& operator=(const char* a_c_str) { - return *this = String(a_c_str); - } - - // Assigns a String object to this object. Self-assignment works. - const String& operator=(const String& rhs) { - if (this != &rhs) { - delete[] c_str_; - if (rhs.c_str() == NULL) { - c_str_ = NULL; - length_ = 0; - } else { - ConstructNonNull(rhs.c_str(), rhs.length()); - } - } - - return *this; - } - - private: - // Constructs a non-NULL String from the given content. This - // function can only be called when c_str_ has not been allocated. - // ConstructNonNull(NULL, 0) results in an empty string (""). - // ConstructNonNull(NULL, non_zero) is undefined behavior. - void ConstructNonNull(const char* buffer, size_t a_length) { - char* const str = new char[a_length + 1]; - memcpy(str, buffer, a_length); - str[a_length] = '\0'; - c_str_ = str; - length_ = a_length; - } - - const char* c_str_; - size_t length_; -}; // class String - -// Streams a String to an ostream. Each '\0' character in the String -// is replaced with "\\0". -inline ::std::ostream& operator<<(::std::ostream& os, const String& str) { - if (str.c_str() == NULL) { - os << "(null)"; - } else { - const char* const c_str = str.c_str(); - for (size_t i = 0; i != str.length(); i++) { - if (c_str[i] == '\0') { - os << "\\0"; - } else { - os << c_str[i]; - } - } - } - return os; -} - -// Gets the content of the stringstream's buffer as a String. Each '\0' -// character in the buffer is replaced with "\\0". -GTEST_API_ String StringStreamToString(::std::stringstream* stream); - -// Converts a streamable value to a String. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". - -// Declared here but defined in gtest.h, so that it has access -// to the definition of the Message class, required by the ARM -// compiler. 
-template -String StreamableToString(const T& streamable); - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: keith.ray@gmail.com (Keith Ray) -// -// Google Test filepath utilities -// -// This header file declares classes and functions used internally by -// Google Test. They are subject to change without notice. -// -// This file is #included in . -// Do not include this header file separately! - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ - - -namespace testing { -namespace internal { - -// FilePath - a class for file and directory pathname manipulation which -// handles platform-specific conventions (like the pathname separator). -// Used for helper functions for naming files in a directory for xml output. -// Except for Set methods, all methods are const or static, which provides an -// "immutable value object" -- useful for peace of mind. -// A FilePath with a value ending in a path separator ("like/this/") represents -// a directory, otherwise it is assumed to represent a file. In either case, -// it may or may not represent an actual file or directory in the file system. -// Names are NOT checked for syntax correctness -- no checking for illegal -// characters, malformed paths, etc. 
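// A brief usage sketch (hypothetical paths, for illustration only):
//
//   FilePath results_dir("test_results/");  // Trailing separator: this value
//   results_dir.IsDirectory();              // represents a directory (true).
//
//   // Yields "test_results/MyTest.xml" ('\' as the separator on Windows):
//   FilePath report = FilePath::MakeFileName(
//       FilePath("test_results"), FilePath("MyTest"), 0, "xml");
//
//   // A number greater than zero is appended to the base name, giving
//   // "test_results/MyTest_3.xml":
//   FilePath report3 = FilePath::MakeFileName(
//       FilePath("test_results"), FilePath("MyTest"), 3, "xml");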
- -class GTEST_API_ FilePath { - public: - FilePath() : pathname_("") { } - FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } - - explicit FilePath(const char* pathname) : pathname_(pathname) { - Normalize(); - } - - explicit FilePath(const String& pathname) : pathname_(pathname) { - Normalize(); - } - - FilePath& operator=(const FilePath& rhs) { - Set(rhs); - return *this; - } - - void Set(const FilePath& rhs) { - pathname_ = rhs.pathname_; - } - - String ToString() const { return pathname_; } - const char* c_str() const { return pathname_.c_str(); } - - // Returns the current working directory, or "" if unsuccessful. - static FilePath GetCurrentDir(); - - // Given directory = "dir", base_name = "test", number = 0, - // extension = "xml", returns "dir/test.xml". If number is greater - // than zero (e.g., 12), returns "dir/test_12.xml". - // On Windows platform, uses \ as the separator rather than /. - static FilePath MakeFileName(const FilePath& directory, - const FilePath& base_name, - int number, - const char* extension); - - // Given directory = "dir", relative_path = "test.xml", - // returns "dir/test.xml". - // On Windows, uses \ as the separator rather than /. - static FilePath ConcatPaths(const FilePath& directory, - const FilePath& relative_path); - - // Returns a pathname for a file that does not currently exist. The pathname - // will be directory/base_name.extension or - // directory/base_name_.extension if directory/base_name.extension - // already exists. The number will be incremented until a pathname is found - // that does not already exist. - // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. - // There could be a race condition if two or more processes are calling this - // function at the same time -- they could both pick the same filename. - static FilePath GenerateUniqueFileName(const FilePath& directory, - const FilePath& base_name, - const char* extension); - - // Returns true iff the path is NULL or "". - bool IsEmpty() const { return c_str() == NULL || *c_str() == '\0'; } - - // If input name has a trailing separator character, removes it and returns - // the name, otherwise return the name string unmodified. - // On Windows platform, uses \ as the separator, other platforms use /. - FilePath RemoveTrailingPathSeparator() const; - - // Returns a copy of the FilePath with the directory part removed. - // Example: FilePath("path/to/file").RemoveDirectoryName() returns - // FilePath("file"). If there is no directory part ("just_a_file"), it returns - // the FilePath unmodified. If there is no file part ("just_a_dir/") it - // returns an empty FilePath (""). - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveDirectoryName() const; - - // RemoveFileName returns the directory path with the filename removed. - // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". - // If the FilePath is "a_file" or "/a_file", RemoveFileName returns - // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does - // not have a file, like "just/a/dir/", it returns the FilePath unmodified. - // On Windows platform, '\' is the path separator, otherwise it is '/'. - FilePath RemoveFileName() const; - - // Returns a copy of the FilePath with the case-insensitive extension removed. - // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns - // FilePath("dir/file"). If a case-insensitive extension is not - // found, returns a copy of the original FilePath. 
- FilePath RemoveExtension(const char* extension) const; - - // Creates directories so that path exists. Returns true if successful or if - // the directories already exist; returns false if unable to create - // directories for any reason. Will also return false if the FilePath does - // not represent a directory (that is, it doesn't end with a path separator). - bool CreateDirectoriesRecursively() const; - - // Create the directory so that path exists. Returns true if successful or - // if the directory already exists; returns false if unable to create the - // directory for any reason, including if the parent directory does not - // exist. Not named "CreateDirectory" because that's a macro on Windows. - bool CreateFolder() const; - - // Returns true if FilePath describes something in the file-system, - // either a file, directory, or whatever, and that something exists. - bool FileOrDirectoryExists() const; - - // Returns true if pathname describes a directory in the file-system - // that exists. - bool DirectoryExists() const; - - // Returns true if FilePath ends with a path separator, which indicates that - // it is intended to represent a directory. Returns false otherwise. - // This does NOT check that a directory (or file) actually exists. - bool IsDirectory() const; - - // Returns true if pathname describes a root directory. (Windows has one - // root directory per disk drive.) - bool IsRootDirectory() const; - - // Returns true if pathname describes an absolute path. - bool IsAbsolutePath() const; - - private: - // Replaces multiple consecutive separators with a single separator. - // For example, "bar///foo" becomes "bar/foo". Does not eliminate other - // redundancies that might be in a pathname involving "." or "..". - // - // A pathname with multiple consecutive separators may occur either through - // user error or as a result of some scripts or APIs that generate a pathname - // with a trailing separator. On other platforms the same API or script - // may NOT generate a pathname with a trailing "/". Then elsewhere that - // pathname may have another "/" and pathname components added to it, - // without checking for the separator already being there. - // The script language and operating system may allow paths like "foo//bar" - // but some of the functions in FilePath will not handle that correctly. In - // particular, RemoveTrailingPathSeparator() only removes one separator, and - // it is called in CreateDirectoriesRecursively() assuming that it will change - // a pathname from directory syntax (trailing separator) to filename syntax. - // - // On Windows this method also replaces the alternate path separator '/' with - // the primary path separator '\\', so that for example "bar\\/\\foo" becomes - // "bar\\foo". - - void Normalize(); - - // Returns a pointer to the last occurence of a valid path separator in - // the FilePath. On Windows, for example, both '/' and '\' are valid path - // separators. Returns NULL if no path separator was found. - const char* FindLastPathSeparator() const; - - String pathname_; -}; // class FilePath - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ -// This file was GENERATED by command: -// pump.py gtest-type-util.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Type utilities needed for implementing typed and type-parameterized -// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently we support at most 50 types in a list, and at most 50 -// type-parameterized tests in one type-parameterized test case. -// Please contact googletestframework@googlegroups.com if you need -// more. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - - -// #ifdef __GNUC__ is too general here. It is possible to use gcc without using -// libstdc++ (which is where cxxabi.h comes from). -# ifdef __GLIBCXX__ -# include -# elif defined(__HP_aCC) -# include -# endif // __GLIBCXX__ - -namespace testing { -namespace internal { - -// GetTypeName() returns a human-readable name of type T. -// NB: This function is also used in Google Mock, so don't move it inside of -// the typed-test-only section below. -template -String GetTypeName() { -# if GTEST_HAS_RTTI - - const char* const name = typeid(T).name(); -# if defined(__GLIBCXX__) || defined(__HP_aCC) - int status = 0; - // gcc's implementation of typeid(T).name() mangles the type name, - // so we have to demangle it. -# ifdef __GLIBCXX__ - using abi::__cxa_demangle; -# endif // __GLIBCXX__ - char* const readable_name = __cxa_demangle(name, 0, 0, &status); - const String name_str(status == 0 ? readable_name : name); - free(readable_name); - return name_str; -# else - return name; -# endif // __GLIBCXX__ || __HP_aCC - -# else - - return ""; - -# endif // GTEST_HAS_RTTI -} - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// AssertyTypeEq::type is defined iff T1 and T2 are the same -// type. This can be used as a compile-time assertion to ensure that -// two types are equal. - -template -struct AssertTypeEq; - -template -struct AssertTypeEq { - typedef bool type; -}; - -// A unique type used as the default value for the arguments of class -// template Types. 
This allows us to simulate variadic templates -// (e.g. Types, Type, and etc), which C++ doesn't -// support directly. -struct None {}; - -// The following family of struct and struct templates are used to -// represent type lists. In particular, TypesN -// represents a type list with N types (T1, T2, ..., and TN) in it. -// Except for Types0, every struct in the family has two member types: -// Head for the first type in the list, and Tail for the rest of the -// list. - -// The empty type list. -struct Types0 {}; - -// Type lists of length 1, 2, 3, and so on. - -template -struct Types1 { - typedef T1 Head; - typedef Types0 Tail; -}; -template -struct Types2 { - typedef T1 Head; - typedef Types1 Tail; -}; - -template -struct Types3 { - typedef T1 Head; - typedef Types2 Tail; -}; - -template -struct Types4 { - typedef T1 Head; - typedef Types3 Tail; -}; - -template -struct Types5 { - typedef T1 Head; - typedef Types4 Tail; -}; - -template -struct Types6 { - typedef T1 Head; - typedef Types5 Tail; -}; - -template -struct Types7 { - typedef T1 Head; - typedef Types6 Tail; -}; - -template -struct Types8 { - typedef T1 Head; - typedef Types7 Tail; -}; - -template -struct Types9 { - typedef T1 Head; - typedef Types8 Tail; -}; - -template -struct Types10 { - typedef T1 Head; - typedef Types9 Tail; -}; - -template -struct Types11 { - typedef T1 Head; - typedef Types10 Tail; -}; - -template -struct Types12 { - typedef T1 Head; - typedef Types11 Tail; -}; - -template -struct Types13 { - typedef T1 Head; - typedef Types12 Tail; -}; - -template -struct Types14 { - typedef T1 Head; - typedef Types13 Tail; -}; - -template -struct Types15 { - typedef T1 Head; - typedef Types14 Tail; -}; - -template -struct Types16 { - typedef T1 Head; - typedef Types15 Tail; -}; - -template -struct Types17 { - typedef T1 Head; - typedef Types16 Tail; -}; - -template -struct Types18 { - typedef T1 Head; - typedef Types17 Tail; -}; - -template -struct Types19 { - typedef T1 Head; - typedef Types18 Tail; -}; - -template -struct Types20 { - typedef T1 Head; - typedef Types19 Tail; -}; - -template -struct Types21 { - typedef T1 Head; - typedef Types20 Tail; -}; - -template -struct Types22 { - typedef T1 Head; - typedef Types21 Tail; -}; - -template -struct Types23 { - typedef T1 Head; - typedef Types22 Tail; -}; - -template -struct Types24 { - typedef T1 Head; - typedef Types23 Tail; -}; - -template -struct Types25 { - typedef T1 Head; - typedef Types24 Tail; -}; - -template -struct Types26 { - typedef T1 Head; - typedef Types25 Tail; -}; - -template -struct Types27 { - typedef T1 Head; - typedef Types26 Tail; -}; - -template -struct Types28 { - typedef T1 Head; - typedef Types27 Tail; -}; - -template -struct Types29 { - typedef T1 Head; - typedef Types28 Tail; -}; - -template -struct Types30 { - typedef T1 Head; - typedef Types29 Tail; -}; - -template -struct Types31 { - typedef T1 Head; - typedef Types30 Tail; -}; - -template -struct Types32 { - typedef T1 Head; - typedef Types31 Tail; -}; - -template -struct Types33 { - typedef T1 Head; - typedef Types32 Tail; -}; - -template -struct Types34 { - typedef T1 Head; - typedef Types33 Tail; -}; - -template -struct Types35 { - typedef T1 Head; - typedef Types34 Tail; -}; - -template -struct Types36 { - typedef T1 Head; - typedef Types35 Tail; -}; - -template -struct Types37 { - typedef T1 Head; - typedef Types36 Tail; -}; - -template -struct Types38 { - typedef T1 Head; - typedef Types37 Tail; -}; - -template -struct Types39 { - typedef T1 Head; - typedef Types38 Tail; -}; - 
-template -struct Types40 { - typedef T1 Head; - typedef Types39 Tail; -}; - -template -struct Types41 { - typedef T1 Head; - typedef Types40 Tail; -}; - -template -struct Types42 { - typedef T1 Head; - typedef Types41 Tail; -}; - -template -struct Types43 { - typedef T1 Head; - typedef Types42 Tail; -}; - -template -struct Types44 { - typedef T1 Head; - typedef Types43 Tail; -}; - -template -struct Types45 { - typedef T1 Head; - typedef Types44 Tail; -}; - -template -struct Types46 { - typedef T1 Head; - typedef Types45 Tail; -}; - -template -struct Types47 { - typedef T1 Head; - typedef Types46 Tail; -}; - -template -struct Types48 { - typedef T1 Head; - typedef Types47 Tail; -}; - -template -struct Types49 { - typedef T1 Head; - typedef Types48 Tail; -}; - -template -struct Types50 { - typedef T1 Head; - typedef Types49 Tail; -}; - - -} // namespace internal - -// We don't want to require the users to write TypesN<...> directly, -// as that would require them to count the length. Types<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Types -// will appear as Types in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Types, and Google Test will translate -// that to TypesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Types template. -template -struct Types { - typedef internal::Types50 type; -}; - -template <> -struct Types { - typedef internal::Types0 type; -}; -template -struct Types { - typedef internal::Types1 type; -}; -template -struct Types { - typedef internal::Types2 type; -}; -template -struct Types { - typedef internal::Types3 type; -}; -template -struct Types { - typedef internal::Types4 type; -}; -template -struct Types { - typedef internal::Types5 type; -}; -template -struct Types { - typedef internal::Types6 type; -}; -template -struct Types { - typedef internal::Types7 type; -}; -template -struct Types { - typedef internal::Types8 type; -}; -template -struct Types { - typedef internal::Types9 type; -}; -template -struct Types { - typedef internal::Types10 type; -}; -template -struct Types { - typedef internal::Types11 type; -}; -template -struct Types { - typedef internal::Types12 type; -}; -template -struct Types { - typedef internal::Types13 type; -}; -template -struct Types { - typedef internal::Types14 type; -}; -template -struct Types { - typedef internal::Types15 type; -}; -template -struct Types { - typedef internal::Types16 type; -}; -template -struct Types { - typedef internal::Types17 type; -}; -template -struct Types { - typedef internal::Types18 type; -}; -template -struct Types { - typedef internal::Types19 type; -}; -template -struct Types { - typedef internal::Types20 type; -}; -template -struct Types { - typedef internal::Types21 type; -}; -template -struct Types { - typedef internal::Types22 type; -}; -template -struct Types { - typedef internal::Types23 type; -}; -template -struct Types { - typedef internal::Types24 type; -}; -template -struct Types { - typedef internal::Types25 type; -}; -template -struct Types { - typedef internal::Types26 type; -}; -template -struct Types { - typedef internal::Types27 type; -}; -template -struct Types { - typedef internal::Types28 type; -}; -template -struct Types { - typedef internal::Types29 type; -}; -template -struct Types { - 
typedef internal::Types30 type; -}; -template -struct Types { - typedef internal::Types31 type; -}; -template -struct Types { - typedef internal::Types32 type; -}; -template -struct Types { - typedef internal::Types33 type; -}; -template -struct Types { - typedef internal::Types34 type; -}; -template -struct Types { - typedef internal::Types35 type; -}; -template -struct Types { - typedef internal::Types36 type; -}; -template -struct Types { - typedef internal::Types37 type; -}; -template -struct Types { - typedef internal::Types38 type; -}; -template -struct Types { - typedef internal::Types39 type; -}; -template -struct Types { - typedef internal::Types40 type; -}; -template -struct Types { - typedef internal::Types41 type; -}; -template -struct Types { - typedef internal::Types42 type; -}; -template -struct Types { - typedef internal::Types43 type; -}; -template -struct Types { - typedef internal::Types44 type; -}; -template -struct Types { - typedef internal::Types45 type; -}; -template -struct Types { - typedef internal::Types46 type; -}; -template -struct Types { - typedef internal::Types47 type; -}; -template -struct Types { - typedef internal::Types48 type; -}; -template -struct Types { - typedef internal::Types49 type; -}; - -namespace internal { - -# define GTEST_TEMPLATE_ template class - -// The template "selector" struct TemplateSel is used to -// represent Tmpl, which must be a class template with one type -// parameter, as a type. TemplateSel::Bind::type is defined -// as the type Tmpl. This allows us to actually instantiate the -// template "selected" by TemplateSel. -// -// This trick is necessary for simulating typedef for class templates, -// which C++ doesn't support directly. -template -struct TemplateSel { - template - struct Bind { - typedef Tmpl type; - }; -}; - -# define GTEST_BIND_(TmplSel, T) \ - TmplSel::template Bind::type - -// A unique struct template used as the default value for the -// arguments of class template Templates. This allows us to simulate -// variadic templates (e.g. Templates, Templates, -// and etc), which C++ doesn't support directly. -template -struct NoneT {}; - -// The following family of struct and struct templates are used to -// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except -// for Templates0, every struct in the family has two member types: -// Head for the selector of the first template in the list, and Tail -// for the rest of the list. - -// The empty template list. -struct Templates0 {}; - -// Template lists of length 1, 2, 3, and so on. 
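// --- Editor's illustration (not part of the deleted header) ---
// A standalone sketch of the TemplateSel/GTEST_BIND_ trick described above,
// using our own names: a class template with one type parameter is wrapped as
// an ordinary type and "re-opened" later through a nested Bind member, which
// is how C++98 code simulates a typedef for a class template.
template <template <typename U> class Tmpl>
struct Selector {
  template <typename T>
  struct Bind { typedef Tmpl<T> type; };
};
template <typename T> struct Wrapper { T value; };
typedef Selector<Wrapper> WrapperSel;      // behaves like a typedef of the template
WrapperSel::Bind<int>::type wrapped_int;   // the same type as Wrapper<int>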
- -template -struct Templates1 { - typedef TemplateSel Head; - typedef Templates0 Tail; -}; -template -struct Templates2 { - typedef TemplateSel Head; - typedef Templates1 Tail; -}; - -template -struct Templates3 { - typedef TemplateSel Head; - typedef Templates2 Tail; -}; - -template -struct Templates4 { - typedef TemplateSel Head; - typedef Templates3 Tail; -}; - -template -struct Templates5 { - typedef TemplateSel Head; - typedef Templates4 Tail; -}; - -template -struct Templates6 { - typedef TemplateSel Head; - typedef Templates5 Tail; -}; - -template -struct Templates7 { - typedef TemplateSel Head; - typedef Templates6 Tail; -}; - -template -struct Templates8 { - typedef TemplateSel Head; - typedef Templates7 Tail; -}; - -template -struct Templates9 { - typedef TemplateSel Head; - typedef Templates8 Tail; -}; - -template -struct Templates10 { - typedef TemplateSel Head; - typedef Templates9 Tail; -}; - -template -struct Templates11 { - typedef TemplateSel Head; - typedef Templates10 Tail; -}; - -template -struct Templates12 { - typedef TemplateSel Head; - typedef Templates11 Tail; -}; - -template -struct Templates13 { - typedef TemplateSel Head; - typedef Templates12 Tail; -}; - -template -struct Templates14 { - typedef TemplateSel Head; - typedef Templates13 Tail; -}; - -template -struct Templates15 { - typedef TemplateSel Head; - typedef Templates14 Tail; -}; - -template -struct Templates16 { - typedef TemplateSel Head; - typedef Templates15 Tail; -}; - -template -struct Templates17 { - typedef TemplateSel Head; - typedef Templates16 Tail; -}; - -template -struct Templates18 { - typedef TemplateSel Head; - typedef Templates17 Tail; -}; - -template -struct Templates19 { - typedef TemplateSel Head; - typedef Templates18 Tail; -}; - -template -struct Templates20 { - typedef TemplateSel Head; - typedef Templates19 Tail; -}; - -template -struct Templates21 { - typedef TemplateSel Head; - typedef Templates20 Tail; -}; - -template -struct Templates22 { - typedef TemplateSel Head; - typedef Templates21 Tail; -}; - -template -struct Templates23 { - typedef TemplateSel Head; - typedef Templates22 Tail; -}; - -template -struct Templates24 { - typedef TemplateSel Head; - typedef Templates23 Tail; -}; - -template -struct Templates25 { - typedef TemplateSel Head; - typedef Templates24 Tail; -}; - -template -struct Templates26 { - typedef TemplateSel Head; - typedef Templates25 Tail; -}; - -template -struct Templates27 { - typedef TemplateSel Head; - typedef Templates26 Tail; -}; - -template -struct Templates28 { - typedef TemplateSel Head; - typedef Templates27 Tail; -}; - -template -struct Templates29 { - typedef TemplateSel Head; - typedef Templates28 Tail; -}; - -template -struct Templates30 { - typedef TemplateSel Head; - typedef Templates29 Tail; -}; - -template -struct Templates31 { - typedef TemplateSel Head; - typedef Templates30 Tail; -}; - -template -struct Templates32 { - typedef TemplateSel Head; - typedef Templates31 Tail; -}; - -template -struct Templates33 { - typedef TemplateSel Head; - typedef Templates32 Tail; -}; - -template -struct Templates34 { - typedef TemplateSel Head; - typedef Templates33 Tail; -}; - -template -struct Templates35 { - typedef TemplateSel Head; - typedef Templates34 Tail; -}; - -template -struct Templates36 { - typedef TemplateSel Head; - typedef Templates35 Tail; -}; - -template -struct Templates37 { - typedef TemplateSel Head; - typedef Templates36 Tail; -}; - -template -struct Templates38 { - typedef TemplateSel Head; - typedef Templates37 Tail; -}; - 
-template -struct Templates39 { - typedef TemplateSel Head; - typedef Templates38 Tail; -}; - -template -struct Templates40 { - typedef TemplateSel Head; - typedef Templates39 Tail; -}; - -template -struct Templates41 { - typedef TemplateSel Head; - typedef Templates40 Tail; -}; - -template -struct Templates42 { - typedef TemplateSel Head; - typedef Templates41 Tail; -}; - -template -struct Templates43 { - typedef TemplateSel Head; - typedef Templates42 Tail; -}; - -template -struct Templates44 { - typedef TemplateSel Head; - typedef Templates43 Tail; -}; - -template -struct Templates45 { - typedef TemplateSel Head; - typedef Templates44 Tail; -}; - -template -struct Templates46 { - typedef TemplateSel Head; - typedef Templates45 Tail; -}; - -template -struct Templates47 { - typedef TemplateSel Head; - typedef Templates46 Tail; -}; - -template -struct Templates48 { - typedef TemplateSel Head; - typedef Templates47 Tail; -}; - -template -struct Templates49 { - typedef TemplateSel Head; - typedef Templates48 Tail; -}; - -template -struct Templates50 { - typedef TemplateSel Head; - typedef Templates49 Tail; -}; - - -// We don't want to require the users to write TemplatesN<...> directly, -// as that would require them to count the length. Templates<...> is much -// easier to write, but generates horrible messages when there is a -// compiler error, as gcc insists on printing out each template -// argument, even if it has the default value (this means Templates -// will appear as Templates in the compiler -// errors). -// -// Our solution is to combine the best part of the two approaches: a -// user would write Templates, and Google Test will translate -// that to TemplatesN internally to make error messages -// readable. The translation is done by the 'type' member of the -// Templates template. 
-template -struct Templates { - typedef Templates50 type; -}; - -template <> -struct Templates { - typedef Templates0 type; -}; -template -struct Templates { - typedef Templates1 type; -}; -template -struct Templates { - typedef Templates2 type; -}; -template -struct Templates { - typedef Templates3 type; -}; -template -struct Templates { - typedef Templates4 type; -}; -template -struct Templates { - typedef Templates5 type; -}; -template -struct Templates { - typedef Templates6 type; -}; -template -struct Templates { - typedef Templates7 type; -}; -template -struct Templates { - typedef Templates8 type; -}; -template -struct Templates { - typedef Templates9 type; -}; -template -struct Templates { - typedef Templates10 type; -}; -template -struct Templates { - typedef Templates11 type; -}; -template -struct Templates { - typedef Templates12 type; -}; -template -struct Templates { - typedef Templates13 type; -}; -template -struct Templates { - typedef Templates14 type; -}; -template -struct Templates { - typedef Templates15 type; -}; -template -struct Templates { - typedef Templates16 type; -}; -template -struct Templates { - typedef Templates17 type; -}; -template -struct Templates { - typedef Templates18 type; -}; -template -struct Templates { - typedef Templates19 type; -}; -template -struct Templates { - typedef Templates20 type; -}; -template -struct Templates { - typedef Templates21 type; -}; -template -struct Templates { - typedef Templates22 type; -}; -template -struct Templates { - typedef Templates23 type; -}; -template -struct Templates { - typedef Templates24 type; -}; -template -struct Templates { - typedef Templates25 type; -}; -template -struct Templates { - typedef Templates26 type; -}; -template -struct Templates { - typedef Templates27 type; -}; -template -struct Templates { - typedef Templates28 type; -}; -template -struct Templates { - typedef Templates29 type; -}; -template -struct Templates { - typedef Templates30 type; -}; -template -struct Templates { - typedef Templates31 type; -}; -template -struct Templates { - typedef Templates32 type; -}; -template -struct Templates { - typedef Templates33 type; -}; -template -struct Templates { - typedef Templates34 type; -}; -template -struct Templates { - typedef Templates35 type; -}; -template -struct Templates { - typedef Templates36 type; -}; -template -struct Templates { - typedef Templates37 type; -}; -template -struct Templates { - typedef Templates38 type; -}; -template -struct Templates { - typedef Templates39 type; -}; -template -struct Templates { - typedef Templates40 type; -}; -template -struct Templates { - typedef Templates41 type; -}; -template -struct Templates { - typedef Templates42 type; -}; -template -struct Templates { - typedef Templates43 type; -}; -template -struct Templates { - typedef Templates44 type; -}; -template -struct Templates { - typedef Templates45 type; -}; -template -struct Templates { - typedef Templates46 type; -}; -template -struct Templates { - typedef Templates47 type; -}; -template -struct Templates { - typedef Templates48 type; -}; -template -struct Templates { - typedef Templates49 type; -}; - -// The TypeList template makes it possible to use either a single type -// or a Types<...> list in TYPED_TEST_CASE() and -// INSTANTIATE_TYPED_TEST_CASE_P(). 
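// --- Editor's illustration (not part of the deleted header) ---
// A reduced sketch, with our own names, of the Head/Tail type-list scheme used
// by the TypesN/TemplatesN families above: every non-empty list exposes Head
// and Tail, so a compile-time operation simply recurses on Tail until it hits
// the empty list (the same shape of recursion the registration helpers use
// later in this header).
struct Nil {};                                          // the empty type list
template <typename H, typename T>
struct Cons { typedef H Head; typedef T Tail; };

template <typename List> struct Length;                 // list length by recursion
template <> struct Length<Nil> { static const int value = 0; };
template <typename H, typename T>
struct Length< Cons<H, T> > { static const int value = 1 + Length<T>::value; };
// Length< Cons<int, Cons<double, Nil> > >::value == 2, computed at compile time.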
- -template -struct TypeList { typedef Types1 type; }; - -template -struct TypeList > { - typedef typename Types::type type; -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ - -// Due to C++ preprocessor weirdness, we need double indirection to -// concatenate two tokens when one of them is __LINE__. Writing -// -// foo ## __LINE__ -// -// will result in the token foo__LINE__, instead of foo followed by -// the current line number. For more details, see -// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 -#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) -#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar - -// Google Test defines the testing::Message class to allow construction of -// test messages via the << operator. The idea is that anything -// streamable to std::ostream can be streamed to a testing::Message. -// This allows a user to use his own types in Google Test assertions by -// overloading the << operator. -// -// util/gtl/stl_logging-inl.h overloads << for STL containers. These -// overloads cannot be defined in the std namespace, as that will be -// undefined behavior. Therefore, they are defined in the global -// namespace instead. -// -// C++'s symbol lookup rule (i.e. Koenig lookup) says that these -// overloads are visible in either the std namespace or the global -// namespace, but not other namespaces, including the testing -// namespace which Google Test's Message class is in. -// -// To allow STL containers (and other types that has a << operator -// defined in the global namespace) to be used in Google Test assertions, -// testing::Message must access the custom << operator from the global -// namespace. Hence this helper function. -// -// Note: Jeffrey Yasskin suggested an alternative fix by "using -// ::operator<<;" in the definition of Message's operator<<. That fix -// doesn't require a helper function, but unfortunately doesn't -// compile with MSVC. -template -inline void GTestStreamToHelper(std::ostream* os, const T& val) { - *os << val; -} - -class ProtocolMessage; -namespace proto2 { class Message; } - -namespace testing { - -// Forward declarations. - -class AssertionResult; // Result of an assertion. -class Message; // Represents a failure message. -class Test; // Represents a test. -class TestInfo; // Information about a test. -class TestPartResult; // Result of a test part. -class UnitTest; // A collection of test cases. - -template -::std::string PrintToString(const T& value); - -namespace internal { - -struct TraceInfo; // Information about a trace point. -class ScopedTrace; // Implements scoped trace. -class TestInfoImpl; // Opaque implementation of TestInfo -class UnitTestImpl; // Opaque implementation of UnitTest - -// How many times InitGoogleTest() has been called. -extern int g_init_gtest_count; - -// The text used in failure messages to indicate the start of the -// stack trace. -GTEST_API_ extern const char kStackTraceMarker[]; - -// A secret type that Google Test users don't know about. It has no -// definition on purpose. Therefore it's impossible to create a -// Secret object, which is what we want. -class Secret; - -// Two overloaded helpers for checking at compile time whether an -// expression is a null pointer literal (i.e. NULL or any 0-valued -// compile-time integral constant). 
Their return values have -// different sizes, so we can use sizeof() to test which version is -// picked by the compiler. These helpers have no implementations, as -// we only need their signatures. -// -// Given IsNullLiteralHelper(x), the compiler will pick the first -// version if x can be implicitly converted to Secret*, and pick the -// second version otherwise. Since Secret is a secret and incomplete -// type, the only expression a user can write that has type Secret* is -// a null pointer literal. Therefore, we know that x is a null -// pointer literal if and only if the first version is picked by the -// compiler. -char IsNullLiteralHelper(Secret* p); -char (&IsNullLiteralHelper(...))[2]; // NOLINT - -// A compile-time bool constant that is true if and only if x is a -// null pointer literal (i.e. NULL or any 0-valued compile-time -// integral constant). -#ifdef GTEST_ELLIPSIS_NEEDS_POD_ -// We lose support for NULL detection where the compiler doesn't like -// passing non-POD classes through ellipsis (...). -# define GTEST_IS_NULL_LITERAL_(x) false -#else -# define GTEST_IS_NULL_LITERAL_(x) \ - (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) -#endif // GTEST_ELLIPSIS_NEEDS_POD_ - -// Appends the user-supplied message to the Google-Test-generated message. -GTEST_API_ String AppendUserMessage(const String& gtest_msg, - const Message& user_msg); - -// A helper class for creating scoped traces in user programs. -class GTEST_API_ ScopedTrace { - public: - // The c'tor pushes the given source file location and message onto - // a trace stack maintained by Google Test. - ScopedTrace(const char* file, int line, const Message& message); - - // The d'tor pops the info pushed by the c'tor. - // - // Note that the d'tor is not virtual in order to be efficient. - // Don't inherit from ScopedTrace! - ~ScopedTrace(); - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); -} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its - // c'tor and d'tor. Therefore it doesn't - // need to be used otherwise. - -// Converts a streamable value to a String. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". -// Declared here but defined in gtest.h, so that it has access -// to the definition of the Message class, required by the ARM -// compiler. -template -String StreamableToString(const T& streamable); - -// The Symbian compiler has a bug that prevents it from selecting the -// correct overload of FormatForComparisonFailureMessage (see below) -// unless we pass the first argument by reference. If we do that, -// however, Visual Age C++ 10.1 generates a compiler error. Therefore -// we only apply the work-around for Symbian. -#if defined(__SYMBIAN32__) -# define GTEST_CREF_WORKAROUND_ const& -#else -# define GTEST_CREF_WORKAROUND_ -#endif - -// When this operand is a const char* or char*, if the other operand -// is a ::std::string or ::string, we print this operand as a C string -// rather than a pointer (we do the same for wide strings); otherwise -// we print it as a pointer to be safe. - -// This internal macro is used to avoid duplicated code. 
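// --- Editor's illustration (not part of the deleted header) ---
// A self-contained sketch of the overload-resolution/sizeof trick explained
// above, with our own names: only a null pointer literal converts to a pointer
// to the never-defined SecretToken type, and the two overloads return types of
// different sizes, so sizeof reveals at compile time which one was selected.
class SecretToken;                                  // deliberately left incomplete
char PicksFirstForNull(SecretToken*);               // chosen for 0 / NULL
char (&PicksFirstForNull(...))[2];                  // chosen for everything else
#define IS_NULL_LITERAL(x) (sizeof(PicksFirstForNull(x)) == 1)
// IS_NULL_LITERAL(0) is true; IS_NULL_LITERAL(1) and IS_NULL_LITERAL("hi") are false.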
-#define GTEST_FORMAT_IMPL_(operand2_type, operand1_printer)\ -inline String FormatForComparisonFailureMessage(\ - operand2_type::value_type* GTEST_CREF_WORKAROUND_ str, \ - const operand2_type& /*operand2*/) {\ - return operand1_printer(str);\ -}\ -inline String FormatForComparisonFailureMessage(\ - const operand2_type::value_type* GTEST_CREF_WORKAROUND_ str, \ - const operand2_type& /*operand2*/) {\ - return operand1_printer(str);\ -} - -GTEST_FORMAT_IMPL_(::std::string, String::ShowCStringQuoted) -#if GTEST_HAS_STD_WSTRING -GTEST_FORMAT_IMPL_(::std::wstring, String::ShowWideCStringQuoted) -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_STRING -GTEST_FORMAT_IMPL_(::string, String::ShowCStringQuoted) -#endif // GTEST_HAS_GLOBAL_STRING -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_FORMAT_IMPL_(::wstring, String::ShowWideCStringQuoted) -#endif // GTEST_HAS_GLOBAL_WSTRING - -#undef GTEST_FORMAT_IMPL_ - -// The next four overloads handle the case where the operand being -// printed is a char/wchar_t pointer and the other operand is not a -// string/wstring object. In such cases, we just print the operand as -// a pointer to be safe. -#define GTEST_FORMAT_CHAR_PTR_IMPL_(CharType) \ - template \ - String FormatForComparisonFailureMessage(CharType* GTEST_CREF_WORKAROUND_ p, \ - const T&) { \ - return PrintToString(static_cast(p)); \ - } - -GTEST_FORMAT_CHAR_PTR_IMPL_(char) -GTEST_FORMAT_CHAR_PTR_IMPL_(const char) -GTEST_FORMAT_CHAR_PTR_IMPL_(wchar_t) -GTEST_FORMAT_CHAR_PTR_IMPL_(const wchar_t) - -#undef GTEST_FORMAT_CHAR_PTR_IMPL_ - -// Constructs and returns the message for an equality assertion -// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. -// -// The first four parameters are the expressions used in the assertion -// and their values, as strings. For example, for ASSERT_EQ(foo, bar) -// where foo is 5 and bar is 6, we have: -// -// expected_expression: "foo" -// actual_expression: "bar" -// expected_value: "5" -// actual_value: "6" -// -// The ignoring_case parameter is true iff the assertion is a -// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will -// be inserted into the message. -GTEST_API_ AssertionResult EqFailure(const char* expected_expression, - const char* actual_expression, - const String& expected_value, - const String& actual_value, - bool ignoring_case); - -// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. -GTEST_API_ String GetBoolAssertionFailureMessage( - const AssertionResult& assertion_result, - const char* expression_text, - const char* actual_predicate_value, - const char* expected_predicate_value); - -// This template class represents an IEEE floating-point number -// (either single-precision or double-precision, depending on the -// template parameters). -// -// The purpose of this class is to do more sophisticated number -// comparison. (Due to round-off error, etc, it's very unlikely that -// two floating-points will be equal exactly. Hence a naive -// comparison by the == operation often doesn't work.) -// -// Format of IEEE floating-point: -// -// The most-significant bit being the leftmost, an IEEE -// floating-point looks like -// -// sign_bit exponent_bits fraction_bits -// -// Here, sign_bit is a single bit that designates the sign of the -// number. -// -// For float, there are 8 exponent bits and 23 fraction bits. -// -// For double, there are 11 exponent bits and 52 fraction bits. -// -// More details can be found at -// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. 
-// -// Template parameter: -// -// RawType: the raw floating-point type (either float or double) -template -class FloatingPoint { - public: - // Defines the unsigned integer type that has the same size as the - // floating point number. - typedef typename TypeWithSize::UInt Bits; - - // Constants. - - // # of bits in a number. - static const size_t kBitCount = 8*sizeof(RawType); - - // # of fraction bits in a number. - static const size_t kFractionBitCount = - std::numeric_limits::digits - 1; - - // # of exponent bits in a number. - static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; - - // The mask for the sign bit. - static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); - - // The mask for the fraction bits. - static const Bits kFractionBitMask = - ~static_cast(0) >> (kExponentBitCount + 1); - - // The mask for the exponent bits. - static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); - - // How many ULP's (Units in the Last Place) we want to tolerate when - // comparing two numbers. The larger the value, the more error we - // allow. A 0 value means that two numbers must be exactly the same - // to be considered equal. - // - // The maximum error of a single floating-point operation is 0.5 - // units in the last place. On Intel CPU's, all floating-point - // calculations are done with 80-bit precision, while double has 64 - // bits. Therefore, 4 should be enough for ordinary use. - // - // See the following article for more details on ULP: - // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm. - static const size_t kMaxUlps = 4; - - // Constructs a FloatingPoint from a raw floating-point number. - // - // On an Intel CPU, passing a non-normalized NAN (Not a Number) - // around may change its bits, although the new value is guaranteed - // to be also a NAN. Therefore, don't expect this constructor to - // preserve the bits in x when x is a NAN. - explicit FloatingPoint(const RawType& x) { u_.value_ = x; } - - // Static methods - - // Reinterprets a bit pattern as a floating-point number. - // - // This function is needed to test the AlmostEquals() method. - static RawType ReinterpretBits(const Bits bits) { - FloatingPoint fp(0); - fp.u_.bits_ = bits; - return fp.u_.value_; - } - - // Returns the floating-point number that represent positive infinity. - static RawType Infinity() { - return ReinterpretBits(kExponentBitMask); - } - - // Non-static methods - - // Returns the bits that represents this number. - const Bits &bits() const { return u_.bits_; } - - // Returns the exponent bits of this number. - Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } - - // Returns the fraction bits of this number. - Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } - - // Returns the sign bit of this number. - Bits sign_bit() const { return kSignBitMask & u_.bits_; } - - // Returns true iff this is NAN (not a number). - bool is_nan() const { - // It's a NAN if the exponent bits are all ones and the fraction - // bits are not entirely zeros. - return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); - } - - // Returns true iff this number is at most kMaxUlps ULP's away from - // rhs. In particular, this function: - // - // - returns false if either number is (or both are) NAN. - // - treats really large numbers as almost equal to infinity. - // - thinks +0.0 and -0.0 are 0 DLP's apart. 
- bool AlmostEquals(const FloatingPoint& rhs) const { - // The IEEE standard says that any comparison operation involving - // a NAN must return false. - if (is_nan() || rhs.is_nan()) return false; - - return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) - <= kMaxUlps; - } - - private: - // The data type used to store the actual floating-point number. - union FloatingPointUnion { - RawType value_; // The raw floating-point number. - Bits bits_; // The bits that represent the number. - }; - - // Converts an integer from the sign-and-magnitude representation to - // the biased representation. More precisely, let N be 2 to the - // power of (kBitCount - 1), an integer x is represented by the - // unsigned number x + N. - // - // For instance, - // - // -N + 1 (the most negative number representable using - // sign-and-magnitude) is represented by 1; - // 0 is represented by N; and - // N - 1 (the biggest number representable using - // sign-and-magnitude) is represented by 2N - 1. - // - // Read http://en.wikipedia.org/wiki/Signed_number_representations - // for more details on signed number representations. - static Bits SignAndMagnitudeToBiased(const Bits &sam) { - if (kSignBitMask & sam) { - // sam represents a negative number. - return ~sam + 1; - } else { - // sam represents a positive number. - return kSignBitMask | sam; - } - } - - // Given two numbers in the sign-and-magnitude representation, - // returns the distance between them as an unsigned number. - static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, - const Bits &sam2) { - const Bits biased1 = SignAndMagnitudeToBiased(sam1); - const Bits biased2 = SignAndMagnitudeToBiased(sam2); - return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); - } - - FloatingPointUnion u_; -}; - -// Typedefs the instances of the FloatingPoint template class that we -// care to use. -typedef FloatingPoint Float; -typedef FloatingPoint Double; - -// In order to catch the mistake of putting tests that use different -// test fixture classes in the same test case, we need to assign -// unique IDs to fixture classes and compare them. The TypeId type is -// used to hold such IDs. The user should treat TypeId as an opaque -// type: the only operation allowed on TypeId values is to compare -// them for equality using the == operator. -typedef const void* TypeId; - -template -class TypeIdHelper { - public: - // dummy_ must not have a const type. Otherwise an overly eager - // compiler (e.g. MSVC 7.1 & 8.0) may try to merge - // TypeIdHelper::dummy_ for different Ts as an "optimization". - static bool dummy_; -}; - -template -bool TypeIdHelper::dummy_ = false; - -// GetTypeId() returns the ID of type T. Different values will be -// returned for different types. Calling the function twice with the -// same type argument is guaranteed to return the same ID. -template -TypeId GetTypeId() { - // The compiler is required to allocate a different - // TypeIdHelper::dummy_ variable for each T used to instantiate - // the template. Therefore, the address of dummy_ is guaranteed to - // be unique. - return &(TypeIdHelper::dummy_); -} - -// Returns the type ID of ::testing::Test. Always call this instead -// of GetTypeId< ::testing::Test>() to get the type ID of -// ::testing::Test, as the latter may give the wrong result due to a -// suspected linker bug when compiling Google Test as a Mac OS X -// framework. 
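// --- Editor's illustration (not part of the deleted header) ---
// A standalone sketch of the ULP comparison and the sign-and-magnitude ->
// biased mapping explained above, specialized to 32-bit float and using our
// own names (it assumes unsigned int is 32 bits wide).
#include <cstring>
typedef unsigned int Bits32;
inline Bits32 BiasedBits(float f) {
  Bits32 bits;
  std::memcpy(&bits, &f, sizeof bits);               // raw IEEE-754 bit pattern
  const Bits32 kSign = 1u << 31;
  return (bits & kSign) ? ~bits + 1 : kSign | bits;  // negatives below, positives above
}
inline bool AlmostEqualUlps(float a, float b, Bits32 max_ulps) {
  if (a != a || b != b) return false;                // a NaN never compares equal
  const Bits32 x = BiasedBits(a), y = BiasedBits(b);
  return (x >= y ? x - y : y - x) <= max_ulps;       // distance in last-place units
}
// AlmostEqualUlps(1.0f, 1.0f + 1e-7f, 4) is true; +0.0f and -0.0f are 0 ULPs apart.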
-GTEST_API_ TypeId GetTestTypeId(); - -// Defines the abstract factory interface that creates instances -// of a Test object. -class TestFactoryBase { - public: - virtual ~TestFactoryBase() {} - - // Creates a test instance to run. The instance is both created and destroyed - // within TestInfoImpl::Run() - virtual Test* CreateTest() = 0; - - protected: - TestFactoryBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); -}; - -// This class provides implementation of TeastFactoryBase interface. -// It is used in TEST and TEST_F macros. -template -class TestFactoryImpl : public TestFactoryBase { - public: - virtual Test* CreateTest() { return new TestClass; } -}; - -#if GTEST_OS_WINDOWS - -// Predicate-formatters for implementing the HRESULT checking macros -// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} -// We pass a long instead of HRESULT to avoid causing an -// include dependency for the HRESULT type. -GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, - long hr); // NOLINT -GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, - long hr); // NOLINT - -#endif // GTEST_OS_WINDOWS - -// Types of SetUpTestCase() and TearDownTestCase() functions. -typedef void (*SetUpTestCaseFunc)(); -typedef void (*TearDownTestCaseFunc)(); - -// Creates a new TestInfo object and registers it with Google Test; -// returns the created object. -// -// Arguments: -// -// test_case_name: name of the test case -// name: name of the test -// type_param the name of the test's type parameter, or NULL if -// this is not a typed or a type-parameterized test. -// value_param text representation of the test's value parameter, -// or NULL if this is not a type-parameterized test. -// fixture_class_id: ID of the test fixture class -// set_up_tc: pointer to the function that sets up the test case -// tear_down_tc: pointer to the function that tears down the test case -// factory: pointer to the factory that creates a test object. -// The newly created TestInfo instance will assume -// ownership of the factory object. -GTEST_API_ TestInfo* MakeAndRegisterTestInfo( - const char* test_case_name, const char* name, - const char* type_param, - const char* value_param, - TypeId fixture_class_id, - SetUpTestCaseFunc set_up_tc, - TearDownTestCaseFunc tear_down_tc, - TestFactoryBase* factory); - -// If *pstr starts with the given prefix, modifies *pstr to be right -// past the prefix and returns true; otherwise leaves *pstr unchanged -// and returns false. None of pstr, *pstr, and prefix can be NULL. -GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); - -#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// State of the definition of a type-parameterized test case. -class GTEST_API_ TypedTestCasePState { - public: - TypedTestCasePState() : registered_(false) {} - - // Adds the given test name to defined_test_names_ and return true - // if the test case hasn't been registered; otherwise aborts the - // program. - bool AddTestName(const char* file, int line, const char* case_name, - const char* test_name) { - if (registered_) { - fprintf(stderr, "%s Test %s must be defined before " - "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", - FormatFileLocation(file, line).c_str(), test_name, case_name); - fflush(stderr); - posix::Abort(); - } - defined_test_names_.insert(test_name); - return true; - } - - // Verifies that registered_tests match the test names in - // defined_test_names_; returns registered_tests if successful, or - // aborts the program otherwise. 
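// --- Editor's illustration (not part of the deleted header) ---
// A minimal sketch, with our own names, of the TypeIdHelper idea above: each
// instantiation of Tag<T> owns a distinct static variable, so that variable's
// address serves as a process-wide unique identifier for the type T.
typedef const void* MyTypeId;
template <typename T> struct Tag { static bool dummy; };
template <typename T> bool Tag<T>::dummy = false;
template <typename T> MyTypeId MyGetTypeId() { return &Tag<T>::dummy; }
// MyGetTypeId<int>() == MyGetTypeId<int>(), but differs from MyGetTypeId<double>().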
- const char* VerifyRegisteredTestNames( - const char* file, int line, const char* registered_tests); - - private: - bool registered_; - ::std::set defined_test_names_; -}; - -// Skips to the first non-space char after the first comma in 'str'; -// returns NULL if no comma is found in 'str'. -inline const char* SkipComma(const char* str) { - const char* comma = strchr(str, ','); - if (comma == NULL) { - return NULL; - } - while (IsSpace(*(++comma))) {} - return comma; -} - -// Returns the prefix of 'str' before the first comma in it; returns -// the entire string if it contains no comma. -inline String GetPrefixUntilComma(const char* str) { - const char* comma = strchr(str, ','); - return comma == NULL ? String(str) : String(str, comma - str); -} - -// TypeParameterizedTest::Register() -// registers a list of type-parameterized tests with Google Test. The -// return value is insignificant - we just need to return something -// such that we can call this function in a namespace scope. -// -// Implementation note: The GTEST_TEMPLATE_ macro declares a template -// template parameter. It's defined in gtest-type-util.h. -template -class TypeParameterizedTest { - public: - // 'index' is the index of the test in the type list 'Types' - // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, - // Types). Valid values for 'index' are [0, N - 1] where N is the - // length of Types. - static bool Register(const char* prefix, const char* case_name, - const char* test_names, int index) { - typedef typename Types::Head Type; - typedef Fixture FixtureClass; - typedef typename GTEST_BIND_(TestSel, Type) TestClass; - - // First, registers the first type-parameterized test in the type - // list. - MakeAndRegisterTestInfo( - String::Format("%s%s%s/%d", prefix, prefix[0] == '\0' ? "" : "/", - case_name, index).c_str(), - GetPrefixUntilComma(test_names).c_str(), - GetTypeName().c_str(), - NULL, // No value parameter. - GetTypeId(), - TestClass::SetUpTestCase, - TestClass::TearDownTestCase, - new TestFactoryImpl); - - // Next, recurses (at compile time) with the tail of the type list. - return TypeParameterizedTest - ::Register(prefix, case_name, test_names, index + 1); - } -}; - -// The base case for the compile time recursion. -template -class TypeParameterizedTest { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/, int /*index*/) { - return true; - } -}; - -// TypeParameterizedTestCase::Register() -// registers *all combinations* of 'Tests' and 'Types' with Google -// Test. The return value is insignificant - we just need to return -// something such that we can call this function in a namespace scope. -template -class TypeParameterizedTestCase { - public: - static bool Register(const char* prefix, const char* case_name, - const char* test_names) { - typedef typename Tests::Head Head; - - // First, register the first test in 'Test' for each type in 'Types'. - TypeParameterizedTest::Register( - prefix, case_name, test_names, 0); - - // Next, recurses (at compile time) with the tail of the test list. - return TypeParameterizedTestCase - ::Register(prefix, case_name, SkipComma(test_names)); - } -}; - -// The base case for the compile time recursion. 
-template -class TypeParameterizedTestCase { - public: - static bool Register(const char* /*prefix*/, const char* /*case_name*/, - const char* /*test_names*/) { - return true; - } -}; - -#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P - -// Returns the current OS stack trace as a String. -// -// The maximum number of stack frames to be included is specified by -// the gtest_stack_trace_depth flag. The skip_count parameter -// specifies the number of top frames to be skipped, which doesn't -// count against the number of frames to be included. -// -// For example, if Foo() calls Bar(), which in turn calls -// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in -// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. -GTEST_API_ String GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, - int skip_count); - -// Helpers for suppressing warnings on unreachable code or constant -// condition. - -// Always returns true. -GTEST_API_ bool AlwaysTrue(); - -// Always returns false. -inline bool AlwaysFalse() { return !AlwaysTrue(); } - -// Helper for suppressing false warning from Clang on a const char* -// variable declared in a conditional expression always being NULL in -// the else branch. -struct GTEST_API_ ConstCharPtr { - ConstCharPtr(const char* str) : value(str) {} - operator bool() const { return true; } - const char* value; -}; - -// A simple Linear Congruential Generator for generating random -// numbers with a uniform distribution. Unlike rand() and srand(), it -// doesn't use global state (and therefore can't interfere with user -// code). Unlike rand_r(), it's portable. An LCG isn't very random, -// but it's good enough for our purposes. -class GTEST_API_ Random { - public: - static const UInt32 kMaxRange = 1u << 31; - - explicit Random(UInt32 seed) : state_(seed) {} - - void Reseed(UInt32 seed) { state_ = seed; } - - // Generates a random number from [0, range). Crashes if 'range' is - // 0 or greater than kMaxRange. - UInt32 Generate(UInt32 range); - - private: - UInt32 state_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); -}; - -// Defining a variable of type CompileAssertTypesEqual will cause a -// compiler error iff T1 and T2 are different types. -template -struct CompileAssertTypesEqual; - -template -struct CompileAssertTypesEqual { -}; - -// Removes the reference from a type if it is a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::remove_reference, which is not widely available yet. -template -struct RemoveReference { typedef T type; }; // NOLINT -template -struct RemoveReference { typedef T type; }; // NOLINT - -// A handy wrapper around RemoveReference that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_REFERENCE_(T) \ - typename ::testing::internal::RemoveReference::type - -// Removes const from a type if it is a const type, otherwise leaves -// it unchanged. This is the same as tr1::remove_const, which is not -// widely available yet. -template -struct RemoveConst { typedef T type; }; // NOLINT -template -struct RemoveConst { typedef T type; }; // NOLINT - -// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above -// definition to fail to remove the const in 'const int[3]' and 'const -// char[3][4]'. The following specialization works around the bug. -// However, it causes trouble with GCC and thus needs to be -// conditionally compiled. 
-#if defined(_MSC_VER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) -template -struct RemoveConst { - typedef typename RemoveConst::type type[N]; -}; -#endif - -// A handy wrapper around RemoveConst that works when the argument -// T depends on template parameters. -#define GTEST_REMOVE_CONST_(T) \ - typename ::testing::internal::RemoveConst::type - -// Turns const U&, U&, const U, and U all into U. -#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ - GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) - -// Adds reference to a type if it is not a reference type, -// otherwise leaves it unchanged. This is the same as -// tr1::add_reference, which is not widely available yet. -template -struct AddReference { typedef T& type; }; // NOLINT -template -struct AddReference { typedef T& type; }; // NOLINT - -// A handy wrapper around AddReference that works when the argument T -// depends on template parameters. -#define GTEST_ADD_REFERENCE_(T) \ - typename ::testing::internal::AddReference::type - -// Adds a reference to const on top of T as necessary. For example, -// it transforms -// -// char ==> const char& -// const char ==> const char& -// char& ==> const char& -// const char& ==> const char& -// -// The argument T must depend on some template parameters. -#define GTEST_REFERENCE_TO_CONST_(T) \ - GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) - -// ImplicitlyConvertible::value is a compile-time bool -// constant that's true iff type From can be implicitly converted to -// type To. -template -class ImplicitlyConvertible { - private: - // We need the following helper functions only for their types. - // They have no implementations. - - // MakeFrom() is an expression whose type is From. We cannot simply - // use From(), as the type From may not have a public default - // constructor. - static From MakeFrom(); - - // These two functions are overloaded. Given an expression - // Helper(x), the compiler will pick the first version if x can be - // implicitly converted to type To; otherwise it will pick the - // second version. - // - // The first version returns a value of size 1, and the second - // version returns a value of size 2. Therefore, by checking the - // size of Helper(x), which can be done at compile time, we can tell - // which version of Helper() is used, and hence whether x can be - // implicitly converted to type To. - static char Helper(To); - static char (&Helper(...))[2]; // NOLINT - - // We have to put the 'public' section after the 'private' section, - // or MSVC refuses to compile the code. - public: - // MSVC warns about implicitly converting from double to int for - // possible loss of data, so we need to temporarily disable the - // warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4244) // Temporarily disables warning 4244. - - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -# pragma warning(pop) // Restores the warning state. -#elif defined(__BORLANDC__) - // C++Builder cannot use member overload resolution during template - // instantiation. The simplest workaround is to use its C++0x type traits - // functions (C++Builder 2009 and above only). 
- static const bool value = __is_convertible(From, To); -#else - static const bool value = - sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; -#endif // _MSV_VER -}; -template -const bool ImplicitlyConvertible::value; - -// IsAProtocolMessage::value is a compile-time bool constant that's -// true iff T is type ProtocolMessage, proto2::Message, or a subclass -// of those. -template -struct IsAProtocolMessage - : public bool_constant< - ImplicitlyConvertible::value || - ImplicitlyConvertible::value> { -}; - -// When the compiler sees expression IsContainerTest(0), if C is an -// STL-style container class, the first overload of IsContainerTest -// will be viable (since both C::iterator* and C::const_iterator* are -// valid types and NULL can be implicitly converted to them). It will -// be picked over the second overload as 'int' is a perfect match for -// the type of argument 0. If C::iterator or C::const_iterator is not -// a valid type, the first overload is not viable, and the second -// overload will be picked. Therefore, we can determine whether C is -// a container class by checking the type of IsContainerTest(0). -// The value of the expression is insignificant. -// -// Note that we look for both C::iterator and C::const_iterator. The -// reason is that C++ injects the name of a class as a member of the -// class itself (e.g. you can refer to class iterator as either -// 'iterator' or 'iterator::iterator'). If we look for C::iterator -// only, for example, we would mistakenly think that a class named -// iterator is an STL container. -// -// Also note that the simpler approach of overloading -// IsContainerTest(typename C::const_iterator*) and -// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. -typedef int IsContainer; -template -IsContainer IsContainerTest(int /* dummy */, - typename C::iterator* /* it */ = NULL, - typename C::const_iterator* /* const_it */ = NULL) { - return 0; -} - -typedef char IsNotContainer; -template -IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } - -// EnableIf::type is void when 'Cond' is true, and -// undefined when 'Cond' is false. To use SFINAE to make a function -// overload only apply when a particular expression is true, add -// "typename EnableIf::type* = 0" as the last parameter. -template struct EnableIf; -template<> struct EnableIf { typedef void type; }; // NOLINT - -// Utilities for native arrays. - -// ArrayEq() compares two k-dimensional native arrays using the -// elements' operator==, where k can be any integer >= 0. When k is -// 0, ArrayEq() degenerates into comparing a single pair of values. - -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs); - -// This generic version is used when k is 0. -template -inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } - -// This overload is used when k >= 1. -template -inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { - return internal::ArrayEq(lhs, N, rhs); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous ArrayEq() function, arrays with different sizes would -// lead to different copies of the template code. -template -bool ArrayEq(const T* lhs, size_t size, const U* rhs) { - for (size_t i = 0; i != size; i++) { - if (!internal::ArrayEq(lhs[i], rhs[i])) - return false; - } - return true; -} - -// Finds the first element in the iterator range [begin, end) that -// equals elem. Element may be a native array type itself. 
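// --- Editor's illustration (not part of the deleted header) ---
// A standalone sketch, with our own names, of the conversion probe behind
// ImplicitlyConvertible above: Probe(To) wins overload resolution exactly when
// an expression of type From converts implicitly to To, and the two return
// types have different sizes, so sizeof yields the answer at compile time.
template <typename From, typename To>
class ConvertsTo {
  static From Make();                     // never defined; used only for its type
  static char Probe(To);                  // picked when From -> To converts
  static char (&Probe(...))[2];           // fallback otherwise
 public:
  static const bool value = sizeof(Probe(Make())) == 1;
};
// ConvertsTo<float, double>::value is true; ConvertsTo<const char*, int>::value is false.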
-template -Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { - for (Iter it = begin; it != end; ++it) { - if (internal::ArrayEq(*it, elem)) - return it; - } - return end; -} - -// CopyArray() copies a k-dimensional native array using the elements' -// operator=, where k can be any integer >= 0. When k is 0, -// CopyArray() degenerates into copying a single value. - -template -void CopyArray(const T* from, size_t size, U* to); - -// This generic version is used when k is 0. -template -inline void CopyArray(const T& from, U* to) { *to = from; } - -// This overload is used when k >= 1. -template -inline void CopyArray(const T(&from)[N], U(*to)[N]) { - internal::CopyArray(from, N, *to); -} - -// This helper reduces code bloat. If we instead put its logic inside -// the previous CopyArray() function, arrays with different sizes -// would lead to different copies of the template code. -template -void CopyArray(const T* from, size_t size, U* to) { - for (size_t i = 0; i != size; i++) { - internal::CopyArray(from[i], to + i); - } -} - -// The relation between an NativeArray object (see below) and the -// native array it represents. -enum RelationToSource { - kReference, // The NativeArray references the native array. - kCopy // The NativeArray makes a copy of the native array and - // owns the copy. -}; - -// Adapts a native array to a read-only STL-style container. Instead -// of the complete STL container concept, this adaptor only implements -// members useful for Google Mock's container matchers. New members -// should be added as needed. To simplify the implementation, we only -// support Element being a raw type (i.e. having no top-level const or -// reference modifier). It's the client's responsibility to satisfy -// this requirement. Element can be an array type itself (hence -// multi-dimensional arrays are supported). -template -class NativeArray { - public: - // STL-style container typedefs. - typedef Element value_type; - typedef Element* iterator; - typedef const Element* const_iterator; - - // Constructs from a native array. - NativeArray(const Element* array, size_t count, RelationToSource relation) { - Init(array, count, relation); - } - - // Copy constructor. - NativeArray(const NativeArray& rhs) { - Init(rhs.array_, rhs.size_, rhs.relation_to_source_); - } - - ~NativeArray() { - // Ensures that the user doesn't instantiate NativeArray with a - // const or reference type. - static_cast(StaticAssertTypeEqHelper()); - if (relation_to_source_ == kCopy) - delete[] array_; - } - - // STL-style container methods. - size_t size() const { return size_; } - const_iterator begin() const { return array_; } - const_iterator end() const { return array_ + size_; } - bool operator==(const NativeArray& rhs) const { - return size() == rhs.size() && - ArrayEq(begin(), size(), rhs.begin()); - } - - private: - // Initializes this object; makes a copy of the input array if - // 'relation' is kCopy. 
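// For reference, a small usage sketch of the ArrayEq() helpers defined above.
// Nested native arrays are compared element-wise, one dimension at a time
// (the wrapper function name is illustrative only):
//
//   void ArrayEqSketch() {
//     int a[2][3] = { {1, 2, 3}, {4, 5, 6} };
//     int b[2][3] = { {1, 2, 3}, {4, 5, 6} };
//     int c[2][3] = { {1, 2, 3}, {4, 6, 5} };
//     bool same = ::testing::internal::ArrayEq(a, b);  // true
//     bool diff = ::testing::internal::ArrayEq(a, c);  // false
//     (void)same; (void)diff;
//   }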
- void Init(const Element* array, size_t a_size, RelationToSource relation) { - if (relation == kReference) { - array_ = array; - } else { - Element* const copy = new Element[a_size]; - CopyArray(array, a_size, copy); - array_ = copy; - } - size_ = a_size; - relation_to_source_ = relation; - } - - const Element* array_; - size_t size_; - RelationToSource relation_to_source_; - - GTEST_DISALLOW_ASSIGN_(NativeArray); -}; - -} // namespace internal -} // namespace testing - -#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ - ::testing::internal::AssertHelper(result_type, file, line, message) \ - = ::testing::Message() - -#define GTEST_MESSAGE_(message, result_type) \ - GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) - -#define GTEST_FATAL_FAILURE_(message) \ - return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) - -#define GTEST_NONFATAL_FAILURE_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) - -#define GTEST_SUCCESS_(message) \ - GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) - -// Suppresses MSVC warnings 4072 (unreachable code) for the code following -// statement if it returns or throws (or doesn't return or throw in some -// situations). -#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ - if (::testing::internal::AlwaysTrue()) { statement; } - -#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::ConstCharPtr gtest_msg = "") { \ - bool gtest_caught_expected = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (expected_exception const&) { \ - gtest_caught_expected = true; \ - } \ - catch (...) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws a different type."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - if (!gtest_caught_expected) { \ - gtest_msg.value = \ - "Expected: " #statement " throws an exception of type " \ - #expected_exception ".\n Actual: it throws nothing."; \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ - fail(gtest_msg.value) - -#define GTEST_TEST_NO_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ - fail("Expected: " #statement " doesn't throw an exception.\n" \ - " Actual: it throws.") - -#define GTEST_TEST_ANY_THROW_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - bool gtest_caught_any = false; \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } \ - catch (...) { \ - gtest_caught_any = true; \ - } \ - if (!gtest_caught_any) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ - fail("Expected: " #statement " throws an exception.\n" \ - " Actual: it doesn't.") - - -// Implements Boolean test assertions such as EXPECT_TRUE. expression can be -// either a boolean expression or an AssertionResult. 
text is a textual -// represenation of expression as it was passed into the EXPECT_TRUE. -#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar_ = \ - ::testing::AssertionResult(expression)) \ - ; \ - else \ - fail(::testing::internal::GetBoolAssertionFailureMessage(\ - gtest_ar_, text, #actual, #expected).c_str()) - -#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ - fail("Expected: " #statement " doesn't generate new fatal " \ - "failures in the current thread.\n" \ - " Actual: it does.") - -// Expands to the name of the class that implements the given test. -#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - test_case_name##_##test_name##_Test - -// Helper macro for defining tests. -#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ -class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ - public:\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ - private:\ - virtual void TestBody();\ - static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ -};\ -\ -::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ - ::test_info_ =\ - ::testing::internal::MakeAndRegisterTestInfo(\ - #test_case_name, #test_name, NULL, NULL, \ - (parent_id), \ - parent_class::SetUpTestCase, \ - parent_class::TearDownTestCase, \ - new ::testing::internal::TestFactoryImpl<\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ -void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the public API for death tests. It is -// #included by gtest.h so a user doesn't need to include this -// directly. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ - -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines internal utilities needed for implementing -// death tests. They are subject to change without notice. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - - -#include - -namespace testing { -namespace internal { - -GTEST_DECLARE_string_(internal_run_death_test); - -// Names of the flags (needed for parsing Google Test flags). -const char kDeathTestStyleFlag[] = "death_test_style"; -const char kDeathTestUseFork[] = "death_test_use_fork"; -const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; - -#if GTEST_HAS_DEATH_TEST - -// DeathTest is a class that hides much of the complexity of the -// GTEST_DEATH_TEST_ macro. 
It is abstract; its static Create method -// returns a concrete class that depends on the prevailing death test -// style, as defined by the --gtest_death_test_style and/or -// --gtest_internal_run_death_test flags. - -// In describing the results of death tests, these terms are used with -// the corresponding definitions: -// -// exit status: The integer exit information in the format specified -// by wait(2) -// exit code: The integer code passed to exit(3), _exit(2), or -// returned from main() -class GTEST_API_ DeathTest { - public: - // Create returns false if there was an error determining the - // appropriate action to take for the current death test; for example, - // if the gtest_death_test_style flag is set to an invalid value. - // The LastMessage method will return a more detailed message in that - // case. Otherwise, the DeathTest pointer pointed to by the "test" - // argument is set. If the death test should be skipped, the pointer - // is set to NULL; otherwise, it is set to the address of a new concrete - // DeathTest object that controls the execution of the current test. - static bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); - DeathTest(); - virtual ~DeathTest() { } - - // A helper class that aborts a death test when it's deleted. - class ReturnSentinel { - public: - explicit ReturnSentinel(DeathTest* test) : test_(test) { } - ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } - private: - DeathTest* const test_; - GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); - } GTEST_ATTRIBUTE_UNUSED_; - - // An enumeration of possible roles that may be taken when a death - // test is encountered. EXECUTE means that the death test logic should - // be executed immediately. OVERSEE means that the program should prepare - // the appropriate environment for a child process to execute the death - // test, then wait for it to complete. - enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; - - // An enumeration of the three reasons that a test might be aborted. - enum AbortReason { - TEST_ENCOUNTERED_RETURN_STATEMENT, - TEST_THREW_EXCEPTION, - TEST_DID_NOT_DIE - }; - - // Assumes one of the above roles. - virtual TestRole AssumeRole() = 0; - - // Waits for the death test to finish and returns its status. - virtual int Wait() = 0; - - // Returns true if the death test passed; that is, the test process - // exited during the test, its exit status matches a user-supplied - // predicate, and its stderr output matches a user-supplied regular - // expression. - // The user-supplied predicate may be a macro expression rather - // than a function pointer or functor, or else Wait and Passed could - // be combined. - virtual bool Passed(bool exit_status_ok) = 0; - - // Signals that the death test did not die as expected. - virtual void Abort(AbortReason reason) = 0; - - // Returns a human-readable outcome message regarding the outcome of - // the last death test. - static const char* LastMessage(); - - static void set_last_death_test_message(const String& message); - - private: - // A string containing a description of the outcome of the last death test. - static String last_death_test_message_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); -}; - -// Factory interface for death tests. May be mocked out for testing. 
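// As a concrete illustration of the exit-status/exit-code distinction above:
// a user-supplied predicate receives the raw wait(2) status, not the exit
// code. This is a POSIX-only sketch with an illustrative name; it assumes
// <sys/wait.h> has been included and is usable as the predicate argument of
// ASSERT_EXIT()/EXPECT_EXIT().
#if !GTEST_OS_WINDOWS
inline bool ExitedCleanlyWith42(int exit_status) {
  return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == 42;
}
#endif  // !GTEST_OS_WINDOWS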
-class DeathTestFactory { - public: - virtual ~DeathTestFactory() { } - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test) = 0; -}; - -// A concrete DeathTestFactory implementation for normal use. -class DefaultDeathTestFactory : public DeathTestFactory { - public: - virtual bool Create(const char* statement, const RE* regex, - const char* file, int line, DeathTest** test); -}; - -// Returns true if exit_status describes a process that was terminated -// by a signal, or exited normally with a nonzero exit code. -GTEST_API_ bool ExitedUnsuccessfully(int exit_status); - -// Traps C++ exceptions escaping statement and reports them as test -// failures. Note that trapping SEH exceptions is not implemented here. -# if GTEST_HAS_EXCEPTIONS -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - try { \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - } catch (const ::std::exception& gtest_exception) { \ - fprintf(\ - stderr, \ - "\n%s: Caught std::exception-derived exception escaping the " \ - "death test statement. Exception message: %s\n", \ - ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ - gtest_exception.what()); \ - fflush(stderr); \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } catch (...) { \ - death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ - } - -# else -# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) - -# endif - -// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, -// ASSERT_EXIT*, and EXPECT_EXIT*. -# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - const ::testing::internal::RE& gtest_regex = (regex); \ - ::testing::internal::DeathTest* gtest_dt; \ - if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ - __FILE__, __LINE__, >est_dt)) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - if (gtest_dt != NULL) { \ - ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ - gtest_dt_ptr(gtest_dt); \ - switch (gtest_dt->AssumeRole()) { \ - case ::testing::internal::DeathTest::OVERSEE_TEST: \ - if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ - goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ - } \ - break; \ - case ::testing::internal::DeathTest::EXECUTE_TEST: { \ - ::testing::internal::DeathTest::ReturnSentinel \ - gtest_sentinel(gtest_dt); \ - GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ - gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ - break; \ - } \ - default: \ - break; \ - } \ - } \ - } else \ - GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ - fail(::testing::internal::DeathTest::LastMessage()) -// The symbol "fail" here expands to something into which a message -// can be streamed. - -// A class representing the parsed contents of the -// --gtest_internal_run_death_test flag, as it existed when -// RUN_ALL_TESTS was called. 
-class InternalRunDeathTestFlag { - public: - InternalRunDeathTestFlag(const String& a_file, - int a_line, - int an_index, - int a_write_fd) - : file_(a_file), line_(a_line), index_(an_index), - write_fd_(a_write_fd) {} - - ~InternalRunDeathTestFlag() { - if (write_fd_ >= 0) - posix::Close(write_fd_); - } - - String file() const { return file_; } - int line() const { return line_; } - int index() const { return index_; } - int write_fd() const { return write_fd_; } - - private: - String file_; - int line_; - int index_; - int write_fd_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); -}; - -// Returns a newly created InternalRunDeathTestFlag object with fields -// initialized from the GTEST_FLAG(internal_run_death_test) flag if -// the flag is specified; otherwise returns NULL. -InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); - -#else // GTEST_HAS_DEATH_TEST - -// This macro is used for implementing macros such as -// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where -// death tests are not supported. Those macros must compile on such systems -// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on -// systems that support death tests. This allows one to write such a macro -// on a system that does not support death tests and be sure that it will -// compile on a death-test supporting system. -// -// Parameters: -// statement - A statement that a macro such as EXPECT_DEATH would test -// for program termination. This macro has to make sure this -// statement is compiled but not executed, to ensure that -// EXPECT_DEATH_IF_SUPPORTED compiles with a certain -// parameter iff EXPECT_DEATH compiles with it. -// regex - A regex that a macro such as EXPECT_DEATH would use to test -// the output of statement. This parameter has to be -// compiled but not evaluated by this macro, to ensure that -// this macro only accepts expressions that a macro such as -// EXPECT_DEATH would accept. -// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED -// and a return statement for ASSERT_DEATH_IF_SUPPORTED. -// This ensures that ASSERT_DEATH_IF_SUPPORTED will not -// compile inside functions where ASSERT_DEATH doesn't -// compile. -// -// The branch that has an always false condition is used to ensure that -// statement and regex are compiled (and thus syntactically correct) but -// never executed. The unreachable code macro protects the terminator -// statement from generating an 'unreachable code' warning in case -// statement unconditionally returns or throws. The Message constructor at -// the end allows the syntax of streaming additional messages into the -// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. -# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() - -#endif // GTEST_HAS_DEATH_TEST - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ - -namespace testing { - -// This flag controls the style of death tests. 
Valid values are "threadsafe", -// meaning that the death test child process will re-execute the test binary -// from the start, running only a single death test, or "fast", -// meaning that the child process will execute the test logic immediately -// after forking. -GTEST_DECLARE_string_(death_test_style); - -#if GTEST_HAS_DEATH_TEST - -// The following macros are useful for writing death tests. - -// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is -// executed: -// -// 1. It generates a warning if there is more than one active -// thread. This is because it's safe to fork() or clone() only -// when there is a single thread. -// -// 2. The parent process clone()s a sub-process and runs the death -// test in it; the sub-process exits with code 0 at the end of the -// death test, if it hasn't exited already. -// -// 3. The parent process waits for the sub-process to terminate. -// -// 4. The parent process checks the exit code and error message of -// the sub-process. -// -// Examples: -// -// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number"); -// for (int i = 0; i < 5; i++) { -// EXPECT_DEATH(server.ProcessRequest(i), -// "Invalid request .* in ProcessRequest()") -// << "Failed to die on request " << i); -// } -// -// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting"); -// -// bool KilledBySIGHUP(int exit_code) { -// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP; -// } -// -// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!"); -// -// On the regular expressions used in death tests: -// -// On POSIX-compliant systems (*nix), we use the library, -// which uses the POSIX extended regex syntax. -// -// On other platforms (e.g. Windows), we only support a simple regex -// syntax implemented as part of Google Test. This limited -// implementation should be enough most of the time when writing -// death tests; though it lacks many features you can find in PCRE -// or POSIX extended regex syntax. For example, we don't support -// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and -// repetition count ("x{5,7}"), among others. -// -// Below is the syntax that we do support. We chose it to be a -// subset of both PCRE and POSIX extended regex, so it's easy to -// learn wherever you come from. In the following: 'A' denotes a -// literal character, period (.), or a single \\ escape sequence; -// 'x' and 'y' denote regular expressions; 'm' and 'n' are for -// natural numbers. -// -// c matches any literal character c -// \\d matches any decimal digit -// \\D matches any character that's not a decimal digit -// \\f matches \f -// \\n matches \n -// \\r matches \r -// \\s matches any ASCII whitespace, including \n -// \\S matches any character that's not a whitespace -// \\t matches \t -// \\v matches \v -// \\w matches any letter, _, or decimal digit -// \\W matches any character that \\w doesn't match -// \\c matches any literal character c, which must be a punctuation -// . matches any single character except \n -// A? matches 0 or 1 occurrences of A -// A* matches 0 or many occurrences of A -// A+ matches 1 or many occurrences of A -// ^ matches the beginning of a string (not that of each line) -// $ matches the end of a string (not that of each line) -// xy matches x followed by y -// -// If you accidentally use PCRE or POSIX extended regex features -// not implemented by us, you will get a run-time failure. In that -// case, please try to rewrite your regular expression within the -// above syntax. 
-// -// This implementation is *not* meant to be as highly tuned or robust -// as a compiled regex library, but should perform well enough for a -// death test, which already incurs significant overhead by launching -// a child process. -// -// Known caveats: -// -// A "threadsafe" style death test obtains the path to the test -// program from argv[0] and re-executes it in the sub-process. For -// simplicity, the current implementation doesn't search the PATH -// when launching the sub-process. This means that the user must -// invoke the test program via a path that contains at least one -// path separator (e.g. path/to/foo_test and -// /absolute/path/to/bar_test are fine, but foo_test is not). This -// is rarely a problem as people usually don't put the test binary -// directory in PATH. -// -// TODO(wan@google.com): make thread-safe death tests search the PATH. - -// Asserts that a given statement causes the program to exit, with an -// integer exit status that satisfies predicate, and emitting error output -// that matches regex. -# define ASSERT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) - -// Like ASSERT_EXIT, but continues on to successive tests in the -// test case, if any: -# define EXPECT_EXIT(statement, predicate, regex) \ - GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) - -// Asserts that a given statement causes the program to exit, either by -// explicitly exiting with a nonzero exit code or being killed by a -// signal, and emitting error output that matches regex. -# define ASSERT_DEATH(statement, regex) \ - ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Like ASSERT_DEATH, but continues on to successive tests in the -// test case, if any: -# define EXPECT_DEATH(statement, regex) \ - EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) - -// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: - -// Tests that an exit code describes a normal exit with a given exit code. -class GTEST_API_ ExitedWithCode { - public: - explicit ExitedWithCode(int exit_code); - bool operator()(int exit_status) const; - private: - // No implementation - assignment is unsupported. - void operator=(const ExitedWithCode& other); - - const int exit_code_; -}; - -# if !GTEST_OS_WINDOWS -// Tests that an exit code describes an exit due to termination by a -// given signal. -class GTEST_API_ KilledBySignal { - public: - explicit KilledBySignal(int signum); - bool operator()(int exit_status) const; - private: - const int signum_; -}; -# endif // !GTEST_OS_WINDOWS - -// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. -// The death testing framework causes this to have interesting semantics, -// since the sideeffects of the call are only visible in opt mode, and not -// in debug mode. -// -// In practice, this can be used to test functions that utilize the -// LOG(DFATAL) macro using the following style: -// -// int DieInDebugOr12(int* sideeffect) { -// if (sideeffect) { -// *sideeffect = 12; -// } -// LOG(DFATAL) << "death"; -// return 12; -// } -// -// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { -// int sideeffect = 0; -// // Only asserts in dbg. -// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); -// -// #ifdef NDEBUG -// // opt-mode has sideeffect visible. -// EXPECT_EQ(12, sideeffect); -// #else -// // dbg-mode no visible sideeffect. 
-// EXPECT_EQ(0, sideeffect); -// #endif -// } -// -// This will assert that DieInDebugReturn12InOpt() crashes in debug -// mode, usually due to a DCHECK or LOG(DFATAL), but returns the -// appropriate fallback value (12 in this case) in opt mode. If you -// need to test that a function has appropriate side-effects in opt -// mode, include assertions against the side-effects. A general -// pattern for this is: -// -// EXPECT_DEBUG_DEATH({ -// // Side-effects here will have an effect after this statement in -// // opt mode, but none in debug mode. -// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); -// }, "death"); -// -# ifdef NDEBUG - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - do { statement; } while (::testing::internal::AlwaysFalse()) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - do { statement; } while (::testing::internal::AlwaysFalse()) - -# else - -# define EXPECT_DEBUG_DEATH(statement, regex) \ - EXPECT_DEATH(statement, regex) - -# define ASSERT_DEBUG_DEATH(statement, regex) \ - ASSERT_DEATH(statement, regex) - -# endif // NDEBUG for EXPECT_DEBUG_DEATH -#endif // GTEST_HAS_DEATH_TEST - -// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and -// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if -// death tests are supported; otherwise they just issue a warning. This is -// useful when you are combining death test assertions with normal test -// assertions in one test. -#if GTEST_HAS_DEATH_TEST -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - EXPECT_DEATH(statement, regex) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - ASSERT_DEATH(statement, regex) -#else -# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) -# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) -#endif - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ -// Copyright 2005, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
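// As an illustration of combining a death assertion with ordinary assertions,
// EXPECT_DEATH_IF_SUPPORTED keeps the test compiling (and merely warning) on
// platforms without death-test support. The tested functions are made up for
// this example:
//
//   TEST(InputValidationDeathTest, RejectsBadInput) {
//     EXPECT_DEATH_IF_SUPPORTED(HandleBadInput(), "bad input")
//         << "HandleBadInput() should abort with a diagnostic";
//     EXPECT_TRUE(SystemStillUsable());  // normal assertions mix freely
//   }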
-// -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) -// -// This header file defines the Message class. -// -// IMPORTANT NOTE: Due to limitation of the C++ language, we have to -// leave some internal implementation details in this header file. -// They are clearly marked by comments like this: -// -// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -// -// Such code is NOT meant to be used by a user directly, and is subject -// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user -// program! - -#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ - -#include - - -namespace testing { - -// The Message class works like an ostream repeater. -// -// Typical usage: -// -// 1. You stream a bunch of values to a Message object. -// It will remember the text in a stringstream. -// 2. Then you stream the Message object to an ostream. -// This causes the text in the Message to be streamed -// to the ostream. -// -// For example; -// -// testing::Message foo; -// foo << 1 << " != " << 2; -// std::cout << foo; -// -// will print "1 != 2". -// -// Message is not intended to be inherited from. In particular, its -// destructor is not virtual. -// -// Note that stringstream behaves differently in gcc and in MSVC. You -// can stream a NULL char pointer to it in the former, but not in the -// latter (it causes an access violation if you do). The Message -// class hides this difference by treating a NULL char pointer as -// "(null)". -class GTEST_API_ Message { - private: - // The type of basic IO manipulators (endl, ends, and flush) for - // narrow streams. - typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); - - public: - // Constructs an empty Message. - // We allocate the stringstream separately because otherwise each use of - // ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's - // stack frame leading to huge stack frames in some cases; gcc does not reuse - // the stack space. - Message() : ss_(new ::std::stringstream) { - // By default, we want there to be enough precision when printing - // a double to a Message. - *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); - } - - // Copy constructor. - Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT - *ss_ << msg.GetString(); - } - - // Constructs a Message from a C-string. - explicit Message(const char* str) : ss_(new ::std::stringstream) { - *ss_ << str; - } - -#if GTEST_OS_SYMBIAN - // Streams a value (either a pointer or not) to this object. - template - inline Message& operator <<(const T& value) { - StreamHelper(typename internal::is_pointer::type(), value); - return *this; - } -#else - // Streams a non-pointer value to this object. - template - inline Message& operator <<(const T& val) { - ::GTestStreamToHelper(ss_.get(), val); - return *this; - } - - // Streams a pointer value to this object. - // - // This function is an overload of the previous one. When you - // stream a pointer to a Message, this definition will be used as it - // is more specialized. (The C++ Standard, section - // [temp.func.order].) If you stream a non-pointer, then the - // previous definition will be used. - // - // The reason for this overload is that streaming a NULL pointer to - // ostream is undefined behavior. Depending on the compiler, you - // may get "0", "(nil)", "(null)", or an access violation. To - // ensure consistent result across compilers, we always treat NULL - // as "(null)". 
- template - inline Message& operator <<(T* const& pointer) { // NOLINT - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - ::GTestStreamToHelper(ss_.get(), pointer); - } - return *this; - } -#endif // GTEST_OS_SYMBIAN - - // Since the basic IO manipulators are overloaded for both narrow - // and wide streams, we have to provide this specialized definition - // of operator <<, even though its body is the same as the - // templatized version above. Without this definition, streaming - // endl or other basic IO manipulators to Message will confuse the - // compiler. - Message& operator <<(BasicNarrowIoManip val) { - *ss_ << val; - return *this; - } - - // Instead of 1/0, we want to see true/false for bool values. - Message& operator <<(bool b) { - return *this << (b ? "true" : "false"); - } - - // These two overloads allow streaming a wide C string to a Message - // using the UTF-8 encoding. - Message& operator <<(const wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); - } - Message& operator <<(wchar_t* wide_c_str) { - return *this << internal::String::ShowWideCString(wide_c_str); - } - -#if GTEST_HAS_STD_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::std::wstring& wstr); -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_GLOBAL_WSTRING - // Converts the given wide string to a narrow string using the UTF-8 - // encoding, and streams the result to this Message object. - Message& operator <<(const ::wstring& wstr); -#endif // GTEST_HAS_GLOBAL_WSTRING - - // Gets the text streamed to this object so far as a String. - // Each '\0' character in the buffer is replaced with "\\0". - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - internal::String GetString() const { - return internal::StringStreamToString(ss_.get()); - } - - private: - -#if GTEST_OS_SYMBIAN - // These are needed as the Nokia Symbian Compiler cannot decide between - // const T& and const T* in a function template. The Nokia compiler _can_ - // decide between class template specializations for T and T*, so a - // tr1::type_traits-like is_pointer works, and we can overload on that. - template - inline void StreamHelper(internal::true_type /*dummy*/, T* pointer) { - if (pointer == NULL) { - *ss_ << "(null)"; - } else { - ::GTestStreamToHelper(ss_.get(), pointer); - } - } - template - inline void StreamHelper(internal::false_type /*dummy*/, const T& value) { - ::GTestStreamToHelper(ss_.get(), value); - } -#endif // GTEST_OS_SYMBIAN - - // We'll hold the text streamed to this object here. - const internal::scoped_ptr< ::std::stringstream> ss_; - - // We declare (but don't implement) this to prevent the compiler - // from implementing the assignment operator. - void operator=(const Message&); -}; - -// Streams a Message to an ostream. -inline std::ostream& operator <<(std::ostream& os, const Message& sb) { - return os << sb.GetString(); -} - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ -// This file was GENERATED by command: -// pump.py gtest-param-test.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: vladl@google.com (Vlad Losev) -// -// Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) -// -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ - - -// Value-parameterized tests allow you to test your code with different -// parameters without writing multiple copies of the same test. -// -// Here is how you use value-parameterized tests: - -#if 0 - -// To write value-parameterized tests, first you should define a fixture -// class. It is usually derived from testing::TestWithParam (see below for -// another inheritance scheme that's sometimes useful in more complicated -// class hierarchies), where the type of your parameter values. -// TestWithParam is itself derived from testing::Test. T can be any -// copyable type. If it's a raw pointer, you are responsible for managing the -// lifespan of the pointed values. - -class FooTest : public ::testing::TestWithParam { - // You can implement all the usual class fixture members here. -}; - -// Then, use the TEST_P macro to define as many parameterized tests -// for this fixture as you want. The _P suffix is for "parameterized" -// or "pattern", whichever you prefer to think. - -TEST_P(FooTest, DoesBlah) { - // Inside a test, access the test parameter with the GetParam() method - // of the TestWithParam class: - EXPECT_TRUE(foo.Blah(GetParam())); - ... -} - -TEST_P(FooTest, HasBlahBlah) { - ... -} - -// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test -// case with any set of parameters you want. Google Test defines a number -// of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. Here is a summary of them, which -// are all in the testing namespace: -// -// -// Range(begin, end [, step]) - Yields values {begin, begin+step, -// begin+step+step, ...}. The values do not -// include end. 
step defaults to 1. -// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. -// ValuesIn(container) - Yields values from a C-style array, an STL -// ValuesIn(begin,end) container, or an iterator range [begin, end). -// Bool() - Yields sequence {false, true}. -// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product -// for the math savvy) of the values generated -// by the N generators. -// -// For more details, see comments at the definitions of these functions below -// in this file. -// -// The following statement will instantiate tests from the FooTest test case -// each with parameter values "meeny", "miny", and "moe". - -INSTANTIATE_TEST_CASE_P(InstantiationName, - FooTest, - Values("meeny", "miny", "moe")); - -// To distinguish different instances of the pattern, (yes, you -// can instantiate it more then once) the first argument to the -// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the -// actual test case name. Remember to pick unique prefixes for different -// instantiations. The tests from the instantiation above will have -// these names: -// -// * InstantiationName/FooTest.DoesBlah/0 for "meeny" -// * InstantiationName/FooTest.DoesBlah/1 for "miny" -// * InstantiationName/FooTest.DoesBlah/2 for "moe" -// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" -// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" -// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" -// -// You can use these names in --gtest_filter. -// -// This statement will instantiate all tests from FooTest again, each -// with parameter values "cat" and "dog": - -const char* pets[] = {"cat", "dog"}; -INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); - -// The tests from the instantiation above will have these names: -// -// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" -// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" -// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" -// -// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests -// in the given test case, whether their definitions come before or -// AFTER the INSTANTIATE_TEST_CASE_P statement. -// -// Please also note that generator expressions (including parameters to the -// generators) are evaluated in InitGoogleTest(), after main() has started. -// This allows the user on one hand, to adjust generator parameters in order -// to dynamically determine a set of tests to run and on the other hand, -// give the user a chance to inspect the generated tests with Google Test -// reflection API before RUN_ALL_TESTS() is executed. -// -// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc -// for more examples. -// -// In the future, we plan to publish the API for defining new parameter -// generators. But for now this interface remains part of the internal -// implementation and is subject to change. -// -// -// A parameterized test fixture must be derived from testing::Test and from -// testing::WithParamInterface, where T is the type of the parameter -// values. Inheriting from TestWithParam satisfies that requirement because -// TestWithParam inherits from both Test and WithParamInterface. In more -// complicated hierarchies, however, it is occasionally useful to inherit -// separately from Test and WithParamInterface. 
For example: - -class BaseTest : public ::testing::Test { - // You can inherit all the usual members for a non-parameterized test - // fixture here. -}; - -class DerivedTest : public BaseTest, public ::testing::WithParamInterface { - // The usual test fixture members go here too. -}; - -TEST_F(BaseTest, HasFoo) { - // This is an ordinary non-parameterized test. -} - -TEST_P(DerivedTest, DoesBlah) { - // GetParam works just the same here as if you inherit from TestWithParam. - EXPECT_TRUE(foo.Blah(GetParam())); -} - -#endif // 0 - - -#if !GTEST_OS_SYMBIAN -# include -#endif - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ - -#include -#include -#include - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. -// Copyright 2003 Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: Dan Egnor (egnor@google.com) -// -// A "smart" pointer type with reference tracking. Every pointer to a -// particular object is kept on a circular linked list. When the last pointer -// to an object is destroyed or reassigned, the object is deleted. -// -// Used properly, this deletes the object when the last reference goes away. -// There are several caveats: -// - Like all reference counting schemes, cycles lead to leaks. -// - Each smart pointer is actually two pointers (8 bytes instead of 4). -// - Every time a pointer is assigned, the entire list of pointers to that -// object is traversed. This class is therefore NOT SUITABLE when there -// will often be more than two or three pointers to a particular object. -// - References are only tracked as long as linked_ptr<> objects are copied. -// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS -// will happen (double deletion). -// -// A good use of this class is storing object references in STL containers. -// You can safely put linked_ptr<> in a vector<>. -// Other uses may not be as good. -// -// Note: If you use an incomplete type with linked_ptr<>, the class -// *containing* linked_ptr<> must have a constructor and destructor (even -// if they do nothing!). -// -// Bill Gibbons suggested we use something like this. -// -// Thread Safety: -// Unlike other linked_ptr implementations, in this implementation -// a linked_ptr object is thread-safe in the sense that: -// - it's safe to copy linked_ptr objects concurrently, -// - it's safe to copy *from* a linked_ptr and read its underlying -// raw pointer (e.g. via get()) concurrently, and -// - it's safe to write to two linked_ptrs that point to the same -// shared object concurrently. -// TODO(wan@google.com): rename this to safe_linked_ptr to avoid -// confusion with normal linked_ptr. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ - -#include -#include - - -namespace testing { -namespace internal { - -// Protects copying of all linked_ptr objects. -GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); - -// This is used internally by all instances of linked_ptr<>. It needs to be -// a non-template class because different types of linked_ptr<> can refer to -// the same object (linked_ptr(obj) vs linked_ptr(obj)). -// So, it needs to be possible for different types of linked_ptr to participate -// in the same circular linked list, so we need a single class type here. -// -// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. -class linked_ptr_internal { - public: - // Create a new circle that includes only this instance. 
- void join_new() { - next_ = this; - } - - // Many linked_ptr operations may change p.link_ for some linked_ptr - // variable p in the same circle as this object. Therefore we need - // to prevent two such operations from occurring concurrently. - // - // Note that different types of linked_ptr objects can coexist in a - // circle (e.g. linked_ptr, linked_ptr, and - // linked_ptr). Therefore we must use a single mutex to - // protect all linked_ptr objects. This can create serious - // contention in production code, but is acceptable in a testing - // framework. - - // Join an existing circle. - // L < g_linked_ptr_mutex - void join(linked_ptr_internal const* ptr) { - MutexLock lock(&g_linked_ptr_mutex); - - linked_ptr_internal const* p = ptr; - while (p->next_ != ptr) p = p->next_; - p->next_ = this; - next_ = ptr; - } - - // Leave whatever circle we're part of. Returns true if we were the - // last member of the circle. Once this is done, you can join() another. - // L < g_linked_ptr_mutex - bool depart() { - MutexLock lock(&g_linked_ptr_mutex); - - if (next_ == this) return true; - linked_ptr_internal const* p = next_; - while (p->next_ != this) p = p->next_; - p->next_ = next_; - return false; - } - - private: - mutable linked_ptr_internal const* next_; -}; - -template -class linked_ptr { - public: - typedef T element_type; - - // Take over ownership of a raw pointer. This should happen as soon as - // possible after the object is created. - explicit linked_ptr(T* ptr = NULL) { capture(ptr); } - ~linked_ptr() { depart(); } - - // Copy an existing linked_ptr<>, adding ourselves to the list of references. - template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } - linked_ptr(linked_ptr const& ptr) { // NOLINT - assert(&ptr != this); - copy(&ptr); - } - - // Assignment releases the old value and acquires the new. - template linked_ptr& operator=(linked_ptr const& ptr) { - depart(); - copy(&ptr); - return *this; - } - - linked_ptr& operator=(linked_ptr const& ptr) { - if (&ptr != this) { - depart(); - copy(&ptr); - } - return *this; - } - - // Smart pointer members. - void reset(T* ptr = NULL) { - depart(); - capture(ptr); - } - T* get() const { return value_; } - T* operator->() const { return value_; } - T& operator*() const { return *value_; } - - bool operator==(T* p) const { return value_ == p; } - bool operator!=(T* p) const { return value_ != p; } - template - bool operator==(linked_ptr const& ptr) const { - return value_ == ptr.get(); - } - template - bool operator!=(linked_ptr const& ptr) const { - return value_ != ptr.get(); - } - - private: - template - friend class linked_ptr; - - T* value_; - linked_ptr_internal link_; - - void depart() { - if (link_.depart()) delete value_; - } - - void capture(T* ptr) { - value_ = ptr; - link_.join_new(); - } - - template void copy(linked_ptr const* ptr) { - value_ = ptr->get(); - if (value_) - link_.join(&ptr->link_); - else - link_.join_new(); - } -}; - -template inline -bool operator==(T* ptr, const linked_ptr& x) { - return ptr == x.get(); -} - -template inline -bool operator!=(T* ptr, const linked_ptr& x) { - return ptr != x.get(); -} - -// A function to convert T* into linked_ptr -// Doing e.g. make_linked_ptr(new FooBarBaz(arg)) is a shorter notation -// for linked_ptr >(new FooBarBaz(arg)) -template -linked_ptr make_linked_ptr(T* ptr) { - return linked_ptr(ptr); -} - -} // namespace internal -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ -// Copyright 2007, Google Inc. 
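// A brief usage sketch of the linked_ptr documented above; Foo is a
// placeholder type used only for illustration:
//
//   using ::testing::internal::linked_ptr;
//   using ::testing::internal::make_linked_ptr;
//
//   linked_ptr<Foo> a(new Foo);             // a owns the new Foo
//   linked_ptr<Foo> b = a;                  // a and b now share one circle
//   std::vector<linked_ptr<Foo> > v;
//   v.push_back(make_linked_ptr(new Foo));  // safe to store in STL containers
//
//   // Caveat restated from above: never round-trip through a raw pointer.
//   //   Foo* raw = a.get();
//   //   linked_ptr<Foo> c(raw);            // WRONG: second circle -> double delete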
-// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -// Google Test - The Google C++ Testing Framework -// -// This file implements a universal value printer that can print a -// value of any type T: -// -// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); -// -// A user can teach this function how to print a class type T by -// defining either operator<<() or PrintTo() in the namespace that -// defines T. More specifically, the FIRST defined function in the -// following list will be used (assuming T is defined in namespace -// foo): -// -// 1. foo::PrintTo(const T&, ostream*) -// 2. operator<<(ostream&, const T&) defined in either foo or the -// global namespace. -// -// If none of the above is defined, it will print the debug string of -// the value if it is a protocol buffer, or print the raw bytes in the -// value otherwise. -// -// To aid debugging: when T is a reference type, the address of the -// value is also printed; when T is a (const) char pointer, both the -// pointer value and the NUL-terminated string it points to are -// printed. -// -// We also provide some convenient wrappers: -// -// // Prints a value to a string. For a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// std::string ::testing::PrintToString(const T& value); -// -// // Prints a value tersely: for a reference type, the referenced -// // value (but not the address) is printed; for a (const or not) char -// // pointer, the NUL-terminated string (but not the pointer) is -// // printed. -// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); -// -// // Prints value using the type inferred by the compiler. The difference -// // from UniversalTersePrint() is that this function prints both the -// // pointer and the NUL-terminated string for a (const or not) char pointer. 
-// void ::testing::internal::UniversalPrint(const T& value, ostream*); -// -// // Prints the fields of a tuple tersely to a string vector, one -// // element for each field. Tuple support must be enabled in -// // gtest-port.h. -// std::vector UniversalTersePrintTupleFieldsToStrings( -// const Tuple& value); -// -// Known limitation: -// -// The print primitives print the elements of an STL-style container -// using the compiler-inferred type of *iter where iter is a -// const_iterator of the container. When const_iterator is an input -// iterator but not a forward iterator, this inferred type may not -// match value_type, and the print output may be incorrect. In -// practice, this is rarely a problem as for most containers -// const_iterator is a forward iterator. We'll fix this if there's an -// actual need for it. Note that this fix cannot rely on value_type -// being defined as many user-defined container types don't have -// value_type. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#include // NOLINT -#include -#include -#include -#include - -namespace testing { - -// Definitions in the 'internal' and 'internal2' name spaces are -// subject to change without notice. DO NOT USE THEM IN USER CODE! -namespace internal2 { - -// Prints the given number of bytes in the given object to the given -// ostream. -GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, - size_t count, - ::std::ostream* os); - -// For selecting which printer to use when a given type has neither << -// nor PrintTo(). -enum TypeKind { - kProtobuf, // a protobuf type - kConvertibleToInteger, // a type implicitly convertible to BiggestInt - // (e.g. a named or unnamed enum type) - kOtherType // anything else -}; - -// TypeWithoutFormatter::PrintValue(value, os) is called -// by the universal printer to print a value of type T when neither -// operator<< nor PrintTo() is defined for T, where kTypeKind is the -// "kind" of T as defined by enum TypeKind. -template -class TypeWithoutFormatter { - public: - // This default version is called when kTypeKind is kOtherType. - static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), - sizeof(value), os); - } -}; - -// We print a protobuf using its ShortDebugString() when the string -// doesn't exceed this many characters; otherwise we print it using -// DebugString() for better readability. -const size_t kProtobufOneLinerMaxLength = 50; - -template -class TypeWithoutFormatter { - public: - static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? - short_str : ("\n" + value.DebugString()); - *os << ("<" + pretty_str + ">"); - } -}; - -template -class TypeWithoutFormatter { - public: - // Since T has no << operator or PrintTo() but can be implicitly - // converted to BiggestInt, we print it as a BiggestInt. - // - // Most likely T is an enum type (either named or unnamed), in which - // case printing it as an integer is the desired behavior. In case - // T is not an enum, printing it as an integer is the best we can do - // given that it has no user-defined printer. - static void PrintValue(const T& value, ::std::ostream* os) { - const internal::BiggestInt kBigInt = value; - *os << kBigInt; - } -}; - -// Prints the given value to the given ostream. 
If the value is a -// protocol message, its debug string is printed; if it's an enum or -// of a type implicitly convertible to BiggestInt, it's printed as an -// integer; otherwise the bytes in the value are printed. This is -// what UniversalPrinter::Print() does when it knows nothing about -// type T and T has neither << operator nor PrintTo(). -// -// A user can override this behavior for a class type Foo by defining -// a << operator in the namespace where Foo is defined. -// -// We put this operator in namespace 'internal2' instead of 'internal' -// to simplify the implementation, as much code in 'internal' needs to -// use << in STL, which would conflict with our own << were it defined -// in 'internal'. -// -// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If -// we define it to take an std::ostream instead, we'll get an -// "ambiguous overloads" compiler error when trying to print a type -// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether -// operator<<(std::ostream&, const T&) or -// operator<<(std::basic_stream, const Foo&) is more -// specific. -template -::std::basic_ostream& operator<<( - ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); - return os; -} - -} // namespace internal2 -} // namespace testing - -// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up -// magic needed for implementing UniversalPrinter won't work. -namespace testing_internal { - -// Used to print a value that is not an STL-style container when the -// user doesn't define PrintTo() for it. -template -void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { - // With the following statement, during unqualified name lookup, - // testing::internal2::operator<< appears as if it was declared in - // the nearest enclosing namespace that contains both - // ::testing_internal and ::testing::internal2, i.e. the global - // namespace. For more details, refer to the C++ Standard section - // 7.3.4-1 [namespace.udir]. This allows us to fall back onto - // testing::internal2::operator<< in case T doesn't come with a << - // operator. - // - // We cannot write 'using ::testing::internal2::operator<<;', which - // gcc 3.3 fails to compile due to a compiler bug. - using namespace ::testing::internal2; // NOLINT - - // Assuming T is defined in namespace foo, in the next statement, - // the compiler will consider all of: - // - // 1. foo::operator<< (thanks to Koenig look-up), - // 2. ::operator<< (as the current namespace is enclosed in ::), - // 3. testing::internal2::operator<< (thanks to the using statement above). - // - // The operator<< whose type matches T best will be picked. - // - // We deliberately allow #2 to be a candidate, as sometimes it's - // impossible to define #1 (e.g. when foo is ::std, defining - // anything in it is undefined behavior unless you are a compiler - // vendor.). - *os << value; -} - -} // namespace testing_internal - -namespace testing { -namespace internal { - -// UniversalPrinter::Print(value, ostream_ptr) prints the given -// value to the given ostream. The caller must ensure that -// 'ostream_ptr' is not NULL, or the behavior is undefined. 
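// Illustrative sketch, not part of the original header: the ADL fallback in
// DefaultPrintNonContainerTo() above means that defining operator<< in the
// type's own namespace overrides the raw-byte dump.  Assumes the fused gtest
// header is available as "gtest/gtest.h"; namespace bar and struct Point are
// hypothetical user code.

#include <ostream>
#include <string>
#include "gtest/gtest.h"

namespace bar {
struct Point { int x; int y; };
// Non-template operator<<, found by argument-dependent lookup from the
// unqualified "*os << value" statement above.
inline std::ostream& operator<<(std::ostream& os, const Point& p) {
  return os << "(" << p.x << ", " << p.y << ")";
}
}  // namespace bar

std::string ShowPoint() {
  bar::Point p = {3, 4};
  return testing::PrintToString(p);  // "(3, 4)" instead of a byte dump
}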
-// -// We define UniversalPrinter as a class template (as opposed to a -// function template), as we need to partially specialize it for -// reference types, which cannot be done with function templates. -template -class UniversalPrinter; - -template -void UniversalPrint(const T& value, ::std::ostream* os); - -// Used to print an STL-style container when the user doesn't define -// a PrintTo() for it. -template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, - const C& container, ::std::ostream* os) { - const size_t kMaxCount = 32; // The maximum number of elements to print. - *os << '{'; - size_t count = 0; - for (typename C::const_iterator it = container.begin(); - it != container.end(); ++it, ++count) { - if (count > 0) { - *os << ','; - if (count == kMaxCount) { // Enough has been printed. - *os << " ..."; - break; - } - } - *os << ' '; - // We cannot call PrintTo(*it, os) here as PrintTo() doesn't - // handle *it being a native array. - internal::UniversalPrint(*it, os); - } - - if (count > 0) { - *os << ' '; - } - *os << '}'; -} - -// Used to print a pointer that is neither a char pointer nor a member -// pointer, when the user doesn't define PrintTo() for it. (A member -// variable pointer or member function pointer doesn't really point to -// a location in the address space. Their representation is -// implementation-defined. Therefore they will be printed as raw -// bytes.) -template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, - T* p, ::std::ostream* os) { - if (p == NULL) { - *os << "NULL"; - } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } - } -} - -// Used to print a non-container, non-pointer value when the user -// doesn't define PrintTo() for it. -template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, - const T& value, ::std::ostream* os) { - ::testing_internal::DefaultPrintNonContainerTo(value, os); -} - -// Prints the given value using the << operator if it has one; -// otherwise prints the bytes in it. This is what -// UniversalPrinter::Print() does when PrintTo() is not specialized -// or overloaded for type T. -// -// A user can override this behavior for a class type Foo by defining -// an overload of PrintTo() in the namespace where Foo is defined. We -// give the user this option as sometimes defining a << operator for -// Foo is not desirable (e.g. the coding style may prevent doing it, -// or there is already a << operator but it doesn't do what the user -// wants). -template -void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. 
If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. - // - // Note that we check for container types here, prior to we check - // for protocol message types in our operator<<. The rationale is: - // - // For protocol messages, we want to give people a chance to - // override Google Mock's format by defining a PrintTo() or - // operator<<. For STL containers, other formats can be - // incompatible with Google Mock's format for the container - // elements; therefore we check for container types here to ensure - // that our format is used. - // - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); -} - -// The following list of PrintTo() overloads tells -// UniversalPrinter::Print() how to print standard types (built-in -// types, strings, plain arrays, and pointers). - -// Overloads for various char types. -GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); -GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); -inline void PrintTo(char c, ::std::ostream* os) { - // When printing a plain char, we always treat it as unsigned. This - // way, the output won't be affected by whether the compiler thinks - // char is signed or not. - PrintTo(static_cast(c), os); -} - -// Overloads for other simple built-in types. -inline void PrintTo(bool x, ::std::ostream* os) { - *os << (x ? "true" : "false"); -} - -// Overload for wchar_t type. -// Prints a wchar_t as a symbol if it is printable or as its internal -// code otherwise and also as its decimal code (except for L'\0'). -// The L'\0' char is printed as "L'\\0'". The decimal code is printed -// as signed integer when wchar_t is implemented by the compiler -// as a signed type and is printed as an unsigned integer when wchar_t -// is implemented as an unsigned type. -GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); - -// Overloads for C strings. -GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); -inline void PrintTo(char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// signed/unsigned char is often used for representing binary data, so -// we print pointers to it as void* to be safe. -inline void PrintTo(const signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(signed char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(const unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -inline void PrintTo(unsigned char* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} - -// MSVC can be configured to define wchar_t as a typedef of unsigned -// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native -// type. When wchar_t is a typedef, defining an overload for const -// wchar_t* would cause unsigned short* be printed as a wide string, -// possibly causing invalid memory accesses. -#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) -// Overloads for wide C strings -GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); -inline void PrintTo(wchar_t* s, ::std::ostream* os) { - PrintTo(ImplicitCast_(s), os); -} -#endif - -// Overload for C arrays. Multi-dimensional arrays are printed -// properly. 
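// Illustrative sketch, not part of the original header: how the
// DefaultPrintTo() dispatch above plays out for a few argument kinds.
// Assumes the fused gtest header is available as "gtest/gtest.h".

#include <vector>
#include "gtest/gtest.h"

void DispatchDemo() {
  std::vector<int> v(3, 7);
  testing::PrintToString(v);      // container branch: "{ 7, 7, 7 }"

  int n = 0;
  testing::PrintToString(&n);     // pointer branch: prints the address

  const unsigned char* bytes = reinterpret_cast<const unsigned char*>(&n);
  testing::PrintToString(bytes);  // unsigned char* is printed as a void*, not as text
}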
- -// Prints the given number of elements in an array, without printing -// the curly braces. -template -void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { - UniversalPrint(a[0], os); - for (size_t i = 1; i != count; i++) { - *os << ", "; - UniversalPrint(a[i], os); - } -} - -// Overloads for ::string and ::std::string. -#if GTEST_HAS_GLOBAL_STRING -GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); -inline void PrintTo(const ::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_STRING - -GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); -inline void PrintTo(const ::std::string& s, ::std::ostream* os) { - PrintStringTo(s, os); -} - -// Overloads for ::wstring and ::std::wstring. -#if GTEST_HAS_GLOBAL_WSTRING -GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_GLOBAL_WSTRING - -#if GTEST_HAS_STD_WSTRING -GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); -inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { - PrintWideStringTo(s, os); -} -#endif // GTEST_HAS_STD_WSTRING - -#if GTEST_HAS_TR1_TUPLE -// Overload for ::std::tr1::tuple. Needed for printing function arguments, -// which are packed as tuples. - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os); - -// Overloaded PrintTo() for tuples of various arities. We support -// tuples of up-to 10 fields. The following implementation works -// regardless of whether tr1::tuple is implemented using the -// non-standard variadic template feature or not. - -inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo(const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} - -template -void PrintTo( - const ::std::tr1::tuple& t, - ::std::ostream* os) { - PrintTupleTo(t, os); -} -#endif // GTEST_HAS_TR1_TUPLE - -// Overload for std::pair. -template -void PrintTo(const ::std::pair& value, ::std::ostream* os) { - *os << '('; - // We cannot use UniversalPrint(value.first, os) here, as T1 may be - // a reference type. The same for printing value.second. - UniversalPrinter::Print(value.first, os); - *os << ", "; - UniversalPrinter::Print(value.second, os); - *os << ')'; -} - -// Implements printing a non-reference type T by letting the compiler -// pick the right overload of PrintTo() for T. 
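// Illustrative sketch, not part of the original header: teaching the printer
// about a user type by overloading PrintTo() in that type's namespace, which
// takes precedence over operator<<.  Assumes the fused gtest header is
// available as "gtest/gtest.h"; namespace foo and Temperature are hypothetical.

#include <ostream>
#include <string>
#include "gtest/gtest.h"

namespace foo {
struct Temperature { double celsius; };
// Found via argument-dependent lookup when UniversalPrinter<T>::Print (below)
// makes its unqualified call to PrintTo(value, os).
inline void PrintTo(const Temperature& t, std::ostream* os) {
  *os << t.celsius << " C";
}
}  // namespace foo

std::string ShowTemperature() {
  foo::Temperature t = { 21.5 };
  return testing::PrintToString(t);  // "21.5 C"
}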
-template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - // Note: we deliberately don't call this PrintTo(), as that name - // conflicts with ::testing::internal::PrintTo in the body of the - // function. - static void Print(const T& value, ::std::ostream* os) { - // By default, ::testing::internal::PrintTo() is used for printing - // the value. - // - // Thanks to Koenig look-up, if T is a class and has its own - // PrintTo() function defined in its namespace, that function will - // be visible here. Since it is more specific than the generic ones - // in ::testing::internal, it will be picked by the compiler in the - // following statement - exactly what we want. - PrintTo(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// UniversalPrintArray(begin, len, os) prints an array of 'len' -// elements, starting at address 'begin'. -template -void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { - if (len == 0) { - *os << "{}"; - } else { - *os << "{ "; - const size_t kThreshold = 18; - const size_t kChunkSize = 8; - // If the array has more than kThreshold elements, we'll have to - // omit some details by printing only the first and the last - // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. - if (len <= kThreshold) { - PrintRawArrayTo(begin, len, os); - } else { - PrintRawArrayTo(begin, kChunkSize, os); - *os << ", ..., "; - PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); - } - *os << " }"; - } -} -// This overload prints a (const) char array compactly. -GTEST_API_ void UniversalPrintArray(const char* begin, - size_t len, - ::std::ostream* os); - -// Implements printing an array type T[N]. -template -class UniversalPrinter { - public: - // Prints the given array, omitting some elements when there are too - // many. - static void Print(const T (&a)[N], ::std::ostream* os) { - UniversalPrintArray(a, N, os); - } -}; - -// Implements printing a reference type T&. -template -class UniversalPrinter { - public: - // MSVC warns about adding const to a function type, so we want to - // disable the warning. -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4180) // Temporarily disables warning 4180. -#endif // _MSC_VER - - static void Print(const T& value, ::std::ostream* os) { - // Prints the address of the value. We use reinterpret_cast here - // as static_cast doesn't compile when T is a function type. - *os << "@" << reinterpret_cast(&value) << " "; - - // Then prints the value itself. - UniversalPrint(value, os); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif // _MSC_VER -}; - -// Prints a value tersely: for a reference type, the referenced value -// (but not the address) is printed; for a (const) char pointer, the -// NUL-terminated string (but not the pointer) is printed. 
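// Illustrative sketch, not part of the original header: the terse rule stated
// above for char pointers, plus the first-8/last-8 truncation applied by
// UniversalPrintArray() to long arrays.  Assumes the fused gtest header is
// available as "gtest/gtest.h".

#include "gtest/gtest.h"

void TerseDemo() {
  const char* name = "miniFE";
  testing::PrintToString(name);   // the string ("\"miniFE\""), not the pointer value

  int big[20] = {0};              // longer than the 18-element threshold
  testing::PrintToString(big);
  // -> "{ 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0 }"
}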
-template -void UniversalTersePrint(const T& value, ::std::ostream* os) { - UniversalPrint(value, os); -} -inline void UniversalTersePrint(const char* str, ::std::ostream* os) { - if (str == NULL) { - *os << "NULL"; - } else { - UniversalPrint(string(str), os); - } -} -inline void UniversalTersePrint(char* str, ::std::ostream* os) { - UniversalTersePrint(static_cast(str), os); -} - -// Prints a value using the type inferred by the compiler. The -// difference between this and UniversalTersePrint() is that for a -// (const) char pointer, this prints both the pointer and the -// NUL-terminated string. -template -void UniversalPrint(const T& value, ::std::ostream* os) { - UniversalPrinter::Print(value, os); -} - -#if GTEST_HAS_TR1_TUPLE -typedef ::std::vector Strings; - -// This helper template allows PrintTo() for tuples and -// UniversalTersePrintTupleFieldsToStrings() to be defined by -// induction on the number of tuple fields. The idea is that -// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N -// fields in tuple t, and can be defined in terms of -// TuplePrefixPrinter. - -// The inductive case. -template -struct TuplePrefixPrinter { - // Prints the first N fields of a tuple. - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - TuplePrefixPrinter::PrintPrefixTo(t, os); - *os << ", "; - UniversalPrinter::type> - ::Print(::std::tr1::get(t), os); - } - - // Tersely prints the first N fields of a tuple to a string vector, - // one element for each field. - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Base cases. -template <> -struct TuplePrefixPrinter<0> { - template - static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} - - template - static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} -}; -// We have to specialize the entire TuplePrefixPrinter<> class -// template here, even though the definition of -// TersePrintPrefixToStrings() is the same as the generic version, as -// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't -// support specializing a method template of a class template. -template <> -struct TuplePrefixPrinter<1> { - template - static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { - UniversalPrinter::type>:: - Print(::std::tr1::get<0>(t), os); - } - - template - static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { - ::std::stringstream ss; - UniversalTersePrint(::std::tr1::get<0>(t), &ss); - strings->push_back(ss.str()); - } -}; - -// Helper function for printing a tuple. T must be instantiated with -// a tuple type. -template -void PrintTupleTo(const T& t, ::std::ostream* os) { - *os << "("; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - PrintPrefixTo(t, os); - *os << ")"; -} - -// Prints the fields of a tuple tersely to a string vector, one -// element for each field. See the comment before -// UniversalTersePrint() for how we define "tersely". 
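// Illustrative sketch, not part of the original header: the same
// print-the-first-N-fields induction as TuplePrefixPrinter above, condensed
// into hypothetical C++11 code with std::tuple so the shape of the recursion
// is easier to see (the original folds the comma handling into separate
// specializations for 0 and 1 instead of an if).

#include <cstddef>
#include <iostream>
#include <tuple>

template <std::size_t N>
struct PrefixPrinter {                      // prints the first N fields
  template <typename Tuple>
  static void Print(const Tuple& t, std::ostream* os) {
    PrefixPrinter<N - 1>::Print(t, os);     // first N-1 fields...
    if (N > 1) *os << ", ";
    *os << std::get<N - 1>(t);              // ...then field N-1
  }
};

template <>
struct PrefixPrinter<0> {                   // base case: nothing to print
  template <typename Tuple>
  static void Print(const Tuple&, std::ostream*) {}
};

int main() {
  std::tuple<int, char, double> t(1, 'x', 2.5);
  std::cout << '(';
  PrefixPrinter<3>::Print(t, &std::cout);
  std::cout << ")\n";                       // prints "(1, x, 2.5)"
}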
-template -Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { - Strings result; - TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: - TersePrintPrefixToStrings(value, &result); - return result; -} -#endif // GTEST_HAS_TR1_TUPLE - -} // namespace internal - -template -::std::string PrintToString(const T& value) { - ::std::stringstream ss; - internal::UniversalTersePrint(value, &ss); - return ss.str(); -} - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { -namespace internal { - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Outputs a message explaining invalid registration of different -// fixture class for the same test case. This may happen when -// TEST_P macro is used to define two tests with the same name -// but in different namespaces. -GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, - const char* file, int line); - -template class ParamGeneratorInterface; -template class ParamGenerator; - -// Interface for iterating over elements provided by an implementation -// of ParamGeneratorInterface. -template -class ParamIteratorInterface { - public: - virtual ~ParamIteratorInterface() {} - // A pointer to the base generator instance. - // Used only for the purposes of iterator comparison - // to make sure that two iterators belong to the same generator. - virtual const ParamGeneratorInterface* BaseGenerator() const = 0; - // Advances iterator to point to the next element - // provided by the generator. The caller is responsible - // for not calling Advance() on an iterator equal to - // BaseGenerator()->End(). - virtual void Advance() = 0; - // Clones the iterator object. Used for implementing copy semantics - // of ParamIterator. - virtual ParamIteratorInterface* Clone() const = 0; - // Dereferences the current iterator and provides (read-only) access - // to the pointed value. It is the caller's responsibility not to call - // Current() on an iterator equal to BaseGenerator()->End(). - // Used for implementing ParamGenerator::operator*(). - virtual const T* Current() const = 0; - // Determines whether the given iterator and other point to the same - // element in the sequence generated by the generator. - // Used for implementing ParamGenerator::operator==(). - virtual bool Equals(const ParamIteratorInterface& other) const = 0; -}; - -// Class iterating over elements provided by an implementation of -// ParamGeneratorInterface. It wraps ParamIteratorInterface -// and implements the const forward iterator concept. -template -class ParamIterator { - public: - typedef T value_type; - typedef const T& reference; - typedef ptrdiff_t difference_type; - - // ParamIterator assumes ownership of the impl_ pointer. - ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} - ParamIterator& operator=(const ParamIterator& other) { - if (this != &other) - impl_.reset(other.impl_->Clone()); - return *this; - } - - const T& operator*() const { return *impl_->Current(); } - const T* operator->() const { return impl_->Current(); } - // Prefix version of operator++. - ParamIterator& operator++() { - impl_->Advance(); - return *this; - } - // Postfix version of operator++. 
- ParamIterator operator++(int /*unused*/) { - ParamIteratorInterface* clone = impl_->Clone(); - impl_->Advance(); - return ParamIterator(clone); - } - bool operator==(const ParamIterator& other) const { - return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); - } - bool operator!=(const ParamIterator& other) const { - return !(*this == other); - } - - private: - friend class ParamGenerator; - explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} - scoped_ptr > impl_; -}; - -// ParamGeneratorInterface is the binary interface to access generators -// defined in other translation units. -template -class ParamGeneratorInterface { - public: - typedef T ParamType; - - virtual ~ParamGeneratorInterface() {} - - // Generator interface definition - virtual ParamIteratorInterface* Begin() const = 0; - virtual ParamIteratorInterface* End() const = 0; -}; - -// Wraps ParamGeneratorInterface and provides general generator syntax -// compatible with the STL Container concept. -// This class implements copy initialization semantics and the contained -// ParamGeneratorInterface instance is shared among all copies -// of the original object. This is possible because that instance is immutable. -template -class ParamGenerator { - public: - typedef ParamIterator iterator; - - explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} - ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} - - ParamGenerator& operator=(const ParamGenerator& other) { - impl_ = other.impl_; - return *this; - } - - iterator begin() const { return iterator(impl_->Begin()); } - iterator end() const { return iterator(impl_->End()); } - - private: - linked_ptr > impl_; -}; - -// Generates values from a range of two comparable values. Can be used to -// generate sequences of user-defined types that implement operator+() and -// operator<(). -// This class is used in the Range() function. -template -class RangeGenerator : public ParamGeneratorInterface { - public: - RangeGenerator(T begin, T end, IncrementT step) - : begin_(begin), end_(end), - step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} - virtual ~RangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, begin_, 0, step_); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, end_, end_index_, step_); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, T value, int index, - IncrementT step) - : base_(base), value_(value), index_(index), step_(step) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - value_ = value_ + step_; - index_++; - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const T* Current() const { return &value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." 
<< std::endl; - const int other_index = - CheckedDowncastToActualType(&other)->index_; - return index_ == other_index; - } - - private: - Iterator(const Iterator& other) - : ParamIteratorInterface(), - base_(other.base_), value_(other.value_), index_(other.index_), - step_(other.step_) {} - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - T value_; - int index_; - const IncrementT step_; - }; // class RangeGenerator::Iterator - - static int CalculateEndIndex(const T& begin, - const T& end, - const IncrementT& step) { - int end_index = 0; - for (T i = begin; i < end; i = i + step) - end_index++; - return end_index; - } - - // No implementation - assignment is unsupported. - void operator=(const RangeGenerator& other); - - const T begin_; - const T end_; - const IncrementT step_; - // The index for the end() iterator. All the elements in the generated - // sequence are indexed (0-based) to aid iterator comparison. - const int end_index_; -}; // class RangeGenerator - - -// Generates values from a pair of STL-style iterators. Used in the -// ValuesIn() function. The elements are copied from the source range -// since the source can be located on the stack, and the generator -// is likely to persist beyond that stack frame. -template -class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { - public: - template - ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) - : container_(begin, end) {} - virtual ~ValuesInIteratorRangeGenerator() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, container_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, container_.end()); - } - - private: - typedef typename ::std::vector ContainerType; - - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - typename ContainerType::const_iterator iterator) - : base_(base), iterator_(iterator) {} - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - virtual void Advance() { - ++iterator_; - value_.reset(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - // We need to use cached value referenced by iterator_ because *iterator_ - // can return a temporary object (and of type other then T), so just - // having "return &*iterator_;" doesn't work. - // value_ is updated here and not in Advance() because Advance() - // can advance iterator_ beyond the end of the range, and we cannot - // detect that fact. The client code, on the other hand, is - // responsible for not calling Current() on an out-of-range iterator. - virtual const T* Current() const { - if (value_.get() == NULL) - value_.reset(new T(*iterator_)); - return value_.get(); - } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - return iterator_ == - CheckedDowncastToActualType(&other)->iterator_; - } - - private: - Iterator(const Iterator& other) - // The explicit constructor call suppresses a false warning - // emitted by gcc when supplied with the -Wextra option. 
- : ParamIteratorInterface(), - base_(other.base_), - iterator_(other.iterator_) {} - - const ParamGeneratorInterface* const base_; - typename ContainerType::const_iterator iterator_; - // A cached value of *iterator_. We keep it here to allow access by - // pointer in the wrapping iterator's operator->(). - // value_ needs to be mutable to be accessed in Current(). - // Use of scoped_ptr helps manage cached value's lifetime, - // which is bound by the lifespan of the iterator itself. - mutable scoped_ptr value_; - }; // class ValuesInIteratorRangeGenerator::Iterator - - // No implementation - assignment is unsupported. - void operator=(const ValuesInIteratorRangeGenerator& other); - - const ContainerType container_; -}; // class ValuesInIteratorRangeGenerator - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Stores a parameter value and later creates tests parameterized with that -// value. -template -class ParameterizedTestFactory : public TestFactoryBase { - public: - typedef typename TestClass::ParamType ParamType; - explicit ParameterizedTestFactory(ParamType parameter) : - parameter_(parameter) {} - virtual Test* CreateTest() { - TestClass::SetParam(¶meter_); - return new TestClass(); - } - - private: - const ParamType parameter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactoryBase is a base class for meta-factories that create -// test factories for passing into MakeAndRegisterTestInfo function. -template -class TestMetaFactoryBase { - public: - virtual ~TestMetaFactoryBase() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// TestMetaFactory creates test factories for passing into -// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives -// ownership of test factory pointer, same factory object cannot be passed -// into that method twice. But ParameterizedTestCaseInfo is going to call -// it for each Test/Parameter value combination. Thus it needs meta factory -// creator class. -template -class TestMetaFactory - : public TestMetaFactoryBase { - public: - typedef typename TestCase::ParamType ParamType; - - TestMetaFactory() {} - - virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { - return new ParameterizedTestFactory(parameter); - } - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfoBase is a generic interface -// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase -// accumulates test information provided by TEST_P macro invocations -// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations -// and uses that information to register all resulting test instances -// in RegisterTests method. The ParameterizeTestCaseRegistry class holds -// a collection of pointers to the ParameterizedTestCaseInfo objects -// and calls RegisterTests() on each of them when asked. -class ParameterizedTestCaseInfoBase { - public: - virtual ~ParameterizedTestCaseInfoBase() {} - - // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const = 0; - // UnitTest class invokes this method to register tests in this - // test case right before running them in RUN_ALL_TESTS macro. 
- // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - virtual void RegisterTests() = 0; - - protected: - ParameterizedTestCaseInfoBase() {} - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); -}; - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P -// macro invocations for a particular test case and generators -// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that -// test case. It registers tests with all values generated by all -// generators when asked. -template -class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { - public: - // ParamType and GeneratorCreationFunc are private types but are required - // for declarations of public methods AddTestPattern() and - // AddTestCaseInstantiation(). - typedef typename TestCase::ParamType ParamType; - // A function that returns an instance of appropriate generator type. - typedef ParamGenerator(GeneratorCreationFunc)(); - - explicit ParameterizedTestCaseInfo(const char* name) - : test_case_name_(name) {} - - // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } - // Test case id to verify identity. - virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } - // TEST_P macro uses AddTestPattern() to record information - // about a single test in a LocalTestInfo structure. - // test_case_name is the base name of the test case (without invocation - // prefix). test_base_name is the name of an individual test without - // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is - // test case base name and DoBar is test base name. - void AddTestPattern(const char* test_case_name, - const char* test_base_name, - TestMetaFactoryBase* meta_factory) { - tests_.push_back(linked_ptr(new TestInfo(test_case_name, - test_base_name, - meta_factory))); - } - // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information - // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, - GeneratorCreationFunc* func, - const char* /* file */, - int /* line */) { - instantiations_.push_back(::std::make_pair(instantiation_name, func)); - return 0; // Return value used only to run this method in namespace scope. - } - // UnitTest class invokes this method to register tests in this test case - // test cases right before running tests in RUN_ALL_TESTS macro. - // This method should not be called more then once on any single - // instance of a ParameterizedTestCaseInfoBase derived class. - // UnitTest has a guard to prevent from calling this method more then once. 
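// Illustrative sketch, not part of the original header: the user-level code
// that drives AddTestPattern() and AddTestCaseInstantiation() above, and the
// full test names produced when RegisterTests() below runs.  Assumes the
// fused gtest header is available as "gtest/gtest.h"; FooTest is hypothetical.

#include "gtest/gtest.h"

class FooTest : public ::testing::TestWithParam<int> {};

TEST_P(FooTest, DoBar) {                     // records pattern ("FooTest", "DoBar")
  EXPECT_GE(GetParam(), 0);
}

// Records the generator under the instantiation name "SequenceA"; at
// registration time this expands to SequenceA/FooTest.DoBar/0,
// SequenceA/FooTest.DoBar/1 and SequenceA/FooTest.DoBar/2.
INSTANTIATE_TEST_CASE_P(SequenceA, FooTest, ::testing::Values(1, 2, 3));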
- virtual void RegisterTests() { - for (typename TestInfoContainer::iterator test_it = tests_.begin(); - test_it != tests_.end(); ++test_it) { - linked_ptr test_info = *test_it; - for (typename InstantiationContainer::iterator gen_it = - instantiations_.begin(); gen_it != instantiations_.end(); - ++gen_it) { - const string& instantiation_name = gen_it->first; - ParamGenerator generator((*gen_it->second)()); - - Message test_case_name_stream; - if ( !instantiation_name.empty() ) - test_case_name_stream << instantiation_name << "/"; - test_case_name_stream << test_info->test_case_base_name; - - int i = 0; - for (typename ParamGenerator::iterator param_it = - generator.begin(); - param_it != generator.end(); ++param_it, ++i) { - Message test_name_stream; - test_name_stream << test_info->test_base_name << "/" << i; - MakeAndRegisterTestInfo( - test_case_name_stream.GetString().c_str(), - test_name_stream.GetString().c_str(), - NULL, // No type parameter. - PrintToString(*param_it).c_str(), - GetTestCaseTypeId(), - TestCase::SetUpTestCase, - TestCase::TearDownTestCase, - test_info->test_meta_factory->CreateTestFactory(*param_it)); - } // for param_it - } // for gen_it - } // for test_it - } // RegisterTests - - private: - // LocalTestInfo structure keeps information about a single test registered - // with TEST_P macro. - struct TestInfo { - TestInfo(const char* a_test_case_base_name, - const char* a_test_base_name, - TestMetaFactoryBase* a_test_meta_factory) : - test_case_base_name(a_test_case_base_name), - test_base_name(a_test_base_name), - test_meta_factory(a_test_meta_factory) {} - - const string test_case_base_name; - const string test_base_name; - const scoped_ptr > test_meta_factory; - }; - typedef ::std::vector > TestInfoContainer; - // Keeps pairs of - // received from INSTANTIATE_TEST_CASE_P macros. - typedef ::std::vector > - InstantiationContainer; - - const string test_case_name_; - TestInfoContainer tests_; - InstantiationContainer instantiations_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); -}; // class ParameterizedTestCaseInfo - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase -// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P -// macros use it to locate their corresponding ParameterizedTestCaseInfo -// descriptors. -class ParameterizedTestCaseRegistry { - public: - ParameterizedTestCaseRegistry() {} - ~ParameterizedTestCaseRegistry() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - delete *it; - } - } - - // Looks up or creates and returns a structure containing information about - // tests and instantiations of a particular test case. - template - ParameterizedTestCaseInfo* GetTestCasePatternHolder( - const char* test_case_name, - const char* file, - int line) { - ParameterizedTestCaseInfo* typed_test_info = NULL; - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - if ((*it)->GetTestCaseName() == test_case_name) { - if ((*it)->GetTestCaseTypeId() != GetTypeId()) { - // Complain about incorrect usage of Google Test facilities - // and terminate the program since we cannot guaranty correct - // test case setup and tear-down in this case. 
- ReportInvalidTestCaseType(test_case_name, file, line); - posix::Abort(); - } else { - // At this point we are sure that the object we found is of the same - // type we are looking for, so we downcast it to that type - // without further checks. - typed_test_info = CheckedDowncastToActualType< - ParameterizedTestCaseInfo >(*it); - } - break; - } - } - if (typed_test_info == NULL) { - typed_test_info = new ParameterizedTestCaseInfo(test_case_name); - test_case_infos_.push_back(typed_test_info); - } - return typed_test_info; - } - void RegisterTests() { - for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); - it != test_case_infos_.end(); ++it) { - (*it)->RegisterTests(); - } - } - - private: - typedef ::std::vector TestCaseInfoContainer; - - TestCaseInfoContainer test_case_infos_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); -}; - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ -// This file was GENERATED by command: -// pump.py gtest-param-util-generated.h.pump -// DO NOT EDIT BY HAND!!! - -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) - -// Type and function utilities for implementing parameterized tests. -// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! -// -// Currently Google Test supports at most 50 arguments in Values, -// and at most 10 arguments in Combine. Please contact -// googletestframework@googlegroups.com if you need more. -// Please note that the number of arguments to Combine is limited -// by the maximum arity of the implementation of tr1::tuple which is -// currently set at 10. - -#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. 
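// Illustrative sketch, not part of the original header: Values() captures its
// arguments in one of the ValueArrayN classes below, and the templated
// conversion operator turns them into a ParamGenerator of whatever parameter
// type the test needs.  ParamGenerator is an internal type; it is spelled out
// here only to make that conversion visible.  Assumes the fused gtest header
// is available as "gtest/gtest.h" and that param tests are enabled.

#include <iostream>
#include "gtest/gtest.h"

int main() {
  ::testing::internal::ParamGenerator<double> gen =
      ::testing::Values(1, 2.5f, 3.0);          // int, float, double -> double
  typedef ::testing::internal::ParamGenerator<double>::iterator Iter;
  for (Iter it = gen.begin(); it != gen.end(); ++it)
    std::cout << *it << ' ';                    // 1 2.5 3
  std::cout << '\n';
}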
- -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Forward declarations of ValuesIn(), which is implemented in -// include/gtest/gtest-param-test.h. -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end); - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]); - -template -internal::ParamGenerator ValuesIn( - const Container& container); - -namespace internal { - -// Used in the Values() function to provide polymorphic capabilities. -template -class ValueArray1 { - public: - explicit ValueArray1(T1 v1) : v1_(v1) {} - - template - operator ParamGenerator() const { return ValuesIn(&v1_, &v1_ + 1); } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray1& other); - - const T1 v1_; -}; - -template -class ValueArray2 { - public: - ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray2& other); - - const T1 v1_; - const T2 v2_; -}; - -template -class ValueArray3 { - public: - ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray3& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; -}; - -template -class ValueArray4 { - public: - ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray4& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; -}; - -template -class ValueArray5 { - public: - ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray5& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; -}; - -template -class ValueArray6 { - public: - ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray6& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; -}; - -template -class ValueArray7 { - public: - ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray7& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; -}; - -template -class ValueArray8 { - public: - ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray8& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; -}; - -template -class ValueArray9 { - public: - ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray9& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; -}; - -template -class ValueArray10 { - public: - ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray10& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; -}; - -template -class ValueArray11 { - public: - ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray11& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; -}; - -template -class ValueArray12 { - public: - ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray12& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; -}; - -template -class ValueArray13 { - public: - ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray13& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; -}; - -template -class ValueArray14 { - public: - ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray14& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; -}; - -template -class ValueArray15 { - public: - ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray15& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; -}; - -template -class ValueArray16 { - public: - ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray16& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; -}; - -template -class ValueArray17 { - public: - ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray17& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; -}; - -template -class ValueArray18 { - public: - ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray18& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; -}; - -template -class ValueArray19 { - public: - ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray19& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; -}; - -template -class ValueArray20 { - public: - ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray20& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; -}; - -template -class ValueArray21 { - public: - ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray21& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; -}; - -template -class ValueArray22 { - public: - ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray22& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; -}; - -template -class ValueArray23 { - public: - ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, - v23_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray23& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; -}; - -template -class ValueArray24 { - public: - ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray24& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; -}; - -template -class ValueArray25 { - public: - ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray25& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; -}; - -template -class ValueArray26 { - public: - ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray26& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; -}; - -template -class ValueArray27 { - public: - ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray27& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; -}; - -template -class ValueArray28 { - public: - ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray28& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; -}; - -template -class ValueArray29 { - public: - ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray29& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; -}; - -template -class ValueArray30 { - public: - ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray30& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; -}; - -template -class ValueArray31 { - public: - ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray31& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; -}; - -template -class ValueArray32 { - public: - ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray32& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; -}; - -template -class ValueArray33 { - public: - ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray33& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; -}; - -template -class ValueArray34 { - public: - ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray34& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; -}; - -template -class ValueArray35 { - public: - ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, - v35_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray35& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; -}; - -template -class ValueArray36 { - public: - ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment 
is unsupported. - void operator=(const ValueArray36& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; -}; - -template -class ValueArray37 { - public: - ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray37& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; -}; - -template -class ValueArray38 { - public: - ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray38& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; -}; - -template -class ValueArray39 { - public: - ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray39& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; -}; - -template -class ValueArray40 { - public: - ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray40& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; -}; - -template -class ValueArray41 { - public: - ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray41& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; -}; - -template -class ValueArray42 { - public: - ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray42& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; -}; - -template -class ValueArray43 { - public: - ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), - v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), - v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), - v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), - v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), - v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), - v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray43& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; -}; - -template -class ValueArray44 { - public: - ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), - v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), - v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), - v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), - v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), - v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), - v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), - v43_(v43), v44_(v44) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray44& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; -}; - -template -class ValueArray45 { - public: - ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), - v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), - v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), - v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), - v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), - v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), - v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), - v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray45& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; -}; - -template -class ValueArray46 { - public: - ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), - v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray46& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; -}; - -template -class ValueArray47 { - public: - ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), - v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), - v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), - v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), - v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), - v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), - v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), - v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), - v47_(v47) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, - v47_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray47& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; -}; - -template -class ValueArray48 { - public: - ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), - v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), - v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), - v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), - v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), - v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), - v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), - v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), - v46_(v46), v47_(v47), v48_(v48) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, - v48_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray48& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; -}; - -template -class ValueArray49 { - public: - ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, - T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, - v48_, v49_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const ValueArray49& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; -}; - -template -class ValueArray50 { - public: - ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, - T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), - v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), - v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), - v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), - v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), - v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), - v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), - v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} - - template - operator ParamGenerator() const { - const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, - v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, - v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, - v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, - v48_, v49_, v50_}; - return ValuesIn(array); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const ValueArray50& other); - - const T1 v1_; - const T2 v2_; - const T3 v3_; - const T4 v4_; - const T5 v5_; - const T6 v6_; - const T7 v7_; - const T8 v8_; - const T9 v9_; - const T10 v10_; - const T11 v11_; - const T12 v12_; - const T13 v13_; - const T14 v14_; - const T15 v15_; - const T16 v16_; - const T17 v17_; - const T18 v18_; - const T19 v19_; - const T20 v20_; - const T21 v21_; - const T22 v22_; - const T23 v23_; - const T24 v24_; - const T25 v25_; - const T26 v26_; - const T27 v27_; - const T28 v28_; - const T29 v29_; - const T30 v30_; - const T31 v31_; - const T32 v32_; - const T33 v33_; - const T34 v34_; - const T35 v35_; - const T36 v36_; - const T37 v37_; - const T38 v38_; - const T39 v39_; - const T40 v40_; - const T41 v41_; - const T42 v42_; - const T43 v43_; - const T44 v44_; - const T45 v45_; - const T46 v46_; - const T47 v47_; - const T48 v48_; - const T49 v49_; - const T50 v50_; -}; - -# if GTEST_HAS_COMBINE -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Generates values from the Cartesian product of values produced -// by the argument generators. 
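Editor's note: a minimal usage sketch (not part of the deleted header) showing how the ValueArrayN classes above are reached from user code in this vintage of Google Test. ::testing::Values(...) returns a ValueArrayN object; its templated conversion operator copies the arguments into a local array and forwards them to ValuesIn(), yielding a ParamGenerator<T> for whatever T the fixture's TestWithParam<T> declares. The fixture and test names below are illustrative.

#include <gtest/gtest.h>

class SmallIntTest : public ::testing::TestWithParam<int> {};

TEST_P(SmallIntTest, IsNonNegative) {
  EXPECT_GE(GetParam(), 0);
}

// Values(0, 1, 2) builds a ValueArray3<int, int, int>, which converts to
// ParamGenerator<int> when the instantiation below is registered.
INSTANTIATE_TEST_CASE_P(Small, SmallIntTest, ::testing::Values(0, 1, 2));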
-// -template -class CartesianProductGenerator2 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator2(const ParamGenerator& g1, - const ParamGenerator& g2) - : g1_(g1), g2_(g2) {} - virtual ~CartesianProductGenerator2() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current2_; - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - ParamType current_value_; - }; // class CartesianProductGenerator2::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator2& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; -}; // class CartesianProductGenerator2 - - -template -class CartesianProductGenerator3 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator3(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - virtual ~CartesianProductGenerator3() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current3_; - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_; - } - - // No implementation - assignment is unsupported. 
- void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - ParamType current_value_; - }; // class CartesianProductGenerator3::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator3& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; -}; // class CartesianProductGenerator3 - - -template -class CartesianProductGenerator4 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator4(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - virtual ~CartesianProductGenerator4() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current4_; - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." 
<< std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - ParamType current_value_; - }; // class CartesianProductGenerator4::Iterator - - // No implementation - assignment is unsupported. 
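Editor's note: the Advance() methods above all implement the same carry ("odometer") traversal: the right-most component iterator moves fastest and, when it wraps back to its begin, the next slower iterator is bumped; AtEnd() reports the end as soon as any component reaches its end, so an empty component range makes the whole product empty. A self-contained sketch of that loop, with illustrative data:

#include <cstdio>

int main() {
  const int  xs[] = {1, 2, 3};
  const char ys[] = {'a', 'b'};
  const int nx = sizeof(xs) / sizeof(xs[0]);
  const int ny = sizeof(ys) / sizeof(ys[0]);
  int i = 0, j = 0;
  while (i < nx && j < ny) {        // AtEnd(): stop once any index passes its range
    std::printf("(%d, %c)\n", xs[i], ys[j]);
    ++j;                            // Advance(): the right-most index moves fastest
    if (j == ny) { j = 0; ++i; }    // on wrap, reset it and carry into the slower index
  }
  return 0;                         // prints the 3 x 2 = 6 pairs in odometer order
}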
- void operator=(const CartesianProductGenerator4& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; -}; // class CartesianProductGenerator4 - - -template -class CartesianProductGenerator5 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator5(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - virtual ~CartesianProductGenerator5() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current5_; - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - ParamType current_value_; - }; // class CartesianProductGenerator5::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator5& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; -}; // class CartesianProductGenerator5 - - -template -class CartesianProductGenerator6 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator6(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - virtual ~CartesianProductGenerator6() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current6_; - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - ParamType current_value_; - }; // class CartesianProductGenerator6::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator6& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; -}; // class CartesianProductGenerator6 - - -template -class CartesianProductGenerator7 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator7(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - virtual ~CartesianProductGenerator7() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current7_; - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. 
- GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - ParamType current_value_; - }; // class CartesianProductGenerator7::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator7& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; -}; // class CartesianProductGenerator7 - - -template -class CartesianProductGenerator8 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator8(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - virtual ~CartesianProductGenerator8() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current8_; - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. 
- const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - ParamType current_value_; - }; // class CartesianProductGenerator8::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator8& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; -}; // class CartesianProductGenerator8 - - -template -class CartesianProductGenerator9 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator9(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - virtual ~CartesianProductGenerator9() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9) - : base_(base), - 
begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. - virtual void Advance() { - assert(!AtEnd()); - ++current9_; - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). 
- return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_; - } - - // No implementation - assignment is unsupported. - void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - ParamType current_value_; - }; // class CartesianProductGenerator9::Iterator - - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductGenerator9& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; -}; // class CartesianProductGenerator9 - - -template -class CartesianProductGenerator10 - : public ParamGeneratorInterface< ::std::tr1::tuple > { - public: - typedef ::std::tr1::tuple ParamType; - - CartesianProductGenerator10(const ParamGenerator& g1, - const ParamGenerator& g2, const ParamGenerator& g3, - const ParamGenerator& g4, const ParamGenerator& g5, - const ParamGenerator& g6, const ParamGenerator& g7, - const ParamGenerator& g8, const ParamGenerator& g9, - const ParamGenerator& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - virtual ~CartesianProductGenerator10() {} - - virtual ParamIteratorInterface* Begin() const { - return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, - g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, - g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); - } - virtual ParamIteratorInterface* End() const { - return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), - g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, - g8_.end(), g9_, g9_.end(), g10_, g10_.end()); - } - - private: - class Iterator : public ParamIteratorInterface { - public: - Iterator(const ParamGeneratorInterface* base, - const ParamGenerator& g1, - const typename ParamGenerator::iterator& current1, - const ParamGenerator& g2, - const typename ParamGenerator::iterator& current2, - const ParamGenerator& g3, - const typename ParamGenerator::iterator& current3, - const ParamGenerator& g4, - const typename ParamGenerator::iterator& current4, - const ParamGenerator& g5, - const typename ParamGenerator::iterator& current5, - const ParamGenerator& g6, - const typename ParamGenerator::iterator& current6, - const ParamGenerator& g7, - const typename ParamGenerator::iterator& current7, - const ParamGenerator& g8, - const typename ParamGenerator::iterator& current8, - const ParamGenerator& g9, - const typename ParamGenerator::iterator& current9, - const ParamGenerator& g10, - const typename ParamGenerator::iterator& current10) - : base_(base), - begin1_(g1.begin()), end1_(g1.end()), current1_(current1), - begin2_(g2.begin()), end2_(g2.end()), current2_(current2), - begin3_(g3.begin()), end3_(g3.end()), current3_(current3), - begin4_(g4.begin()), end4_(g4.end()), current4_(current4), - begin5_(g5.begin()), end5_(g5.end()), current5_(current5), - begin6_(g6.begin()), end6_(g6.end()), current6_(current6), - begin7_(g7.begin()), end7_(g7.end()), current7_(current7), - begin8_(g8.begin()), end8_(g8.end()), current8_(current8), - begin9_(g9.begin()), end9_(g9.end()), current9_(current9), - begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { - ComputeCurrentValue(); - } - virtual ~Iterator() {} - - virtual const ParamGeneratorInterface* BaseGenerator() const { - return base_; - } - // Advance should not be called on beyond-of-range iterators - // so no component iterators must be beyond end of range, either. 
- virtual void Advance() { - assert(!AtEnd()); - ++current10_; - if (current10_ == end10_) { - current10_ = begin10_; - ++current9_; - } - if (current9_ == end9_) { - current9_ = begin9_; - ++current8_; - } - if (current8_ == end8_) { - current8_ = begin8_; - ++current7_; - } - if (current7_ == end7_) { - current7_ = begin7_; - ++current6_; - } - if (current6_ == end6_) { - current6_ = begin6_; - ++current5_; - } - if (current5_ == end5_) { - current5_ = begin5_; - ++current4_; - } - if (current4_ == end4_) { - current4_ = begin4_; - ++current3_; - } - if (current3_ == end3_) { - current3_ = begin3_; - ++current2_; - } - if (current2_ == end2_) { - current2_ = begin2_; - ++current1_; - } - ComputeCurrentValue(); - } - virtual ParamIteratorInterface* Clone() const { - return new Iterator(*this); - } - virtual const ParamType* Current() const { return ¤t_value_; } - virtual bool Equals(const ParamIteratorInterface& other) const { - // Having the same base generator guarantees that the other - // iterator is of the same type and we can downcast. - GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) - << "The program attempted to compare iterators " - << "from different generators." << std::endl; - const Iterator* typed_other = - CheckedDowncastToActualType(&other); - // We must report iterators equal if they both point beyond their - // respective ranges. That can happen in a variety of fashions, - // so we have to consult AtEnd(). - return (AtEnd() && typed_other->AtEnd()) || - ( - current1_ == typed_other->current1_ && - current2_ == typed_other->current2_ && - current3_ == typed_other->current3_ && - current4_ == typed_other->current4_ && - current5_ == typed_other->current5_ && - current6_ == typed_other->current6_ && - current7_ == typed_other->current7_ && - current8_ == typed_other->current8_ && - current9_ == typed_other->current9_ && - current10_ == typed_other->current10_); - } - - private: - Iterator(const Iterator& other) - : base_(other.base_), - begin1_(other.begin1_), - end1_(other.end1_), - current1_(other.current1_), - begin2_(other.begin2_), - end2_(other.end2_), - current2_(other.current2_), - begin3_(other.begin3_), - end3_(other.end3_), - current3_(other.current3_), - begin4_(other.begin4_), - end4_(other.end4_), - current4_(other.current4_), - begin5_(other.begin5_), - end5_(other.end5_), - current5_(other.current5_), - begin6_(other.begin6_), - end6_(other.end6_), - current6_(other.current6_), - begin7_(other.begin7_), - end7_(other.end7_), - current7_(other.current7_), - begin8_(other.begin8_), - end8_(other.end8_), - current8_(other.current8_), - begin9_(other.begin9_), - end9_(other.end9_), - current9_(other.current9_), - begin10_(other.begin10_), - end10_(other.end10_), - current10_(other.current10_) { - ComputeCurrentValue(); - } - - void ComputeCurrentValue() { - if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); - } - bool AtEnd() const { - // We must report iterator past the end of the range when either of the - // component iterators has reached the end of its range. - return - current1_ == end1_ || - current2_ == end2_ || - current3_ == end3_ || - current4_ == end4_ || - current5_ == end5_ || - current6_ == end6_ || - current7_ == end7_ || - current8_ == end8_ || - current9_ == end9_ || - current10_ == end10_; - } - - // No implementation - assignment is unsupported. 
- void operator=(const Iterator& other); - - const ParamGeneratorInterface* const base_; - // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. - // current[i]_ is the actual traversing iterator. - const typename ParamGenerator::iterator begin1_; - const typename ParamGenerator::iterator end1_; - typename ParamGenerator::iterator current1_; - const typename ParamGenerator::iterator begin2_; - const typename ParamGenerator::iterator end2_; - typename ParamGenerator::iterator current2_; - const typename ParamGenerator::iterator begin3_; - const typename ParamGenerator::iterator end3_; - typename ParamGenerator::iterator current3_; - const typename ParamGenerator::iterator begin4_; - const typename ParamGenerator::iterator end4_; - typename ParamGenerator::iterator current4_; - const typename ParamGenerator::iterator begin5_; - const typename ParamGenerator::iterator end5_; - typename ParamGenerator::iterator current5_; - const typename ParamGenerator::iterator begin6_; - const typename ParamGenerator::iterator end6_; - typename ParamGenerator::iterator current6_; - const typename ParamGenerator::iterator begin7_; - const typename ParamGenerator::iterator end7_; - typename ParamGenerator::iterator current7_; - const typename ParamGenerator::iterator begin8_; - const typename ParamGenerator::iterator end8_; - typename ParamGenerator::iterator current8_; - const typename ParamGenerator::iterator begin9_; - const typename ParamGenerator::iterator end9_; - typename ParamGenerator::iterator current9_; - const typename ParamGenerator::iterator begin10_; - const typename ParamGenerator::iterator end10_; - typename ParamGenerator::iterator current10_; - ParamType current_value_; - }; // class CartesianProductGenerator10::Iterator - - // No implementation - assignment is unsupported. - void operator=(const CartesianProductGenerator10& other); - - const ParamGenerator g1_; - const ParamGenerator g2_; - const ParamGenerator g3_; - const ParamGenerator g4_; - const ParamGenerator g5_; - const ParamGenerator g6_; - const ParamGenerator g7_; - const ParamGenerator g8_; - const ParamGenerator g9_; - const ParamGenerator g10_; -}; // class CartesianProductGenerator10 - - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Helper classes providing Combine() with polymorphic features. They allow -// casting CartesianProductGeneratorN to ParamGenerator if T is -// convertible to U. -// -template -class CartesianProductHolder2 { - public: -CartesianProductHolder2(const Generator1& g1, const Generator2& g2) - : g1_(g1), g2_(g2) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator2( - static_cast >(g1_), - static_cast >(g2_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder2& other); - - const Generator1 g1_; - const Generator2 g2_; -}; // class CartesianProductHolder2 - -template -class CartesianProductHolder3 { - public: -CartesianProductHolder3(const Generator1& g1, const Generator2& g2, - const Generator3& g3) - : g1_(g1), g2_(g2), g3_(g3) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator3( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder3& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; -}; // class CartesianProductHolder3 - -template -class CartesianProductHolder4 { - public: -CartesianProductHolder4(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator4( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder4& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; -}; // class CartesianProductHolder4 - -template -class CartesianProductHolder5 { - public: -CartesianProductHolder5(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator5( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder5& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; -}; // class CartesianProductHolder5 - -template -class CartesianProductHolder6 { - public: -CartesianProductHolder6(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator6( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder6& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; -}; // class CartesianProductHolder6 - -template -class CartesianProductHolder7 { - public: -CartesianProductHolder7(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator7( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder7& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; -}; // class CartesianProductHolder7 - -template -class CartesianProductHolder8 { - public: -CartesianProductHolder8(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), - g8_(g8) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator8( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder8& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; -}; // class CartesianProductHolder8 - -template -class CartesianProductHolder9 { - public: -CartesianProductHolder9(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator9( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_))); - } - - private: - // No implementation - assignment is unsupported. - void operator=(const CartesianProductHolder9& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; -}; // class CartesianProductHolder9 - -template -class CartesianProductHolder10 { - public: -CartesianProductHolder10(const Generator1& g1, const Generator2& g2, - const Generator3& g3, const Generator4& g4, const Generator5& g5, - const Generator6& g6, const Generator7& g7, const Generator8& g8, - const Generator9& g9, const Generator10& g10) - : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), - g9_(g9), g10_(g10) {} - template - operator ParamGenerator< ::std::tr1::tuple >() const { - return ParamGenerator< ::std::tr1::tuple >( - new CartesianProductGenerator10( - static_cast >(g1_), - static_cast >(g2_), - static_cast >(g3_), - static_cast >(g4_), - static_cast >(g5_), - static_cast >(g6_), - static_cast >(g7_), - static_cast >(g8_), - static_cast >(g9_), - static_cast >(g10_))); - } - - private: - // No implementation - assignment is unsupported. 
- void operator=(const CartesianProductHolder10& other); - - const Generator1 g1_; - const Generator2 g2_; - const Generator3 g3_; - const Generator4 g4_; - const Generator5 g5_; - const Generator6 g6_; - const Generator7 g7_; - const Generator8 g8_; - const Generator9 g9_; - const Generator10 g10_; -}; // class CartesianProductHolder10 - -# endif // GTEST_HAS_COMBINE - -} // namespace internal -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ - -#if GTEST_HAS_PARAM_TEST - -namespace testing { - -// Functions producing parameter generators. -// -// Google Test uses these generators to produce parameters for value- -// parameterized tests. When a parameterized test case is instantiated -// with a particular generator, Google Test creates and runs tests -// for each element in the sequence produced by the generator. -// -// In the following sample, tests from test case FooTest are instantiated -// each three times with parameter values 3, 5, and 8: -// -// class FooTest : public TestWithParam { ... }; -// -// TEST_P(FooTest, TestThis) { -// } -// TEST_P(FooTest, TestThat) { -// } -// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); -// - -// Range() returns generators providing sequences of values in a range. -// -// Synopsis: -// Range(start, end) -// - returns a generator producing a sequence of values {start, start+1, -// start+2, ..., }. -// Range(start, end, step) -// - returns a generator producing a sequence of values {start, start+step, -// start+step+step, ..., }. -// Notes: -// * The generated sequences never include end. For example, Range(1, 5) -// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) -// returns a generator producing {1, 3, 5, 7}. -// * start and end must have the same type. That type may be any integral or -// floating-point type or a user defined type satisfying these conditions: -// * It must be assignable (have operator=() defined). -// * It must have operator+() (operator+(int-compatible type) for -// two-operand version). -// * It must have operator<() defined. -// Elements in the resulting sequences will also have that type. -// * Condition start < end must be satisfied in order for resulting sequences -// to contain any elements. -// -template -internal::ParamGenerator Range(T start, T end, IncrementT step) { - return internal::ParamGenerator( - new internal::RangeGenerator(start, end, step)); -} - -template -internal::ParamGenerator Range(T start, T end) { - return Range(start, end, 1); -} - -// ValuesIn() function allows generation of tests with parameters coming from -// a container. -// -// Synopsis: -// ValuesIn(const T (&array)[N]) -// - returns a generator producing sequences with elements from -// a C-style array. -// ValuesIn(const Container& container) -// - returns a generator producing sequences with elements from -// an STL-style container. -// ValuesIn(Iterator begin, Iterator end) -// - returns a generator producing sequences with elements from -// a range [begin, end) defined by a pair of STL-style iterators. These -// iterators can also be plain C pointers. -// -// Please note that ValuesIn copies the values from the containers -// passed in and keeps them to generate tests in RUN_ALL_TESTS(). 
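As a compilable companion to the TEST_P sample and the Range() synopsis above, here is a minimal sketch of the two together; the fixture name FooTest follows the comments, and the "gtest/gtest.h" include path is an assumption for this bundled copy:

#include "gtest/gtest.h"  // include path assumed

class FooTest : public ::testing::TestWithParam<int> {};

TEST_P(FooTest, IsOdd) {
  EXPECT_EQ(1, GetParam() % 2);  // GetParam() returns the current parameter
}

// Range(1, 9, 2) yields 1, 3, 5, 7 -- the end value 9 is never included.
INSTANTIATE_TEST_CASE_P(OddNumbers, FooTest, ::testing::Range(1, 9, 2));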
-// -// Examples: -// -// This instantiates tests from test case StringTest -// each with C-string values of "foo", "bar", and "baz": -// -// const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); -// -// This instantiates tests from test case StlStringTest -// each with STL strings with values "a" and "b": -// -// ::std::vector< ::std::string> GetParameterStrings() { -// ::std::vector< ::std::string> v; -// v.push_back("a"); -// v.push_back("b"); -// return v; -// } -// -// INSTANTIATE_TEST_CASE_P(CharSequence, -// StlStringTest, -// ValuesIn(GetParameterStrings())); -// -// -// This will also instantiate tests from CharTest -// each with parameter values 'a' and 'b': -// -// ::std::list GetParameterChars() { -// ::std::list list; -// list.push_back('a'); -// list.push_back('b'); -// return list; -// } -// ::std::list l = GetParameterChars(); -// INSTANTIATE_TEST_CASE_P(CharSequence2, -// CharTest, -// ValuesIn(l.begin(), l.end())); -// -template -internal::ParamGenerator< - typename ::testing::internal::IteratorTraits::value_type> -ValuesIn(ForwardIterator begin, ForwardIterator end) { - typedef typename ::testing::internal::IteratorTraits - ::value_type ParamType; - return internal::ParamGenerator( - new internal::ValuesInIteratorRangeGenerator(begin, end)); -} - -template -internal::ParamGenerator ValuesIn(const T (&array)[N]) { - return ValuesIn(array, array + N); -} - -template -internal::ParamGenerator ValuesIn( - const Container& container) { - return ValuesIn(container.begin(), container.end()); -} - -// Values() allows generating tests from explicitly specified list of -// parameters. -// -// Synopsis: -// Values(T v1, T v2, ..., T vN) -// - returns a generator producing sequences with elements v1, v2, ..., vN. -// -// For example, this instantiates tests from test case BarTest each -// with values "one", "two", and "three": -// -// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); -// -// This instantiates tests from test case BazTest each with values 1, 2, 3.5. -// The exact type of values will depend on the type of parameter in BazTest. -// -// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); -// -// Currently, Values() supports from 1 to 50 parameters. 
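The ValuesIn() examples in the surrounding comments lost their template arguments in this dump (e.g. the TestWithParam and std::list element types). A self-contained restatement of the STL-string case, assuming the bundled header is reachable as "gtest/gtest.h":

#include <string>
#include <vector>
#include "gtest/gtest.h"  // include path assumed

class StlStringTest : public ::testing::TestWithParam< ::std::string> {};

TEST_P(StlStringTest, IsNotEmpty) {
  EXPECT_FALSE(GetParam().empty());
}

::std::vector< ::std::string> GetParameterStrings() {
  ::std::vector< ::std::string> v;
  v.push_back("a");
  v.push_back("b");
  return v;
}

// ValuesIn() copies the elements, so passing a temporary container is safe.
INSTANTIATE_TEST_CASE_P(CharSequence, StlStringTest,
                        ::testing::ValuesIn(GetParameterStrings()));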
-// -template -internal::ValueArray1 Values(T1 v1) { - return internal::ValueArray1(v1); -} - -template -internal::ValueArray2 Values(T1 v1, T2 v2) { - return internal::ValueArray2(v1, v2); -} - -template -internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { - return internal::ValueArray3(v1, v2, v3); -} - -template -internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { - return internal::ValueArray4(v1, v2, v3, v4); -} - -template -internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5) { - return internal::ValueArray5(v1, v2, v3, v4, v5); -} - -template -internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6) { - return internal::ValueArray6(v1, v2, v3, v4, v5, v6); -} - -template -internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7) { - return internal::ValueArray7(v1, v2, v3, v4, v5, - v6, v7); -} - -template -internal::ValueArray8 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { - return internal::ValueArray8(v1, v2, v3, v4, - v5, v6, v7, v8); -} - -template -internal::ValueArray9 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { - return internal::ValueArray9(v1, v2, v3, - v4, v5, v6, v7, v8, v9); -} - -template -internal::ValueArray10 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { - return internal::ValueArray10(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10); -} - -template -internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11) { - return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); -} - -template -internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12) { - return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); -} - -template -internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13) { - return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); -} - -template -internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { - return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14); -} - -template -internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { - return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15); -} - -template -internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16) { - return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16); -} - -template -internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17) { - return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17); -} - -template -internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18) { - return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18); -} - -template -internal::ValueArray19 
Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { - return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); -} - -template -internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { - return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); -} - -template -internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { - return internal::ValueArray21(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); -} - -template -internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22) { - return internal::ValueArray22(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22); -} - -template -internal::ValueArray23 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23) { - return internal::ValueArray23(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23); -} - -template -internal::ValueArray24 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24) { - return internal::ValueArray24(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24); -} - -template -internal::ValueArray25 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { - return internal::ValueArray25(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25); -} - -template -internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26) { - return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); -} - -template -internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27) { - return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); -} - -template -internal::ValueArray28 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28) { - return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28); -} - -template -internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29) { - return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29); -} - -template -internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { - return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30); -} - -template -internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { - return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31); -} - -template -internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32) { - return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32); -} - -template -internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33) { - return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); -} - -template -internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34) { - return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); -} - -template -internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { - return internal::ValueArray35(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); -} - -template -internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { - return internal::ValueArray36(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36); -} - -template -internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, - T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37) { - return internal::ValueArray37(v1, v2, v3, - v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37); -} - -template -internal::ValueArray38 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38) { - return internal::ValueArray38(v1, v2, - v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, - v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, - v33, v34, v35, v36, v37, v38); -} - -template -internal::ValueArray39 Values(T1 v1, T2 v2, - T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, - T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, - T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, - T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, - T37 v37, T38 v38, T39 v39) { - return internal::ValueArray39(v1, - v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, - v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, - v32, v33, v34, v35, v36, v37, v38, v39); -} - -template -internal::ValueArray40 Values(T1 v1, - T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, - T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, - T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, - T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, - T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { - return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, - v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, - v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); -} - -template -internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - 
T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { - return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, - v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, - v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); -} - -template -internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42) { - return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, - v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, - v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, - v42); -} - -template -internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43) { - return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, - v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, - v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, - v41, v42, v43); -} - -template -internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, - T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, - T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, - T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, - T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, - T42 v42, T43 v43, T44 v44) { - return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, - v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, - v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, - v40, v41, v42, v43, v44); -} - -template -internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, - T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, - T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, - T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, - T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, - T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { - return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, - v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, - v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, - v39, v40, v41, v42, v43, v44, v45); -} - -template -internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 
v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { - return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, - v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46); -} - -template -internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, - T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { - return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, - v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, - v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, - v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); -} - -template -internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, - T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, - T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, - T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, - T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, - T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, - T48 v48) { - return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, - v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, - v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, - v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); -} - -template -internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, - T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, - T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, - T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, - T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, - T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, - T47 v47, T48 v48, T49 v49) { - return internal::ValueArray49(v1, v2, v3, v4, v5, v6, - v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, - v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, - v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); -} - -template -internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, - T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, - T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, - T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, - T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, - T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, - T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { - return internal::ValueArray50(v1, v2, v3, v4, - v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, - v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, - v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, - v48, v49, v50); -} - -// Bool() allows generating tests with parameters in a set of (false, true). -// -// Synopsis: -// Bool() -// - returns a generator producing sequences with elements {false, true}. 
-// -// It is useful when testing code that depends on Boolean flags. Combinations -// of multiple flags can be tested when several Bool()'s are combined using -// Combine() function. -// -// In the following example all tests in the test case FlagDependentTest -// will be instantiated twice with parameters false and true. -// -// class FlagDependentTest : public testing::TestWithParam { -// virtual void SetUp() { -// external_flag = GetParam(); -// } -// } -// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); -// -inline internal::ParamGenerator Bool() { - return Values(false, true); -} - -# if GTEST_HAS_COMBINE -// Combine() allows the user to combine two or more sequences to produce -// values of a Cartesian product of those sequences' elements. -// -// Synopsis: -// Combine(gen1, gen2, ..., genN) -// - returns a generator producing sequences with elements coming from -// the Cartesian product of elements from the sequences generated by -// gen1, gen2, ..., genN. The sequence elements will have a type of -// tuple where T1, T2, ..., TN are the types -// of elements from sequences produces by gen1, gen2, ..., genN. -// -// Combine can have up to 10 arguments. This number is currently limited -// by the maximum number of elements in the tuple implementation used by Google -// Test. -// -// Example: -// -// This will instantiate tests in test case AnimalTest each one with -// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), -// tuple("dog", BLACK), and tuple("dog", WHITE): -// -// enum Color { BLACK, GRAY, WHITE }; -// class AnimalTest -// : public testing::TestWithParam > {...}; -// -// TEST_P(AnimalTest, AnimalLooksNice) {...} -// -// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, -// Combine(Values("cat", "dog"), -// Values(BLACK, WHITE))); -// -// This will instantiate tests in FlagDependentTest with all variations of two -// Boolean flags: -// -// class FlagDependentTest -// : public testing::TestWithParam > { -// virtual void SetUp() { -// // Assigns external_flag_1 and external_flag_2 values from the tuple. -// tie(external_flag_1, external_flag_2) = GetParam(); -// } -// }; -// -// TEST_P(FlagDependentTest, TestFeature1) { -// // Test your code using external_flag_1 and external_flag_2 here. 
-// } -// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, -// Combine(Bool(), Bool())); -// -template -internal::CartesianProductHolder2 Combine( - const Generator1& g1, const Generator2& g2) { - return internal::CartesianProductHolder2( - g1, g2); -} - -template -internal::CartesianProductHolder3 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3) { - return internal::CartesianProductHolder3( - g1, g2, g3); -} - -template -internal::CartesianProductHolder4 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4) { - return internal::CartesianProductHolder4( - g1, g2, g3, g4); -} - -template -internal::CartesianProductHolder5 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5) { - return internal::CartesianProductHolder5( - g1, g2, g3, g4, g5); -} - -template -internal::CartesianProductHolder6 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6) { - return internal::CartesianProductHolder6( - g1, g2, g3, g4, g5, g6); -} - -template -internal::CartesianProductHolder7 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7) { - return internal::CartesianProductHolder7( - g1, g2, g3, g4, g5, g6, g7); -} - -template -internal::CartesianProductHolder8 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8) { - return internal::CartesianProductHolder8( - g1, g2, g3, g4, g5, g6, g7, g8); -} - -template -internal::CartesianProductHolder9 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9) { - return internal::CartesianProductHolder9( - g1, g2, g3, g4, g5, g6, g7, g8, g9); -} - -template -internal::CartesianProductHolder10 Combine( - const Generator1& g1, const Generator2& g2, const Generator3& g3, - const Generator4& g4, const Generator5& g5, const Generator6& g6, - const Generator7& g7, const Generator8& g8, const Generator9& g9, - const Generator10& g10) { - return internal::CartesianProductHolder10( - g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); -} -# endif // GTEST_HAS_COMBINE - - - -# define TEST_P(test_case_name, test_name) \ - class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ - : public test_case_name { \ - public: \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ - virtual void TestBody(); \ - private: \ - static int AddToRegistry() { \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\ - GetTestCasePatternHolder(\ - #test_case_name, __FILE__, __LINE__)->AddTestPattern(\ - #test_case_name, \ - #test_name, \ - new ::testing::internal::TestMetaFactory< \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ - return 0; \ - } \ - static int gtest_registering_dummy_; \ - GTEST_DISALLOW_COPY_AND_ASSIGN_(\ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ - }; \ - int GTEST_TEST_CLASS_NAME_(test_case_name, \ - test_name)::gtest_registering_dummy_ = \ - GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ - void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() - -# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \ - ::testing::internal::ParamGenerator \ - gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - int gtest_##prefix##test_case_name##_dummy_ = \ - ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ - GetTestCasePatternHolder(\ - #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\ - #prefix, \ - >est_##prefix##test_case_name##_EvalGenerator_, \ - __FILE__, __LINE__) - -} // namespace testing - -#endif // GTEST_HAS_PARAM_TEST - -#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ -// Copyright 2006, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// -// Google C++ Testing Framework definitions useful in production code. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ - -// When you need to test the private or protected members of a class, -// use the FRIEND_TEST macro to declare your tests as friends of the -// class. For example: -// -// class MyClass { -// private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); -// }; -// -// class MyClassTest : public testing::Test { -// // ... -// }; -// -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. 
-// } - -#define FRIEND_TEST(test_case_name, test_name)\ -friend class test_case_name##_##test_name##_Test - -#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ -// Copyright 2008, Google Inc. -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: mheule@google.com (Markus Heule) -// - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ - -#include -#include - -namespace testing { - -// A copyable object representing the result of a test part (i.e. an -// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). -// -// Don't inherit from TestPartResult as its destructor is not virtual. -class GTEST_API_ TestPartResult { - public: - // The possible outcomes of a test part (i.e. an assertion or an - // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). - enum Type { - kSuccess, // Succeeded. - kNonFatalFailure, // Failed but the test can continue. - kFatalFailure // Failed and the test should be terminated. - }; - - // C'tor. TestPartResult does NOT have a default constructor. - // Always use this constructor (with parameters) to create a - // TestPartResult object. - TestPartResult(Type a_type, - const char* a_file_name, - int a_line_number, - const char* a_message) - : type_(a_type), - file_name_(a_file_name), - line_number_(a_line_number), - summary_(ExtractSummary(a_message)), - message_(a_message) { - } - - // Gets the outcome of the test part. - Type type() const { return type_; } - - // Gets the name of the source file where the test part took place, or - // NULL if it's unknown. - const char* file_name() const { return file_name_.c_str(); } - - // Gets the line in the source file where the test part took place, - // or -1 if it's unknown. - int line_number() const { return line_number_; } - - // Gets the summary of the failure message. - const char* summary() const { return summary_.c_str(); } - - // Gets the message associated with the test part. - const char* message() const { return message_.c_str(); } - - // Returns true iff the test part passed. 
- bool passed() const { return type_ == kSuccess; } - - // Returns true iff the test part failed. - bool failed() const { return type_ != kSuccess; } - - // Returns true iff the test part non-fatally failed. - bool nonfatally_failed() const { return type_ == kNonFatalFailure; } - - // Returns true iff the test part fatally failed. - bool fatally_failed() const { return type_ == kFatalFailure; } - private: - Type type_; - - // Gets the summary of the failure message by omitting the stack - // trace in it. - static internal::String ExtractSummary(const char* message); - - // The name of the source file where the test part took place, or - // NULL if the source file is unknown. - internal::String file_name_; - // The line in the source file where the test part took place, or -1 - // if the line number is unknown. - int line_number_; - internal::String summary_; // The test failure summary. - internal::String message_; // The test failure message. -}; - -// Prints a TestPartResult object. -std::ostream& operator<<(std::ostream& os, const TestPartResult& result); - -// An array of TestPartResult objects. -// -// Don't inherit from TestPartResultArray as its destructor is not -// virtual. -class GTEST_API_ TestPartResultArray { - public: - TestPartResultArray() {} - - // Appends the given TestPartResult to the array. - void Append(const TestPartResult& result); - - // Returns the TestPartResult at the given index (0-based). - const TestPartResult& GetTestPartResult(int index) const; - - // Returns the number of TestPartResult objects in the array. - int size() const; - - private: - std::vector array_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); -}; - -// This interface knows how to report a test part result. -class TestPartResultReporterInterface { - public: - virtual ~TestPartResultReporterInterface() {} - - virtual void ReportTestPartResult(const TestPartResult& result) = 0; -}; - -namespace internal { - -// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a -// statement generates new fatal failures. To do so it registers itself as the -// current test part result reporter. Besides checking if fatal failures were -// reported, it only delegates the reporting to the former result reporter. -// The original result reporter is restored in the destructor. -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -class GTEST_API_ HasNewFatalFailureHelper - : public TestPartResultReporterInterface { - public: - HasNewFatalFailureHelper(); - virtual ~HasNewFatalFailureHelper(); - virtual void ReportTestPartResult(const TestPartResult& result); - bool has_new_fatal_failure() const { return has_new_fatal_failure_; } - private: - bool has_new_fatal_failure_; - TestPartResultReporterInterface* original_reporter_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); -}; - -} // namespace internal - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. 
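A small, hand-rolled illustration of the TestPartResult interface defined above: it constructs a result directly and queries its accessors (normally Google Test creates these objects itself when an assertion runs). The include path is an assumption for this bundled copy:

#include <iostream>
#include "gtest/gtest.h"  // include path assumed

void PrintResultSummary() {
  // Build a non-fatal failure result by hand and inspect it.
  ::testing::TestPartResult r(::testing::TestPartResult::kNonFatalFailure,
                              "foo_test.cc", 42, "Expected: 1\n  Actual: 2");
  std::cout << r.file_name() << ":" << r.line_number()
            << "  failed=" << r.failed()
            << "  fatal=" << r.fatally_failed() << std::endl;
}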
-// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) - -#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ -#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// This header implements typed tests and type-parameterized tests. - -// Typed (aka type-driven) tests repeat the same test for types in a -// list. You must know which types you want to test with when writing -// typed tests. Here's how you do it: - -#if 0 - -// First, define a fixture class template. It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - public: - ... - typedef std::list List; - static T shared_; - T value_; -}; - -// Next, associate a list of types with the test case, which will be -// repeated for each type in the list. The typedef is necessary for -// the macro to parse correctly. -typedef testing::Types MyTypes; -TYPED_TEST_CASE(FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// TYPED_TEST_CASE(FooTest, int); - -// Then, use TYPED_TEST() instead of TEST_F() to define as many typed -// tests for this test case as you want. -TYPED_TEST(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - // Since we are inside a derived class template, C++ requires use to - // visit the members of FooTest via 'this'. - TypeParam n = this->value_; - - // To visit static members of the fixture, add the TestFixture:: - // prefix. - n += TestFixture::shared_; - - // To refer to typedefs in the fixture, add the "typename - // TestFixture::" prefix. - typename TestFixture::List values; - values.push_back(n); - ... -} - -TYPED_TEST(FooTest, HasPropertyA) { ... } - -#endif // 0 - -// Type-parameterized tests are abstract test patterns parameterized -// by a type. Compared with typed tests, type-parameterized tests -// allow you to define the test pattern without knowing what the type -// parameters are. The defined pattern can be instantiated with -// different types any number of times, in any number of translation -// units. -// -// If you are designing an interface or concept, you can define a -// suite of type-parameterized tests to verify properties that any -// valid implementation of the interface/concept should have. Then, -// each implementation can easily instantiate the test suite to verify -// that it conforms to the requirements, without having to write -// similar tests repeatedly. Here's an example: - -#if 0 - -// First, define a fixture class template. 
It should be parameterized -// by a type. Remember to derive it from testing::Test. -template -class FooTest : public testing::Test { - ... -}; - -// Next, declare that you will define a type-parameterized test case -// (the _P suffix is for "parameterized" or "pattern", whichever you -// prefer): -TYPED_TEST_CASE_P(FooTest); - -// Then, use TYPED_TEST_P() to define as many type-parameterized tests -// for this type-parameterized test case as you want. -TYPED_TEST_P(FooTest, DoesBlah) { - // Inside a test, refer to TypeParam to get the type parameter. - TypeParam n = 0; - ... -} - -TYPED_TEST_P(FooTest, HasPropertyA) { ... } - -// Now the tricky part: you need to register all test patterns before -// you can instantiate them. The first argument of the macro is the -// test case name; the rest are the names of the tests in this test -// case. -REGISTER_TYPED_TEST_CASE_P(FooTest, - DoesBlah, HasPropertyA); - -// Finally, you are free to instantiate the pattern with the types you -// want. If you put the above code in a header file, you can #include -// it in multiple C++ source files and instantiate it multiple times. -// -// To distinguish different instances of the pattern, the first -// argument to the INSTANTIATE_* macro is a prefix that will be added -// to the actual test case name. Remember to pick unique prefixes for -// different instances. -typedef testing::Types MyTypes; -INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); - -// If the type list contains only one type, you can write that type -// directly without Types<...>: -// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); - -#endif // 0 - - -// Implements typed tests. - -#if GTEST_HAS_TYPED_TEST - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the typedef for the type parameters of the -// given test case. -# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) - -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() - -#endif // GTEST_HAS_TYPED_TEST - -// Implements type-parameterized tests. - -#if GTEST_HAS_TYPED_TEST_P - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the namespace name that the type-parameterized tests for -// the given type-parameterized test case are defined in. The exact -// name of the namespace is subject to change without notice. -# define GTEST_CASE_NAMESPACE_(TestCaseName) \ - gtest_case_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// -// Expands to the name of the variable used to remember the names of -// the defined tests in the given test case. 
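The #if 0 samples above lost their template arguments in this dump, so here is a minimal compilable version of the typed-test pattern using the FooTest and MyTypes names from those comments (this gtest vintage uses the *_CASE spelling of the macros; the include path is assumed):

#include "gtest/gtest.h"  // include path assumed

template <typename T>
class FooTest : public ::testing::Test {
 public:
  FooTest() : value_(T(5)) {}
  T value_;
};

typedef ::testing::Types<char, int, unsigned int> MyTypes;
TYPED_TEST_CASE(FooTest, MyTypes);  // repeats every TYPED_TEST below for each type

TYPED_TEST(FooTest, ValueIsFive) {
  TypeParam n = this->value_;       // fixture members need 'this->' inside the test
  EXPECT_EQ(TypeParam(5), n);
}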
-# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ - gtest_typed_test_case_p_state_##TestCaseName##_ - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. -// -// Expands to the name of the variable used to remember the names of -// the registered tests in the given test case. -# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ - gtest_registered_test_names_##TestCaseName##_ - -// The variables defined in the type-parameterized test macros are -// static as typically these macros are used in a .h file that can be -// #included in multiple translation units linked together. -# define TYPED_TEST_CASE_P(CaseName) \ - static ::testing::internal::TypedTestCasePState \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) - -# define TYPED_TEST_P(CaseName, TestName) \ - namespace GTEST_CASE_NAMESPACE_(CaseName) { \ - template \ - class TestName : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ - __FILE__, __LINE__, #CaseName, #TestName); \ - } \ - template \ - void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() - -/* # define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ */ -/* namespace GTEST_CASE_NAMESPACE_(CaseName) { \ */ -/* typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ */ -/* } \ */ -/* static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ */ -/* GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ */ -/* __FILE__, __LINE__, #__VA_ARGS__) */ - -// The 'Types' template argument below must have spaces around it -// since some compilers may choke on '>>' when passing a template -// instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) - -#endif // GTEST_HAS_TYPED_TEST_P - -#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ - -// Depending on the platform, different string classes are available. -// On Linux, in addition to ::std::string, Google also makes use of -// class ::string, which has the same interface as ::std::string, but -// has a different implementation. -// -// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that -// ::string is available AND is a distinct type to ::std::string, or -// define it to 0 to indicate otherwise. -// -// If the user's ::std::string and ::string are the same class due to -// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0. -// -// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined -// heuristically. - -namespace testing { - -// Declares the flags. - -// This flag temporary enables the disabled tests. -GTEST_DECLARE_bool_(also_run_disabled_tests); - -// This flag brings the debugger on an assertion failure. -GTEST_DECLARE_bool_(break_on_failure); - -// This flag controls whether Google Test catches all test-thrown exceptions -// and logs them as failures. -GTEST_DECLARE_bool_(catch_exceptions); - -// This flag enables using colors in terminal output. Available values are -// "yes" to enable colors, "no" (disable colors), or "auto" (the default) -// to let Google Test decide. -GTEST_DECLARE_string_(color); - -// This flag sets up the filter to select by name using a glob pattern -// the tests to run. 
If the filter is not given all tests are executed. -GTEST_DECLARE_string_(filter); - -// This flag causes the Google Test to list tests. None of the tests listed -// are actually run if the flag is provided. -GTEST_DECLARE_bool_(list_tests); - -// This flag controls whether Google Test emits a detailed XML report to a file -// in addition to its normal textual output. -GTEST_DECLARE_string_(output); - -// This flags control whether Google Test prints the elapsed time for each -// test. -GTEST_DECLARE_bool_(print_time); - -// This flag specifies the random number seed. -GTEST_DECLARE_int32_(random_seed); - -// This flag sets how many times the tests are repeated. The default value -// is 1. If the value is -1 the tests are repeating forever. -GTEST_DECLARE_int32_(repeat); - -// This flag controls whether Google Test includes Google Test internal -// stack frames in failure stack traces. -GTEST_DECLARE_bool_(show_internal_stack_frames); - -// When this flag is specified, tests' order is randomized on every iteration. -GTEST_DECLARE_bool_(shuffle); - -// This flag specifies the maximum number of stack frames to be -// printed in a failure message. -GTEST_DECLARE_int32_(stack_trace_depth); - -// When this flag is specified, a failed assertion will throw an -// exception if exceptions are enabled, or exit the program with a -// non-zero code otherwise. -GTEST_DECLARE_bool_(throw_on_failure); - -// When this flag is set with a "host:port" string, on supported -// platforms test results are streamed to the specified port on -// the specified host machine. -GTEST_DECLARE_string_(stream_result_to); - -// The upper limit for valid stack trace depths. -const int kMaxStackTraceDepth = 100; - -namespace internal { - -class AssertHelper; -class DefaultGlobalTestPartResultReporter; -class ExecDeathTest; -class NoExecDeathTest; -class FinalSuccessChecker; -class GTestFlagSaver; -class TestResultAccessor; -class TestEventListenersAccessor; -class TestEventRepeater; -class WindowsDeathTest; -class UnitTestImpl* GetUnitTestImpl(); -void ReportFailureInUnknownLocation(TestPartResult::Type result_type, - const String& message); - -// Converts a streamable value to a String. A NULL pointer is -// converted to "(null)". When the input value is a ::string, -// ::std::string, ::wstring, or ::std::wstring object, each NUL -// character in it is replaced with "\\0". -// Declared in gtest-internal.h but defined here, so that it has access -// to the definition of the Message class, required by the ARM -// compiler. -template -String StreamableToString(const T& streamable) { - return (Message() << streamable).GetString(); -} - -} // namespace internal - -// The friend relationship of some of these classes is cyclic. -// If we don't forward declare them the compiler might confuse the classes -// in friendship clauses with same named classes on the scope. -class Test; -class TestCase; -class TestInfo; -class UnitTest; - -// A class for indicating whether an assertion was successful. When -// the assertion wasn't successful, the AssertionResult object -// remembers a non-empty message that describes how it failed. -// -// To create an instance of this class, use one of the factory functions -// (AssertionSuccess() and AssertionFailure()). -// -// This class is useful for two purposes: -// 1. Defining predicate functions to be used with Boolean test assertions -// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts -// 2. 
Defining predicate-format functions to be -// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). -// -// For example, if you define IsEven predicate: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) -// will print the message -// -// Value of: IsEven(Fib(5)) -// Actual: false (5 is odd) -// Expected: true -// -// instead of a more opaque -// -// Value of: IsEven(Fib(5)) -// Actual: false -// Expected: true -// -// in case IsEven is a simple Boolean predicate. -// -// If you expect your predicate to be reused and want to support informative -// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up -// about half as often as positive ones in our tests), supply messages for -// both success and failure cases: -// -// testing::AssertionResult IsEven(int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess() << n << " is even"; -// else -// return testing::AssertionFailure() << n << " is odd"; -// } -// -// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print -// -// Value of: IsEven(Fib(6)) -// Actual: true (8 is even) -// Expected: false -// -// NB: Predicates that support negative Boolean assertions have reduced -// performance in positive ones so be careful not to use them in tests -// that have lots (tens of thousands) of positive Boolean assertions. -// -// To use this class with EXPECT_PRED_FORMAT assertions such as: -// -// // Verifies that Foo() returns an even number. -// EXPECT_PRED_FORMAT1(IsEven, Foo()); -// -// you need to define: -// -// testing::AssertionResult IsEven(const char* expr, int n) { -// if ((n % 2) == 0) -// return testing::AssertionSuccess(); -// else -// return testing::AssertionFailure() -// << "Expected: " << expr << " is even\n Actual: it's " << n; -// } -// -// If Foo() returns 5, you will see the following message: -// -// Expected: Foo() is even -// Actual: it's 5 -// -class GTEST_API_ AssertionResult { - public: - // Copy constructor. - // Used in EXPECT_TRUE/FALSE(assertion_result). - AssertionResult(const AssertionResult& other); - // Used in the EXPECT_TRUE/FALSE(bool_expression). - explicit AssertionResult(bool success) : success_(success) {} - - // Returns true iff the assertion succeeded. - operator bool() const { return success_; } // NOLINT - - // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. - AssertionResult operator!() const; - - // Returns the text streamed into this AssertionResult. Test assertions - // use it when they fail (i.e., the predicate's outcome doesn't match the - // assertion's expectation). When nothing has been streamed into the - // object, returns an empty string. - const char* message() const { - return message_.get() != NULL ? message_->c_str() : ""; - } - // TODO(vladl@google.com): Remove this after making sure no clients use it. - // Deprecated; please use message() instead. - const char* failure_message() const { return message(); } - - // Streams a custom failure message into this object. - template AssertionResult& operator<<(const T& value) { - AppendMessage(Message() << value); - return *this; - } - - // Allows streaming basic output manipulators such as endl or flush into - // this object. 
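// A small sketch of the streaming interface described above, used from a
// helper that returns AssertionResult; IsInRange is an illustrative name,
// not part of Google Test:
//
//   testing::AssertionResult IsInRange(int value, int lo, int hi) {
//     if (value < lo || value > hi)
//       return testing::AssertionFailure()
//           << value << " is outside [" << lo << ", " << hi << "]";
//     return testing::AssertionSuccess() << value << " is in range";
//   }
//
//   TEST(RangeTest, Basic) {
//     // On failure, the streamed text above appears as the "Actual" line.
//     EXPECT_TRUE(IsInRange(42, 0, 41));
//   }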
- AssertionResult& operator<<( - ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { - AppendMessage(Message() << basic_manipulator); - return *this; - } - - private: - // Appends the contents of message to message_. - void AppendMessage(const Message& a_message) { - if (message_.get() == NULL) - message_.reset(new ::std::string); - message_->append(a_message.GetString().c_str()); - } - - // Stores result of the assertion predicate. - bool success_; - // Stores the message describing the condition in case the expectation - // construct is not satisfied with the predicate's outcome. - // Referenced via a pointer to avoid taking too much stack frame space - // with test assertions. - internal::scoped_ptr< ::std::string> message_; - - GTEST_DISALLOW_ASSIGN_(AssertionResult); -}; - -// Makes a successful assertion result. -GTEST_API_ AssertionResult AssertionSuccess(); - -// Makes a failed assertion result. -GTEST_API_ AssertionResult AssertionFailure(); - -// Makes a failed assertion result with the given failure message. -// Deprecated; use AssertionFailure() << msg. -GTEST_API_ AssertionResult AssertionFailure(const Message& msg); - -// The abstract class that all tests inherit from. -// -// In Google Test, a unit test program contains one or many TestCases, and -// each TestCase contains one or many Tests. -// -// When you define a test using the TEST macro, you don't need to -// explicitly derive from Test - the TEST macro automatically does -// this for you. -// -// The only time you derive from Test is when defining a test fixture -// to be used a TEST_F. For example: -// -// class FooTest : public testing::Test { -// protected: -// virtual void SetUp() { ... } -// virtual void TearDown() { ... } -// ... -// }; -// -// TEST_F(FooTest, Bar) { ... } -// TEST_F(FooTest, Baz) { ... } -// -// Test is not copyable. -class GTEST_API_ Test { - public: - friend class TestInfo; - - // Defines types for pointers to functions that set up and tear down - // a test case. - typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc; - typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc; - - // The d'tor is virtual as we intend to inherit from Test. - virtual ~Test(); - - // Sets up the stuff shared by all tests in this test case. - // - // Google Test will call Foo::SetUpTestCase() before running the first - // test in test case Foo. Hence a sub-class can define its own - // SetUpTestCase() method to shadow the one defined in the super - // class. - static void SetUpTestCase() {} - - // Tears down the stuff shared by all tests in this test case. - // - // Google Test will call Foo::TearDownTestCase() after running the last - // test in test case Foo. Hence a sub-class can define its own - // TearDownTestCase() method to shadow the one defined in the super - // class. - static void TearDownTestCase() {} - - // Returns true iff the current test has a fatal failure. - static bool HasFatalFailure(); - - // Returns true iff the current test has a non-fatal failure. - static bool HasNonfatalFailure(); - - // Returns true iff the current test has a (either fatal or - // non-fatal) failure. - static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } - - // Logs a property for the current test. Only the last value for a given - // key is remembered. - // These are public static so they can be called from utility functions - // that are not members of the test fixture. 
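// A brief sketch of the per-test-case hooks documented above, sharing one
// expensive resource across every test in a case; QueryTest, Database and
// connection_ are illustrative names:
//
//   class QueryTest : public testing::Test {
//    protected:
//     // Run once before the first and once after the last test in the case.
//     static void SetUpTestCase()    { connection_ = new Database; }
//     static void TearDownTestCase() { delete connection_; connection_ = NULL; }
//
//     static Database* connection_;  // shared by all TEST_F(QueryTest, ...)
//   };
//   Database* QueryTest::connection_ = NULL;
//
//   TEST_F(QueryTest, UsesSharedConnection) {
//     ASSERT_TRUE(connection_ != NULL);
//   }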
- // The arguments are const char* instead strings, as Google Test is used - // on platforms where string doesn't compile. - // - // Note that a driving consideration for these RecordProperty methods - // was to produce xml output suited to the Greenspan charting utility, - // which at present will only chart values that fit in a 32-bit int. It - // is the user's responsibility to restrict their values to 32-bit ints - // if they intend them to be used with Greenspan. - static void RecordProperty(const char* key, const char* value); - static void RecordProperty(const char* key, int value); - - protected: - // Creates a Test object. - Test(); - - // Sets up the test fixture. - virtual void SetUp(); - - // Tears down the test fixture. - virtual void TearDown(); - - private: - // Returns true iff the current test has the same fixture class as - // the first test in the current test case. - static bool HasSameFixtureClass(); - - // Runs the test after the test fixture has been set up. - // - // A sub-class must implement this to define the test logic. - // - // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. - // Instead, use the TEST or TEST_F macro. - virtual void TestBody() = 0; - - // Sets up, executes, and tears down the test. - void Run(); - - // Deletes self. We deliberately pick an unusual name for this - // internal method to avoid clashing with names used in user TESTs. - void DeleteSelf_() { delete this; } - - // Uses a GTestFlagSaver to save and restore all Google Test flags. - const internal::GTestFlagSaver* const gtest_flag_saver_; - - // Often a user mis-spells SetUp() as Setup() and spends a long time - // wondering why it is never called by Google Test. The declaration of - // the following method is solely for catching such an error at - // compile time: - // - // - The return type is deliberately chosen to be not void, so it - // will be a conflict if a user declares void Setup() in his test - // fixture. - // - // - This method is private, so it will be another compiler error - // if a user calls it from his test fixture. - // - // DO NOT OVERRIDE THIS FUNCTION. - // - // If you see an error about overriding the following function or - // about it being private, you have mis-spelled SetUp() as Setup(). - struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } - - // We disallow copying Tests. - GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); -}; - -typedef internal::TimeInMillis TimeInMillis; - -// A copyable object representing a user specified test property which can be -// output as a key/value string pair. -// -// Don't inherit from TestProperty as its destructor is not virtual. -class TestProperty { - public: - // C'tor. TestProperty does NOT have a default constructor. - // Always use this constructor (with parameters) to create a - // TestProperty object. - TestProperty(const char* a_key, const char* a_value) : - key_(a_key), value_(a_value) { - } - - // Gets the user supplied key. - const char* key() const { - return key_.c_str(); - } - - // Gets the user supplied value. - const char* value() const { - return value_.c_str(); - } - - // Sets a new value, overriding the one supplied in the constructor. - void SetValue(const char* new_value) { - value_ = new_value; - } - - private: - // The key supplied by the user. - internal::String key_; - // The value supplied by the user. - internal::String value_; -}; - -// The result of a single Test. 
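// A small sketch of Test::RecordProperty() in use inside a test body; the
// keys and values below are illustrative:
//
//   TEST(WidgetTest, Stress) {
//     // Both overloads declared above; only the last value per key is kept,
//     // and the pairs end up as attributes in the XML report.
//     RecordProperty("MaxWidgets", 12);
//     RecordProperty("Owner", "widget-team");
//   }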
This includes a list of -// TestPartResults, a list of TestProperties, a count of how many -// death tests there are in the Test, and how much time it took to run -// the Test. -// -// TestResult is not copyable. -class GTEST_API_ TestResult { - public: - // Creates an empty TestResult. - TestResult(); - - // D'tor. Do not inherit from TestResult. - ~TestResult(); - - // Gets the number of all test parts. This is the sum of the number - // of successful test parts and the number of failed test parts. - int total_part_count() const; - - // Returns the number of the test properties. - int test_property_count() const; - - // Returns true iff the test passed (i.e. no test part failed). - bool Passed() const { return !Failed(); } - - // Returns true iff the test failed. - bool Failed() const; - - // Returns true iff the test fatally failed. - bool HasFatalFailure() const; - - // Returns true iff the test has a non-fatal failure. - bool HasNonfatalFailure() const; - - // Returns the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns the i-th test part result among all the results. i can range - // from 0 to test_property_count() - 1. If i is not in that range, aborts - // the program. - const TestPartResult& GetTestPartResult(int i) const; - - // Returns the i-th test property. i can range from 0 to - // test_property_count() - 1. If i is not in that range, aborts the - // program. - const TestProperty& GetTestProperty(int i) const; - - private: - friend class TestInfo; - friend class UnitTest; - friend class internal::DefaultGlobalTestPartResultReporter; - friend class internal::ExecDeathTest; - friend class internal::TestResultAccessor; - friend class internal::UnitTestImpl; - friend class internal::WindowsDeathTest; - - // Gets the vector of TestPartResults. - const std::vector& test_part_results() const { - return test_part_results_; - } - - // Gets the vector of TestProperties. - const std::vector& test_properties() const { - return test_properties_; - } - - // Sets the elapsed time. - void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } - - // Adds a test property to the list. The property is validated and may add - // a non-fatal failure if invalid (e.g., if it conflicts with reserved - // key names). If a property is already recorded for the same key, the - // value will be updated, rather than storing multiple values for the same - // key. - void RecordProperty(const TestProperty& test_property); - - // Adds a failure if the key is a reserved attribute of Google Test - // testcase tags. Returns true if the property is valid. - // TODO(russr): Validate attribute names are legal and human readable. - static bool ValidateTestProperty(const TestProperty& test_property); - - // Adds a test part result to the list. - void AddTestPartResult(const TestPartResult& test_part_result); - - // Returns the death test count. - int death_test_count() const { return death_test_count_; } - - // Increments the death test count, returning the new count. - int increment_death_test_count() { return ++death_test_count_; } - - // Clears the test part results. - void ClearTestPartResults(); - - // Clears the object. - void Clear(); - - // Protects mutable state of the property vector and of owned - // properties, whose values may be updated. 
- internal::Mutex test_properites_mutex_; - - // The vector of TestPartResults - std::vector test_part_results_; - // The vector of TestProperties - std::vector test_properties_; - // Running count of death tests. - int death_test_count_; - // The elapsed time, in milliseconds. - TimeInMillis elapsed_time_; - - // We disallow copying TestResult. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); -}; // class TestResult - -// A TestInfo object stores the following information about a test: -// -// Test case name -// Test name -// Whether the test should be run -// A function pointer that creates the test object when invoked -// Test result -// -// The constructor of TestInfo registers itself with the UnitTest -// singleton such that the RUN_ALL_TESTS() macro knows which tests to -// run. -class GTEST_API_ TestInfo { - public: - // Destructs a TestInfo object. This function is not virtual, so - // don't inherit from TestInfo. - ~TestInfo(); - - // Returns the test case name. - const char* test_case_name() const { return test_case_name_.c_str(); } - - // Returns the test name. - const char* name() const { return name_.c_str(); } - - // Returns the name of the parameter type, or NULL if this is not a typed - // or a type-parameterized test. - const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; - } - - // Returns the text representation of the value parameter, or NULL if this - // is not a value-parameterized test. - const char* value_param() const { - if (value_param_.get() != NULL) - return value_param_->c_str(); - return NULL; - } - - // Returns true if this test should run, that is if the test is not disabled - // (or it is disabled but the also_run_disabled_tests flag has been specified) - // and its full name matches the user-specified filter. - // - // Google Test allows the user to filter the tests by their full names. - // The full name of a test Bar in test case Foo is defined as - // "Foo.Bar". Only the tests that match the filter will run. - // - // A filter is a colon-separated list of glob (not regex) patterns, - // optionally followed by a '-' and a colon-separated list of - // negative patterns (tests to exclude). A test is run if it - // matches one of the positive patterns and does not match any of - // the negative patterns. - // - // For example, *A*:Foo.* is a filter that matches any string that - // contains the character 'A' or starts with "Foo.". - bool should_run() const { return should_run_; } - - // Returns the result of the test. - const TestResult* result() const { return &result_; } - - private: - -#if GTEST_HAS_DEATH_TEST - friend class internal::DefaultDeathTestFactory; -#endif // GTEST_HAS_DEATH_TEST - friend class Test; - friend class TestCase; - friend class internal::UnitTestImpl; - friend TestInfo* internal::MakeAndRegisterTestInfo( - const char* test_case_name, const char* name, - const char* type_param, - const char* value_param, - internal::TypeId fixture_class_id, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc, - internal::TestFactoryBase* factory); - - // Constructs a TestInfo object. The newly constructed instance assumes - // ownership of the factory object. - TestInfo(const char* test_case_name, const char* name, - const char* a_type_param, - const char* a_value_param, - internal::TypeId fixture_class_id, - internal::TestFactoryBase* factory); - - // Increments the number of death tests encountered in this test so - // far. 
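// A small sketch that reads the accessors above at run time through the
// UnitTest singleton declared later in this header; LogTestStart() and
// LoggedTest are illustrative names:
//
//   #include <cstdio>
//
//   void LogTestStart() {
//     const testing::TestInfo* const info =
//         testing::UnitTest::GetInstance()->current_test_info();
//     std::printf("running %s.%s\n", info->test_case_name(), info->name());
//   }
//
//   class LoggedTest : public testing::Test {
//    protected:
//     virtual void SetUp() { LogTestStart(); }
//   };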
- int increment_death_test_count() { - return result_.increment_death_test_count(); - } - - // Creates the test object, runs it, records its result, and then - // deletes it. - void Run(); - - static void ClearTestResult(TestInfo* test_info) { - test_info->result_.Clear(); - } - - // These fields are immutable properties of the test. - const std::string test_case_name_; // Test case name - const std::string name_; // Test name - // Name of the parameter type, or NULL if this is not a typed or a - // type-parameterized test. - const internal::scoped_ptr type_param_; - // Text representation of the value parameter, or NULL if this is not a - // value-parameterized test. - const internal::scoped_ptr value_param_; - const internal::TypeId fixture_class_id_; // ID of the test fixture class - bool should_run_; // True iff this test should run - bool is_disabled_; // True iff this test is disabled - bool matches_filter_; // True if this test matches the - // user-specified filter. - internal::TestFactoryBase* const factory_; // The factory that creates - // the test object - - // This field is mutable and needs to be reset before running the - // test for the second time. - TestResult result_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); -}; - -// A test case, which consists of a vector of TestInfos. -// -// TestCase is not copyable. -class GTEST_API_ TestCase { - public: - // Creates a TestCase with the given name. - // - // TestCase does NOT have a default constructor. Always use this - // constructor to create a TestCase object. - // - // Arguments: - // - // name: name of the test case - // a_type_param: the name of the test's type parameter, or NULL if - // this is not a type-parameterized test. - // set_up_tc: pointer to the function that sets up the test case - // tear_down_tc: pointer to the function that tears down the test case - TestCase(const char* name, const char* a_type_param, - Test::SetUpTestCaseFunc set_up_tc, - Test::TearDownTestCaseFunc tear_down_tc); - - // Destructor of TestCase. - virtual ~TestCase(); - - // Gets the name of the TestCase. - const char* name() const { return name_.c_str(); } - - // Returns the name of the parameter type, or NULL if this is not a - // type-parameterized test case. - const char* type_param() const { - if (type_param_.get() != NULL) - return type_param_->c_str(); - return NULL; - } - - // Returns true if any test in this test case should run. - bool should_run() const { return should_run_; } - - // Gets the number of successful tests in this test case. - int successful_test_count() const; - - // Gets the number of failed tests in this test case. - int failed_test_count() const; - - // Gets the number of disabled tests in this test case. - int disabled_test_count() const; - - // Get the number of tests in this test case that should run. - int test_to_run_count() const; - - // Gets the number of all tests in this test case. - int total_test_count() const; - - // Returns true iff the test case passed. - bool Passed() const { return !Failed(); } - - // Returns true iff the test case failed. - bool Failed() const { return failed_test_count() > 0; } - - // Returns the elapsed time, in milliseconds. - TimeInMillis elapsed_time() const { return elapsed_time_; } - - // Returns the i-th test among all the tests. i can range from 0 to - // total_test_count() - 1. If i is not in that range, returns NULL. 
- const TestInfo* GetTestInfo(int i) const; - - private: - friend class Test; - friend class internal::UnitTestImpl; - - // Gets the (mutable) vector of TestInfos in this TestCase. - std::vector& test_info_list() { return test_info_list_; } - - // Gets the (immutable) vector of TestInfos in this TestCase. - const std::vector& test_info_list() const { - return test_info_list_; - } - - // Returns the i-th test among all the tests. i can range from 0 to - // total_test_count() - 1. If i is not in that range, returns NULL. - TestInfo* GetMutableTestInfo(int i); - - // Sets the should_run member. - void set_should_run(bool should) { should_run_ = should; } - - // Adds a TestInfo to this test case. Will delete the TestInfo upon - // destruction of the TestCase object. - void AddTestInfo(TestInfo * test_info); - - // Clears the results of all tests in this test case. - void ClearResult(); - - // Clears the results of all tests in the given test case. - static void ClearTestCaseResult(TestCase* test_case) { - test_case->ClearResult(); - } - - // Runs every test in this TestCase. - void Run(); - - // Runs SetUpTestCase() for this TestCase. This wrapper is needed - // for catching exceptions thrown from SetUpTestCase(). - void RunSetUpTestCase() { (*set_up_tc_)(); } - - // Runs TearDownTestCase() for this TestCase. This wrapper is - // needed for catching exceptions thrown from TearDownTestCase(). - void RunTearDownTestCase() { (*tear_down_tc_)(); } - - // Returns true iff test passed. - static bool TestPassed(const TestInfo* test_info) { - return test_info->should_run() && test_info->result()->Passed(); - } - - // Returns true iff test failed. - static bool TestFailed(const TestInfo* test_info) { - return test_info->should_run() && test_info->result()->Failed(); - } - - // Returns true iff test is disabled. - static bool TestDisabled(const TestInfo* test_info) { - return test_info->is_disabled_; - } - - // Returns true if the given test should run. - static bool ShouldRunTest(const TestInfo* test_info) { - return test_info->should_run(); - } - - // Shuffles the tests in this test case. - void ShuffleTests(internal::Random* random); - - // Restores the test order to before the first shuffle. - void UnshuffleTests(); - - // Name of the test case. - internal::String name_; - // Name of the parameter type, or NULL if this is not a typed or a - // type-parameterized test. - const internal::scoped_ptr type_param_; - // The vector of TestInfos in their original order. It owns the - // elements in the vector. - std::vector test_info_list_; - // Provides a level of indirection for the test list to allow easy - // shuffling and restoring the test order. The i-th element in this - // vector is the index of the i-th test in the shuffled test list. - std::vector test_indices_; - // Pointer to the function that sets up the test case. - Test::SetUpTestCaseFunc set_up_tc_; - // Pointer to the function that tears down the test case. - Test::TearDownTestCaseFunc tear_down_tc_; - // True iff any test in this test case should run. - bool should_run_; - // Elapsed time, in milliseconds. - TimeInMillis elapsed_time_; - - // We disallow copying TestCases. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase); -}; - -// An Environment object is capable of setting up and tearing down an -// environment. The user should subclass this to define his own -// environment(s). 
-// -// An Environment object does the set-up and tear-down in virtual -// methods SetUp() and TearDown() instead of the constructor and the -// destructor, as: -// -// 1. You cannot safely throw from a destructor. This is a problem -// as in some cases Google Test is used where exceptions are enabled, and -// we may want to implement ASSERT_* using exceptions where they are -// available. -// 2. You cannot use ASSERT_* directly in a constructor or -// destructor. -class Environment { - public: - // The d'tor is virtual as we need to subclass Environment. - virtual ~Environment() {} - - // Override this to define how to set up the environment. - virtual void SetUp() {} - - // Override this to define how to tear down the environment. - virtual void TearDown() {} - private: - // If you see an error about overriding the following function or - // about it being private, you have mis-spelled SetUp() as Setup(). - struct Setup_should_be_spelled_SetUp {}; - virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } -}; - -// The interface for tracing execution of tests. The methods are organized in -// the order the corresponding events are fired. -class TestEventListener { - public: - virtual ~TestEventListener() {} - - // Fired before any test activity starts. - virtual void OnTestProgramStart(const UnitTest& unit_test) = 0; - - // Fired before each iteration of tests starts. There may be more than - // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration - // index, starting from 0. - virtual void OnTestIterationStart(const UnitTest& unit_test, - int iteration) = 0; - - // Fired before environment set-up for each iteration of tests starts. - virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0; - - // Fired after environment set-up for each iteration of tests ends. - virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; - - // Fired before the test case starts. - virtual void OnTestCaseStart(const TestCase& test_case) = 0; - - // Fired before the test starts. - virtual void OnTestStart(const TestInfo& test_info) = 0; - - // Fired after a failed assertion or a SUCCEED() invocation. - virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; - - // Fired after the test ends. - virtual void OnTestEnd(const TestInfo& test_info) = 0; - - // Fired after the test case ends. - virtual void OnTestCaseEnd(const TestCase& test_case) = 0; - - // Fired before environment tear-down for each iteration of tests starts. - virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; - - // Fired after environment tear-down for each iteration of tests ends. - virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; - - // Fired after each iteration of tests finishes. - virtual void OnTestIterationEnd(const UnitTest& unit_test, - int iteration) = 0; - - // Fired after all test activities have ended. - virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; -}; - -// The convenience class for users who need to override just one or two -// methods and are not concerned that a possible change to a signature of -// the methods they override will not be caught during the build. For -// comments about each method please see the definition of TestEventListener -// above. 
-class EmptyTestEventListener : public TestEventListener { - public: - virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} - virtual void OnTestStart(const TestInfo& /*test_info*/) {} - virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} - virtual void OnTestEnd(const TestInfo& /*test_info*/) {} - virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} - virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} - virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} - virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, - int /*iteration*/) {} - virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} -}; - -// TestEventListeners lets users add listeners to track events in Google Test. -class GTEST_API_ TestEventListeners { - public: - TestEventListeners(); - ~TestEventListeners(); - - // Appends an event listener to the end of the list. Google Test assumes - // the ownership of the listener (i.e. it will delete the listener when - // the test program finishes). - void Append(TestEventListener* listener); - - // Removes the given event listener from the list and returns it. It then - // becomes the caller's responsibility to delete the listener. Returns - // NULL if the listener is not found in the list. - TestEventListener* Release(TestEventListener* listener); - - // Returns the standard listener responsible for the default console - // output. Can be removed from the listeners list to shut down default - // console output. Note that removing this object from the listener list - // with Release transfers its ownership to the caller and makes this - // function return NULL the next time. - TestEventListener* default_result_printer() const { - return default_result_printer_; - } - - // Returns the standard listener responsible for the default XML output - // controlled by the --gtest_output=xml flag. Can be removed from the - // listeners list by users who want to shut down the default XML output - // controlled by this flag and substitute it with custom one. Note that - // removing this object from the listener list with Release transfers its - // ownership to the caller and makes this function return NULL the next - // time. - TestEventListener* default_xml_generator() const { - return default_xml_generator_; - } - - private: - friend class TestCase; - friend class TestInfo; - friend class internal::DefaultGlobalTestPartResultReporter; - friend class internal::NoExecDeathTest; - friend class internal::TestEventListenersAccessor; - friend class internal::UnitTestImpl; - - // Returns repeater that broadcasts the TestEventListener events to all - // subscribers. - TestEventListener* repeater(); - - // Sets the default_result_printer attribute to the provided listener. - // The listener is also added to the listener list and previous - // default_result_printer is removed from it and deleted. The listener can - // also be NULL in which case it will not be added to the list. Does - // nothing if the previous and the current listener objects are the same. - void SetDefaultResultPrinter(TestEventListener* listener); - - // Sets the default_xml_generator attribute to the provided listener. 
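// A compact sketch of installing a custom listener through the
// TestEventListeners API above; TersePrinter is an illustrative name, and
// removing the default printer is optional:
//
//   #include <cstdio>
//
//   class TersePrinter : public testing::EmptyTestEventListener {
//     virtual void OnTestEnd(const testing::TestInfo& test_info) {
//       std::printf("%s.%s %s\n", test_info.test_case_name(), test_info.name(),
//                   test_info.result()->Passed() ? "OK" : "FAILED");
//     }
//   };
//
//   int main(int argc, char** argv) {
//     testing::InitGoogleTest(&argc, argv);
//     testing::TestEventListeners& listeners =
//         testing::UnitTest::GetInstance()->listeners();
//     delete listeners.Release(listeners.default_result_printer());
//     listeners.Append(new TersePrinter);  // Google Test now owns it.
//     return RUN_ALL_TESTS();
//   }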
The - // listener is also added to the listener list and previous - // default_xml_generator is removed from it and deleted. The listener can - // also be NULL in which case it will not be added to the list. Does - // nothing if the previous and the current listener objects are the same. - void SetDefaultXmlGenerator(TestEventListener* listener); - - // Controls whether events will be forwarded by the repeater to the - // listeners in the list. - bool EventForwardingEnabled() const; - void SuppressEventForwarding(); - - // The actual list of listeners. - internal::TestEventRepeater* repeater_; - // Listener responsible for the standard result output. - TestEventListener* default_result_printer_; - // Listener responsible for the creation of the XML output file. - TestEventListener* default_xml_generator_; - - // We disallow copying TestEventListeners. - GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); -}; - -// A UnitTest consists of a vector of TestCases. -// -// This is a singleton class. The only instance of UnitTest is -// created when UnitTest::GetInstance() is first called. This -// instance is never deleted. -// -// UnitTest is not copyable. -// -// This class is thread-safe as long as the methods are called -// according to their specification. -class GTEST_API_ UnitTest { - public: - // Gets the singleton UnitTest object. The first time this method - // is called, a UnitTest object is constructed and returned. - // Consecutive calls will return the same object. - static UnitTest* GetInstance(); - - // Runs all tests in this UnitTest object and prints the result. - // Returns 0 if successful, or 1 otherwise. - // - // This method can only be called from the main thread. - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - int Run() GTEST_MUST_USE_RESULT_; - - // Returns the working directory when the first TEST() or TEST_F() - // was executed. The UnitTest object owns the string. - const char* original_working_dir() const; - - // Returns the TestCase object for the test that's currently running, - // or NULL if no test is running. - const TestCase* current_test_case() const; - - // Returns the TestInfo object for the test that's currently running, - // or NULL if no test is running. - const TestInfo* current_test_info() const; - - // Returns the random seed used at the start of the current test run. - int random_seed() const; - -#if GTEST_HAS_PARAM_TEST - // Returns the ParameterizedTestCaseRegistry object used to keep track of - // value-parameterized tests and instantiate and register them. - // - // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - internal::ParameterizedTestCaseRegistry& parameterized_test_registry(); -#endif // GTEST_HAS_PARAM_TEST - - // Gets the number of successful test cases. - int successful_test_case_count() const; - - // Gets the number of failed test cases. - int failed_test_case_count() const; - - // Gets the number of all test cases. - int total_test_case_count() const; - - // Gets the number of all test cases that contain at least one test - // that should run. - int test_case_to_run_count() const; - - // Gets the number of successful tests. - int successful_test_count() const; - - // Gets the number of failed tests. - int failed_test_count() const; - - // Gets the number of disabled tests. - int disabled_test_count() const; - - // Gets the number of all tests. - int total_test_count() const; - - // Gets the number of tests that should run. - int test_to_run_count() const; - - // Gets the elapsed time, in milliseconds. 
- TimeInMillis elapsed_time() const; - - // Returns true iff the unit test passed (i.e. all test cases passed). - bool Passed() const; - - // Returns true iff the unit test failed (i.e. some test case failed - // or something outside of all tests failed). - bool Failed() const; - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - const TestCase* GetTestCase(int i) const; - - // Returns the list of event listeners that can be used to track events - // inside Google Test. - TestEventListeners& listeners(); - - private: - // Registers and returns a global test environment. When a test - // program is run, all global test environments will be set-up in - // the order they were registered. After all tests in the program - // have finished, all global test environments will be torn-down in - // the *reverse* order they were registered. - // - // The UnitTest object takes ownership of the given environment. - // - // This method can only be called from the main thread. - Environment* AddEnvironment(Environment* env); - - // Adds a TestPartResult to the current TestResult object. All - // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) - // eventually call this to report their results. The user code - // should use the assertion macros instead of calling this directly. - void AddTestPartResult(TestPartResult::Type result_type, - const char* file_name, - int line_number, - const internal::String& message, - const internal::String& os_stack_trace); - - // Adds a TestProperty to the current TestResult object. If the result already - // contains a property with the same key, the value will be updated. - void RecordPropertyForCurrentTest(const char* key, const char* value); - - // Gets the i-th test case among all the test cases. i can range from 0 to - // total_test_case_count() - 1. If i is not in that range, returns NULL. - TestCase* GetMutableTestCase(int i); - - // Accessors for the implementation object. - internal::UnitTestImpl* impl() { return impl_; } - const internal::UnitTestImpl* impl() const { return impl_; } - - // These classes and funcions are friends as they need to access private - // members of UnitTest. - friend class Test; - friend class internal::AssertHelper; - friend class internal::ScopedTrace; - friend Environment* AddGlobalTestEnvironment(Environment* env); - friend internal::UnitTestImpl* internal::GetUnitTestImpl(); - friend void internal::ReportFailureInUnknownLocation( - TestPartResult::Type result_type, - const internal::String& message); - - // Creates an empty UnitTest. - UnitTest(); - - // D'tor - virtual ~UnitTest(); - - // Pushes a trace defined by SCOPED_TRACE() on to the per-thread - // Google Test trace stack. - void PushGTestTrace(const internal::TraceInfo& trace); - - // Pops a trace from the per-thread Google Test trace stack. - void PopGTestTrace(); - - // Protects mutable state in *impl_. This is mutable as some const - // methods need to lock it too. - mutable internal::Mutex mutex_; - - // Opaque implementation object. This field is never changed once - // the object is constructed. We don't mark it as const here, as - // doing so will cause a warning in the constructor of UnitTest. - // Mutable state in *impl_ is protected by mutex_. - internal::UnitTestImpl* impl_; - - // We disallow copying UnitTest. - GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); -}; - -// A convenient wrapper for adding an environment for the test -// program. 
-// -// You should call this before RUN_ALL_TESTS() is called, probably in -// main(). If you use gtest_main, you need to call this before main() -// starts for it to take effect. For example, you can define a global -// variable like this: -// -// testing::Environment* const foo_env = -// testing::AddGlobalTestEnvironment(new FooEnvironment); -// -// However, we strongly recommend you to write your own main() and -// call AddGlobalTestEnvironment() there, as relying on initialization -// of global variables makes the code harder to read and may cause -// problems when you register multiple environments from different -// translation units and the environments have dependencies among them -// (remember that the compiler doesn't guarantee the order in which -// global variables from different translation units are initialized). -inline Environment* AddGlobalTestEnvironment(Environment* env) { - return UnitTest::GetInstance()->AddEnvironment(env); -} - -// Initializes Google Test. This must be called before calling -// RUN_ALL_TESTS(). In particular, it parses a command line for the -// flags that Google Test recognizes. Whenever a Google Test flag is -// seen, it is removed from argv, and *argc is decremented. -// -// No value is returned. Instead, the Google Test flag variables are -// updated. -// -// Calling the function for the second time has no user-visible effect. -GTEST_API_ void InitGoogleTest(int* argc, char** argv); - -// This overloaded version can be used in Windows programs compiled in -// UNICODE mode. -GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); - -namespace internal { - -// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) -// operand to be used in a failure message. The type (but not value) -// of the other operand may affect the format. This allows us to -// print a char* as a raw pointer when it is compared against another -// char*, and print it as a C string when it is compared against an -// std::string object, for example. -// -// The default implementation ignores the type of the other operand. -// Some specialized versions are used to handle formatting wide or -// narrow C strings. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -template -String FormatForComparisonFailureMessage(const T1& value, - const T2& /* other_operand */) { - // C++Builder compiles this incorrectly if the namespace isn't explicitly - // given. - return ::testing::PrintToString(value); -} - -// The helper function for {ASSERT|EXPECT}_EQ. -template -AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { -#ifdef _MSC_VER -# pragma warning(push) // Saves the current warning state. -# pragma warning(disable:4389) // Temporarily disables warning on - // signed/unsigned mismatch. -#endif - - if (expected == actual) { - return AssertionSuccess(); - } - -#ifdef _MSC_VER -# pragma warning(pop) // Restores the warning state. -#endif - - return EqFailure(expected_expression, - actual_expression, - FormatForComparisonFailureMessage(expected, actual), - FormatForComparisonFailureMessage(actual, expected), - false); -} - -// With this overloaded version, we allow anonymous enums to be used -// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums -// can be implicitly cast to BiggestInt. 
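// A brief sketch that fills in the FooEnvironment subclass assumed by the
// comment above and registers it from a hand-written main():
//
//   class FooEnvironment : public testing::Environment {
//    public:
//     virtual void SetUp()    { /* acquire process-wide resources */ }
//     virtual void TearDown() { /* release them; runs in reverse order */ }
//   };
//
//   int main(int argc, char** argv) {
//     testing::InitGoogleTest(&argc, argv);
//     // Ownership passes to Google Test; do not delete the environment.
//     testing::AddGlobalTestEnvironment(new FooEnvironment);
//     return RUN_ALL_TESTS();
//   }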
-GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual); - -// The helper class for {ASSERT|EXPECT}_EQ. The template argument -// lhs_is_null_literal is true iff the first argument to ASSERT_EQ() -// is a null pointer literal. The following default implementation is -// for lhs_is_null_literal being false. -template -class EqHelper { - public: - // This templatized version is for the general case. - template - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); - } - - // With this overloaded version, we allow anonymous enums to be used - // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous - // enums can be implicitly cast to BiggestInt. - // - // Even though its body looks the same as the above version, we - // cannot merge the two, as it will make anonymous enums unhappy. - static AssertionResult Compare(const char* expected_expression, - const char* actual_expression, - BiggestInt expected, - BiggestInt actual) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); - } -}; - -// This specialization is used when the first argument to ASSERT_EQ() -// is a null pointer literal, like NULL, false, or 0. -template <> -class EqHelper { - public: - // We define two overloaded versions of Compare(). The first - // version will be picked when the second argument to ASSERT_EQ() is - // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or - // EXPECT_EQ(false, a_bool). - template - static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - const T1& expected, - const T2& actual, - // The following line prevents this overload from being considered if T2 - // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) - // expands to Compare("", "", NULL, my_ptr), which requires a conversion - // to match the Secret* in the other overload, which would otherwise make - // this template match better. - typename EnableIf::value>::type* = 0) { - return CmpHelperEQ(expected_expression, actual_expression, expected, - actual); - } - - // This version will be picked when the second argument to ASSERT_EQ() is a - // pointer, e.g. ASSERT_EQ(NULL, a_pointer). - template - static AssertionResult Compare( - const char* expected_expression, - const char* actual_expression, - // We used to have a second template parameter instead of Secret*. That - // template parameter would deduce to 'long', making this a better match - // than the first overload even without the first overload's EnableIf. - // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to - // non-pointer argument" (even a deduced integral argument), so the old - // implementation caused warnings in user code. - Secret* /* expected (NULL) */, - T* actual) { - // We already know that 'expected' is a null pointer. - return CmpHelperEQ(expected_expression, actual_expression, - static_cast(NULL), actual); - } -}; - -// A macro for implementing the helper functions needed to implement -// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste -// of similar code. -// -// For each templatized helper function, we also define an overloaded -// version for BiggestInt in order to reduce code bloat and allow -// anonymous enums to be used with {ASSERT|EXPECT}_?? 
when compiled -// with gcc 4. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ -template \ -AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ - const T1& val1, const T2& val2) {\ - if (val1 op val2) {\ - return AssertionSuccess();\ - } else {\ - return AssertionFailure() \ - << "Expected: (" << expr1 << ") " #op " (" << expr2\ - << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ - << " vs " << FormatForComparisonFailureMessage(val2, val1);\ - }\ -}\ -GTEST_API_ AssertionResult CmpHelper##op_name(\ - const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) - -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. - -// Implements the helper function for {ASSERT|EXPECT}_NE -GTEST_IMPL_CMP_HELPER_(NE, !=); -// Implements the helper function for {ASSERT|EXPECT}_LE -GTEST_IMPL_CMP_HELPER_(LE, <=); -// Implements the helper function for {ASSERT|EXPECT}_LT -GTEST_IMPL_CMP_HELPER_(LT, < ); -// Implements the helper function for {ASSERT|EXPECT}_GE -GTEST_IMPL_CMP_HELPER_(GE, >=); -// Implements the helper function for {ASSERT|EXPECT}_GT -GTEST_IMPL_CMP_HELPER_(GT, > ); - -#undef GTEST_IMPL_CMP_HELPER_ - -// The helper function for {ASSERT|EXPECT}_STREQ. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); - -// The helper function for {ASSERT|EXPECT}_STRCASEEQ. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, - const char* actual_expression, - const char* expected, - const char* actual); - -// The helper function for {ASSERT|EXPECT}_STRNE. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2); - -// The helper function for {ASSERT|EXPECT}_STRCASENE. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, - const char* s2_expression, - const char* s1, - const char* s2); - - -// Helper function for *_STREQ on wide strings. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, - const char* actual_expression, - const wchar_t* expected, - const wchar_t* actual); - -// Helper function for *_STRNE on wide strings. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, - const char* s2_expression, - const wchar_t* s1, - const wchar_t* s2); - -} // namespace internal - -// IsSubstring() and IsNotSubstring() are intended to be used as the -// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by -// themselves. They check whether needle is a substring of haystack -// (NULL is considered a substring of itself only), and return an -// appropriate error message when they fail. -// -// The {needle,haystack}_expr arguments are the stringified -// expressions that generated the two real arguments. 
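// A short sketch of how these helpers are normally reached from user code:
// EXPECT_STREQ/EXPECT_STRCASEEQ (defined further down in this header) call
// CmpHelperSTREQ/CmpHelperSTRCASEEQ, and IsSubstring/IsNotSubstring are
// written as predicate-formatters so they plug into EXPECT_PRED_FORMAT2:
//
//   TEST(StringTest, Helpers) {
//     const char* const s = "Hello, world!";
//     EXPECT_STREQ("Hello, world!", s);
//     EXPECT_STRCASEEQ("HELLO, WORLD!", s);
//     EXPECT_PRED_FORMAT2(testing::IsSubstring, "world", s);
//     EXPECT_PRED_FORMAT2(testing::IsNotSubstring, "goodbye", s);
//   }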
-GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const char* needle, const char* haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const wchar_t* needle, const wchar_t* haystack); -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::string& needle, const ::std::string& haystack); - -#if GTEST_HAS_STD_WSTRING -GTEST_API_ AssertionResult IsSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -GTEST_API_ AssertionResult IsNotSubstring( - const char* needle_expr, const char* haystack_expr, - const ::std::wstring& needle, const ::std::wstring& haystack); -#endif // GTEST_HAS_STD_WSTRING - -namespace internal { - -// Helper template function for comparing floating-points. -// -// Template parameter: -// -// RawType: the raw floating-point type (either float or double) -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -template -AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, - const char* actual_expression, - RawType expected, - RawType actual) { - const FloatingPoint lhs(expected), rhs(actual); - - if (lhs.AlmostEquals(rhs)) { - return AssertionSuccess(); - } - - ::std::stringstream expected_ss; - expected_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << expected; - - ::std::stringstream actual_ss; - actual_ss << std::setprecision(std::numeric_limits::digits10 + 2) - << actual; - - return EqFailure(expected_expression, - actual_expression, - StringStreamToString(&expected_ss), - StringStreamToString(&actual_ss), - false); -} - -// Helper function for implementing ASSERT_NEAR. -// -// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. -GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, - const char* expr2, - const char* abs_error_expr, - double val1, - double val2, - double abs_error); - -// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. -// A class that enables one to stream messages to assertion macros -class GTEST_API_ AssertHelper { - public: - // Constructor. - AssertHelper(TestPartResult::Type type, - const char* file, - int line, - const char* message); - ~AssertHelper(); - - // Message assignment is a semantic trick to enable assertion - // streaming; see the GTEST_MESSAGE_ macro below. - void operator=(const Message& message) const; - - private: - // We put our data in a struct so that the size of the AssertHelper class can - // be as small as possible. This is important because gcc is incapable of - // re-using stack space even for temporary variables, so every EXPECT_EQ - // reserves stack space for another AssertHelper. 
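// A small sketch of the floating-point support above as seen from user code:
// CmpHelperFloatingPointEQ() does a ULP-based almost-equal comparison and
// backs EXPECT_FLOAT_EQ/EXPECT_DOUBLE_EQ (defined further down), while
// DoubleNearPredFormat() backs EXPECT_NEAR with an explicit absolute error:
//
//   TEST(MathTest, FloatingPoint) {
//     const double x = 0.1 + 0.2;
//     // EXPECT_EQ(0.3, x) would fail; the almost-equal forms do not.
//     EXPECT_DOUBLE_EQ(0.3, x);
//     EXPECT_NEAR(0.3, x, 1e-9);   // |expected - actual| <= abs_error
//     EXPECT_FLOAT_EQ(1.0f, 3.0f / 3.0f);
//   }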
- struct AssertHelperData { - AssertHelperData(TestPartResult::Type t, - const char* srcfile, - int line_num, - const char* msg) - : type(t), file(srcfile), line(line_num), message(msg) { } - - TestPartResult::Type const type; - const char* const file; - int const line; - String const message; - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); - }; - - AssertHelperData* const data_; - - GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); -}; - -} // namespace internal - -#if GTEST_HAS_PARAM_TEST -// The pure interface class that all value-parameterized tests inherit from. -// A value-parameterized class must inherit from both ::testing::Test and -// ::testing::WithParamInterface. In most cases that just means inheriting -// from ::testing::TestWithParam, but more complicated test hierarchies -// may need to inherit from Test and WithParamInterface at different levels. -// -// This interface has support for accessing the test parameter value via -// the GetParam() method. -// -// Use it with one of the parameter generator defining functions, like Range(), -// Values(), ValuesIn(), Bool(), and Combine(). -// -// class FooTest : public ::testing::TestWithParam { -// protected: -// FooTest() { -// // Can use GetParam() here. -// } -// virtual ~FooTest() { -// // Can use GetParam() here. -// } -// virtual void SetUp() { -// // Can use GetParam() here. -// } -// virtual void TearDown { -// // Can use GetParam() here. -// } -// }; -// TEST_P(FooTest, DoesBar) { -// // Can use GetParam() method here. -// Foo foo; -// ASSERT_TRUE(foo.DoesBar(GetParam())); -// } -// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); - -template -class WithParamInterface { - public: - typedef T ParamType; - virtual ~WithParamInterface() {} - - // The current parameter value. Is also available in the test fixture's - // constructor. This member function is non-static, even though it only - // references static data, to reduce the opportunity for incorrect uses - // like writing 'WithParamInterface::GetParam()' for a test that - // uses a fixture whose parameter type is int. - const ParamType& GetParam() const { return *parameter_; } - - private: - // Sets parameter value. The caller is responsible for making sure the value - // remains alive and unchanged throughout the current test. - static void SetParam(const ParamType* parameter) { - parameter_ = parameter; - } - - // Static value used for accessing parameter during a test lifetime. - static const ParamType* parameter_; - - // TestClass must be a subclass of WithParamInterface and Test. - template friend class internal::ParameterizedTestFactory; -}; - -template -const T* WithParamInterface::parameter_ = NULL; - -// Most value-parameterized classes can ignore the existence of -// WithParamInterface, and can just inherit from ::testing::TestWithParam. - -template -class TestWithParam : public Test, public WithParamInterface { -}; - -#endif // GTEST_HAS_PARAM_TEST - -// Macros for indicating success/failure in test code. - -// ADD_FAILURE unconditionally adds a failure to the current test. -// SUCCEED generates a success - it doesn't automatically make the -// current test successful, as a test is only successful when it has -// no failure. -// -// EXPECT_* verifies that a certain condition is satisfied. If not, -// it behaves like ADD_FAILURE. In particular: -// -// EXPECT_TRUE verifies that a Boolean condition is true. -// EXPECT_FALSE verifies that a Boolean condition is false. 
-// -// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except -// that they will also abort the current function on failure. People -// usually want the fail-fast behavior of FAIL and ASSERT_*, but those -// writing data-driven tests often find themselves using ADD_FAILURE -// and EXPECT_* more. -// -// Examples: -// -// EXPECT_TRUE(server.StatusIsOK()); -// ASSERT_FALSE(server.HasPendingRequest(port)) -// << "There are still pending requests " << "on port " << port; - -// Generates a nonfatal failure with a generic message. -#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed") - -// Generates a nonfatal failure at the given source file location with -// a generic message. -#define ADD_FAILURE_AT(file, line) \ - GTEST_MESSAGE_AT_(file, line, "Failed", \ - ::testing::TestPartResult::kNonFatalFailure) - -// Generates a fatal failure with a generic message. -#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") - -// Define this macro to 1 to omit the definition of FAIL(), which is a -// generic name and clashes with some other libraries. -#if !GTEST_DONT_DEFINE_FAIL -# define FAIL() GTEST_FAIL() -#endif - -// Generates a success with a generic message. -#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded") - -// Define this macro to 1 to omit the definition of SUCCEED(), which -// is a generic name and clashes with some other libraries. -#if !GTEST_DONT_DEFINE_SUCCEED -# define SUCCEED() GTEST_SUCCEED() -#endif - -// Macros for testing exceptions. -// -// * {ASSERT|EXPECT}_THROW(statement, expected_exception): -// Tests that the statement throws the expected exception. -// * {ASSERT|EXPECT}_NO_THROW(statement): -// Tests that the statement doesn't throw any exception. -// * {ASSERT|EXPECT}_ANY_THROW(statement): -// Tests that the statement throws an exception. - -#define EXPECT_THROW(statement, expected_exception) \ - GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_) -#define EXPECT_NO_THROW(statement) \ - GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_) -#define EXPECT_ANY_THROW(statement) \ - GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_) -#define ASSERT_THROW(statement, expected_exception) \ - GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_) -#define ASSERT_NO_THROW(statement) \ - GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_) -#define ASSERT_ANY_THROW(statement) \ - GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_) - -// Boolean assertions. Condition can be either a Boolean expression or an -// AssertionResult. For more information on how to use AssertionResult with -// these macros see comments on that class. -#define EXPECT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_NONFATAL_FAILURE_) -#define EXPECT_FALSE(condition) \ - GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ - GTEST_NONFATAL_FAILURE_) -#define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ - GTEST_FATAL_FAILURE_) -#define ASSERT_FALSE(condition) \ - GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ - GTEST_FATAL_FAILURE_) - -// Includes the auto-generated header that implements a family of -// generic predicate assertion macros. -// Copyright 2006, Google Inc. -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// This file is AUTOMATICALLY GENERATED on 09/24/2010 by command -// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! -// -// Implements a family of generic predicate assertion macros. - -#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ - -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ - -// This header implements a family of generic predicate assertion -// macros: -// -// ASSERT_PRED_FORMAT1(pred_format, v1) -// ASSERT_PRED_FORMAT2(pred_format, v1, v2) -// ... -// -// where pred_format is a function or functor that takes n (in the -// case of ASSERT_PRED_FORMATn) values and their source expression -// text, and returns a testing::AssertionResult. See the definition -// of ASSERT_EQ in gtest.h for an example. -// -// If you don't care about formatting, you can use the more -// restrictive version: -// -// ASSERT_PRED1(pred, v1) -// ASSERT_PRED2(pred, v1, v2) -// ... -// -// where pred is an n-ary function or functor that returns bool, -// and the values v1, v2, ..., must support the << operator for -// streaming to std::ostream. -// -// We also define the EXPECT_* variations. -// -// For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. - -// GTEST_ASSERT_ is the basic statement to which all of the assertions -// in this file reduce. Don't use this in your code. - -#define GTEST_ASSERT_(expression, on_failure) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (const ::testing::AssertionResult gtest_ar = (expression)) \ - ; \ - else \ - on_failure(gtest_ar.failure_message()) - - -// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. 
-template -AssertionResult AssertPred1Helper(const char* pred_text, - const char* e1, - Pred pred, - const T1& v1) { - if (pred(v1)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. -// Don't use this in your code. -#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, v1),\ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use -// this in your code. -#define GTEST_PRED1_(pred, v1, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ - #v1, \ - pred, \ - v1), on_failure) - -// Unary predicate assertion macros. -#define EXPECT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT1(pred_format, v1) \ - GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED1(pred, v1) \ - GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -template -AssertionResult AssertPred2Helper(const char* pred_text, - const char* e1, - const char* e2, - Pred pred, - const T1& v1, - const T2& v2) { - if (pred(v1, v2)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. -// Don't use this in your code. -#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2),\ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use -// this in your code. -#define GTEST_PRED2_(pred, v1, v2, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ - #v1, \ - #v2, \ - pred, \ - v1, \ - v2), on_failure) - -// Binary predicate assertion macros. -#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ - GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED2(pred, v1, v2) \ - GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. -template -AssertionResult AssertPred3Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3) { - if (pred(v1, v2, v3)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. -// Don't use this in your code. -#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3),\ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use -// this in your code. 
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - pred, \ - v1, \ - v2, \ - v3), on_failure) - -// Ternary predicate assertion macros. -#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ - GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED3(pred, v1, v2, v3) \ - GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -template -AssertionResult AssertPred4Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4) { - if (pred(v1, v2, v3, v4)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. -// Don't use this in your code. -#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4),\ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use -// this in your code. -#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4), on_failure) - -// 4-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ - GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ - GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) - - - -// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use -// this in your code. -template -AssertionResult AssertPred5Helper(const char* pred_text, - const char* e1, - const char* e2, - const char* e3, - const char* e4, - const char* e5, - Pred pred, - const T1& v1, - const T2& v2, - const T3& v3, - const T4& v4, - const T5& v5) { - if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); - - return AssertionFailure() << pred_text << "(" - << e1 << ", " - << e2 << ", " - << e3 << ", " - << e4 << ", " - << e5 << ") evaluates to false, where" - << "\n" << e1 << " evaluates to " << v1 - << "\n" << e2 << " evaluates to " << v2 - << "\n" << e3 << " evaluates to " << v3 - << "\n" << e4 << " evaluates to " << v4 - << "\n" << e5 << " evaluates to " << v5; -} - -// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. -// Don't use this in your code. -#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5),\ - on_failure) - -// Internal macro for implementing {EXPECT|ASSERT}_PRED5. 
Don't use -// this in your code. -#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ - GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ - #v1, \ - #v2, \ - #v3, \ - #v4, \ - #v5, \ - pred, \ - v1, \ - v2, \ - v3, \ - v4, \ - v5), on_failure) - -// 5-ary predicate assertion macros. -#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) -#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ - GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) -#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ - GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) - - - -#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ - -// Macros for testing equalities and inequalities. -// -// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual -// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 -// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 -// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 -// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 -// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 -// -// When they are not, Google Test prints both the tested expressions and -// their actual values. The values must be compatible built-in types, -// or you will get a compiler error. By "compatible" we mean that the -// values can be compared by the respective operator. -// -// Note: -// -// 1. It is possible to make a user-defined type work with -// {ASSERT|EXPECT}_??(), but that requires overloading the -// comparison operators and is thus discouraged by the Google C++ -// Usage Guide. Therefore, you are advised to use the -// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are -// equal. -// -// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on -// pointers (in particular, C strings). Therefore, if you use it -// with two C strings, you are testing how their locations in memory -// are related, not how their content is related. To compare two C -// strings by content, use {ASSERT|EXPECT}_STR*(). -// -// 3. {ASSERT|EXPECT}_EQ(expected, actual) is preferred to -// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you -// what the actual value is when it fails, and similarly for the -// other comparisons. -// -// 4. Do not depend on the order in which {ASSERT|EXPECT}_??() -// evaluate their arguments, which is undefined. -// -// 5. These macros evaluate their arguments exactly once. 
-// -// Examples: -// -// EXPECT_NE(5, Foo()); -// EXPECT_EQ(NULL, a_pointer); -// ASSERT_LT(i, array_size); -// ASSERT_GT(records.size(), 0) << "There is no record left."; - -#define EXPECT_EQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) -#define EXPECT_NE(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) -#define EXPECT_LE(val1, val2) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) -#define EXPECT_LT(val1, val2) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) -#define EXPECT_GE(val1, val2) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) -#define EXPECT_GT(val1, val2) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) - -#define GTEST_ASSERT_EQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal:: \ - EqHelper::Compare, \ - expected, actual) -#define GTEST_ASSERT_NE(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) -#define GTEST_ASSERT_LE(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) -#define GTEST_ASSERT_LT(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) -#define GTEST_ASSERT_GE(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) -#define GTEST_ASSERT_GT(val1, val2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) - -// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of -// ASSERT_XY(), which clashes with some users' own code. - -#if !GTEST_DONT_DEFINE_ASSERT_EQ -# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) -#endif - -#if !GTEST_DONT_DEFINE_ASSERT_NE -# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) -#endif - -#if !GTEST_DONT_DEFINE_ASSERT_LE -# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) -#endif - -#if !GTEST_DONT_DEFINE_ASSERT_LT -# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) -#endif - -#if !GTEST_DONT_DEFINE_ASSERT_GE -# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) -#endif - -#if !GTEST_DONT_DEFINE_ASSERT_GT -# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) -#endif - -// C String Comparisons. All tests treat NULL and any non-NULL string -// as different. Two NULLs are equal. -// -// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2 -// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2 -// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case -// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case -// -// For wide or narrow string objects, you can use the -// {ASSERT|EXPECT}_??() macros. -// -// Don't depend on the order in which the arguments are evaluated, -// which is undefined. -// -// These macros evaluate their arguments exactly once. 
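
A brief, hedged illustration of the content-versus-pointer distinction noted in the comment above; the test name and buffers are invented for this sketch and are not part of the header being removed:

    TEST(CStringAssertionTest, ComparesContentNotAddress) {
      char buffer[] = "hello";            // distinct storage from the literal below
      const char* literal = "hello";
      EXPECT_STREQ(buffer, literal);      // equal by content
      EXPECT_STRCASEEQ(buffer, "HELLO");  // equal when case is ignored
      EXPECT_STRNE(buffer, "world");      // different content
      // EXPECT_EQ(buffer, literal) would compare the two pointer values instead,
      // which is rarely what is intended for C strings.
    }
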
- -#define EXPECT_STREQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) -#define EXPECT_STRNE(s1, s2) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define EXPECT_STRCASEEQ(expected, actual) \ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) -#define EXPECT_STRCASENE(s1, s2)\ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) - -#define ASSERT_STREQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) -#define ASSERT_STRNE(s1, s2) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) -#define ASSERT_STRCASEEQ(expected, actual) \ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) -#define ASSERT_STRCASENE(s1, s2)\ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) - -// Macros for comparing floating-point numbers. -// -// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): -// Tests that two float values are almost equal. -// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): -// Tests that two double values are almost equal. -// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): -// Tests that v1 and v2 are within the given distance to each other. -// -// Google Test uses ULP-based comparison to automatically pick a default -// error bound that is appropriate for the operands. See the -// FloatingPoint template class in gtest-internal.h if you are -// interested in the implementation details. - -#define EXPECT_FLOAT_EQ(expected, actual)\ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) - -#define EXPECT_DOUBLE_EQ(expected, actual)\ - EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) - -#define ASSERT_FLOAT_EQ(expected, actual)\ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) - -#define ASSERT_DOUBLE_EQ(expected, actual)\ - ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ - expected, actual) - -#define EXPECT_NEAR(val1, val2, abs_error)\ - EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) - -#define ASSERT_NEAR(val1, val2, abs_error)\ - ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ - val1, val2, abs_error) - -// These predicate format functions work on floating-point values, and -// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. -// -// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0); - -// Asserts that val1 is less than, or almost equal to, val2. Fails -// otherwise. In particular, it fails if either val1 or val2 is NaN. -GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, - float val1, float val2); -GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, - double val1, double val2); - - -#if GTEST_OS_WINDOWS - -// Macros that test for HRESULT failure and success, these are only useful -// on Windows, and rely on Windows SDK macros and APIs to compile. -// -// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr) -// -// When expr unexpectedly fails or succeeds, Google Test prints the -// expected result and the actual result with both a human-readable -// string representation of the error, if available, as well as the -// hex result code. 
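
Returning briefly to the ULP-based floating-point assertions declared just above, here is a minimal usage sketch; the test name and values are illustrative only, chosen to show the classic 0.1 + 0.2 rounding case:

    TEST(FloatingPointAssertionTest, ToleratesRounding) {
      const double sum = 0.1 + 0.2;   // not exactly 0.3 in binary floating point
      EXPECT_DOUBLE_EQ(0.3, sum);     // ULP-based comparison absorbs the rounding error
      EXPECT_NEAR(0.3, sum, 1e-12);   // explicit absolute error bound
      EXPECT_PRED_FORMAT2(::testing::DoubleLE, sum, 0.5);  // sum <= 0.5 (or nearly so)
    }
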
-# define EXPECT_HRESULT_SUCCEEDED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) - -# define ASSERT_HRESULT_SUCCEEDED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) - -# define EXPECT_HRESULT_FAILED(expr) \ - EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) - -# define ASSERT_HRESULT_FAILED(expr) \ - ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) - -#endif // GTEST_OS_WINDOWS - -// Macros that execute statement and check that it doesn't generate new fatal -// failures in the current thread. -// -// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement); -// -// Examples: -// -// EXPECT_NO_FATAL_FAILURE(Process()); -// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; -// -#define ASSERT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) -#define EXPECT_NO_FATAL_FAILURE(statement) \ - GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) - -// Causes a trace (including the source file path, the current line -// number, and the given message) to be included in every test failure -// message generated by code in the current scope. The effect is -// undone when the control leaves the current scope. -// -// The message argument can be anything streamable to std::ostream. -// -// In the implementation, we include the current line number as part -// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s -// to appear in the same block - as long as they are on different -// lines. -#define SCOPED_TRACE(message) \ - ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, ::testing::Message() << (message)) - -// Compile-time assertion for type equality. -// StaticAssertTypeEq() compiles iff type1 and type2 are -// the same type. The value it returns is not interesting. -// -// Instead of making StaticAssertTypeEq a class template, we make it a -// function template that invokes a helper class template. This -// prevents a user from misusing StaticAssertTypeEq by -// defining objects of that type. -// -// CAVEAT: -// -// When used inside a method of a class template, -// StaticAssertTypeEq() is effective ONLY IF the method is -// instantiated. For example, given: -// -// template class Foo { -// public: -// void Bar() { testing::StaticAssertTypeEq(); } -// }; -// -// the code: -// -// void Test1() { Foo foo; } -// -// will NOT generate a compiler error, as Foo::Bar() is never -// actually instantiated. Instead, you need: -// -// void Test2() { Foo foo; foo.Bar(); } -// -// to cause a compiler error. -template -bool StaticAssertTypeEq() { - (void)internal::StaticAssertTypeEqHelper(); - return true; -} - -// Defines a test. -// -// The first parameter is the name of the test case, and the second -// parameter is the name of the test within the test case. -// -// The convention is to end the test case name with "Test". For -// example, a test case for the Foo class can be named FooTest. -// -// The user should put his test code between braces after using this -// macro. Example: -// -// TEST(FooTest, InitializesCorrectly) { -// Foo foo; -// EXPECT_TRUE(foo.StatusIsOK()); -// } - -// Note that we call GetTestTypeId() instead of GetTypeId< -// ::testing::Test>() here to get the type ID of testing::Test. This -// is to work around a suspected linker bug when using Google Test as -// a framework on Mac OS X. 
The bug causes GetTypeId< -// ::testing::Test>() to return different values depending on whether -// the call is from the Google Test framework itself or from user test -// code. GetTestTypeId() is guaranteed to always return the same -// value, as it always calls GetTypeId<>() from the Google Test -// framework. -#define GTEST_TEST(test_case_name, test_name)\ - GTEST_TEST_(test_case_name, test_name, \ - ::testing::Test, ::testing::internal::GetTestTypeId()) - -// Define this macro to 1 to omit the definition of TEST(), which -// is a generic name and clashes with some other libraries. -#if !GTEST_DONT_DEFINE_TEST -# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name) -#endif - -// Defines a test that uses a test fixture. -// -// The first parameter is the name of the test fixture class, which -// also doubles as the test case name. The second parameter is the -// name of the test within the test case. -// -// A test fixture class must be declared earlier. The user should put -// his test code between braces after using this macro. Example: -// -// class FooTest : public testing::Test { -// protected: -// virtual void SetUp() { b_.AddElement(3); } -// -// Foo a_; -// Foo b_; -// }; -// -// TEST_F(FooTest, InitializesCorrectly) { -// EXPECT_TRUE(a_.StatusIsOK()); -// } -// -// TEST_F(FooTest, ReturnsElementCountCorrectly) { -// EXPECT_EQ(0, a_.size()); -// EXPECT_EQ(1, b_.size()); -// } - -#define TEST_F(test_fixture, test_name)\ - GTEST_TEST_(test_fixture, test_name, test_fixture, \ - ::testing::internal::GetTypeId()) - -// Use this macro in main() to run all tests. It returns 0 if all -// tests are successful, or 1 otherwise. -// -// RUN_ALL_TESTS() should be invoked after the command line has been -// parsed by InitGoogleTest(). - -#define RUN_ALL_TESTS()\ - (::testing::UnitTest::GetInstance()->Run()) - -} // namespace testing - -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/kokkos/kokkos/containers/src/Kokkos_DualView.hpp b/kokkos/kokkos/containers/src/Kokkos_DualView.hpp deleted file mode 100644 index 80a30b7..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_DualView.hpp +++ /dev/null @@ -1,241 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - - -/* DualView: container class to manage data structures which exist both on Host and Device - * member functions: - * DualView() - * DualView(label,dim0,dim1,dim2,...) - * view() - * sync() - * modify() - * resize(dim0,dim1,dim2,...) - */ -#ifndef KOKKOS_DUALVIEW_HPP -#define KOKKOS_DUALVIEW_HPP - -#include -namespace Kokkos { - -template< class T , class L , class D> -class DualView { -public: - - /* Define base types for Device and Host */ - - typedef Kokkos::View t_dev ; - typedef typename t_dev::HostMirror t_host ; - - /* Define typedefs for different usage scenarios */ - - // Define const view types - typedef Kokkos::View t_dev_const; - typedef typename t_dev_const::HostMirror t_host_const; - - // Define const randomread view types - typedef Kokkos::View t_dev_const_randomread ; - typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread; - - // Define unmanaged view types - typedef Kokkos::View t_dev_um; - typedef Kokkos::View t_host_um; - - // Define const unmanaged view types - typedef Kokkos::View t_dev_const_um; - typedef Kokkos::View t_host_const_um; - - /* provide the same typedefs as a view for scalar, data and value types */ - - typedef typename t_dev::value_type value_type; - typedef typename t_dev::const_value_type const_value_type; - typedef typename t_dev::scalar_type scalar_type; - typedef typename t_dev::const_scalar_type const_scalar_type; - typedef typename t_dev::non_const_scalar_type non_const_scalar_type; - - /* Instances of base types */ - - t_dev d_view; - t_host h_view; - - - /* Counters to keep track of changes (dirty-flags) */ - - unsigned int modified_device; - unsigned int modified_host; - - /* Return view on specific device via view() */ - - template< class Device > - const typename Kokkos::Impl::if_c< Kokkos::Impl::is_same< typename t_dev::memory_space , - typename Device::memory_space >::value , - t_dev , t_host >::type view() const - { - return Kokkos::Impl::if_c< Kokkos::Impl::is_same< typename t_dev::memory_space , - typename Device::memory_space >::value , - t_dev , t_host >::select( d_view , h_view ); - } - - - /* Construct views */ - - /* Empty Constructor */ - - DualView() { - modified_host = 0; - modified_device = 0; - } - - /* Create view with allocation on both host and device */ - - DualView( const std::string & label , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : d_view( label, n0, n1, n2, n3, n4, n5, n6, n7 ) - , h_view( create_mirror_view( d_view ) ) - { - modified_host = 0; - modified_device = 0; - } - - /* Update data on device or host only if other space is polluted */ - - template - void sync() { - unsigned int dev = Kokkos::Impl::if_c< Kokkos::Impl::is_same< typename 
t_dev::memory_space , - typename Device::memory_space >::value , - unsigned int , unsigned int >::select( 1, 0 ); - - if(dev) { - if((modified_host > 0) && (modified_host >= modified_device)) { - Kokkos::deep_copy(d_view,h_view); - modified_host = modified_device = 0; - } - } else { - if((modified_device > 0) && (modified_device >= modified_host)) { - Kokkos::deep_copy(h_view,d_view); - modified_host = modified_device = 0; - } - } - } - - /* Mark data as dirty on a device */ - - template - void modify() { - unsigned int dev = Kokkos::Impl::if_c< Kokkos::Impl::is_same< typename t_dev::memory_space , - typename Device::memory_space >::value , - unsigned int , unsigned int >::select( 1, 0 ); - - if(dev) { - modified_device = (modified_device > modified_host ? modified_device : modified_host) + 1; - } else { - modified_host = (modified_device > modified_host ? modified_device : modified_host) + 1; - } - } - - /* Realloc both views, no deep copy */ - - void realloc( const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) { - Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); - h_view = create_mirror_view( d_view ); - - /* Reset dirty flags */ - modified_device = modified_host = 0; - } - - /* Resize both views, only do deep_copy in space which was last marked as dirty */ - - void resize( const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) { - if(modified_device >= modified_host) { - /* Resize on Device */ - Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7); - h_view = create_mirror_view( d_view ); - - /* Mark Device copy as modified */ - modified_device++; - - } else { - /* Realloc on Device */ - - Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7); - t_host temp_view = create_mirror_view( d_view ); - - /* Remap on Host */ - Kokkos::Impl::ViewRemap< t_host , t_host >( temp_view , h_view ); - h_view = temp_view; - - /* Mark Host copy as modified */ - modified_host++; - } - } - - size_t capacity() const { - return d_view.capacity(); - } -}; -} -#endif diff --git a/kokkos/kokkos/containers/src/Kokkos_Functional.hpp b/kokkos/kokkos/containers/src/Kokkos_Functional.hpp deleted file mode 100644 index eb327af..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_Functional.hpp +++ /dev/null @@ -1,196 +0,0 @@ -#ifndef KOKKOS_FUNCTIONAL_HPP -#define KOKKOS_FUNCTIONAL_HPP - -#include -#include - -namespace Kokkos { - -namespace Impl { - -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
-KOKKOS_FORCEINLINE_FUNCTION -uint32_t getblock32 ( const uint8_t * p, int i ) -{ -// used to avoid aliasing error which could cause errors with -// forced inlining - return ((uint32_t)p[i*4+0]) - | ((uint32_t)p[i*4+1] << 8) - | ((uint32_t)p[i*4+2] << 16) - | ((uint32_t)p[i*4+3] << 24); -} - -KOKKOS_FORCEINLINE_FUNCTION -uint32_t rotl32 ( uint32_t x, int8_t r ) -{ return (x << r) | (x >> (32 - r)); } - -KOKKOS_FORCEINLINE_FUNCTION -uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -KOKKOS_INLINE_FUNCTION -uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - - //---------- - // body - - for(int i=0; i -struct hash -{ - typedef T argument_type; - typedef T first_argument_type; - typedef uint32_t second_argument_type; - typedef uint32_t result_type; - - KOKKOS_FORCEINLINE_FUNCTION - uint32_t operator()(T const & t) const - { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); } - - KOKKOS_FORCEINLINE_FUNCTION - uint32_t operator()(T const & t, uint32_t seed) const - { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); } -}; - - - -template -struct equal_to -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a == b; } -}; - -template -struct not_equal_to -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a != b; } -}; - - -template -struct greater -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a > b; } -}; - - -template -struct less -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a < b; } -}; - -template -struct greater_equal -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a >= b; } -}; - - -template -struct less_equal -{ - typedef T first_argument_type; - typedef T second_argument_type; - typedef bool result_type; - - KOKKOS_FORCEINLINE_FUNCTION - bool operator()(T const & a, T const & b) const - { return a <= b; } -}; - -} // namespace Kokkos - - -#endif //KOKKOS_FUNCTIONAL_HPP - - diff --git a/kokkos/kokkos/containers/src/Kokkos_Pair.hpp b/kokkos/kokkos/containers/src/Kokkos_Pair.hpp deleted file mode 100644 index 8fc39aa..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_Pair.hpp +++ /dev/null @@ -1,287 +0,0 @@ -/// \file Kokkos_Pair.hpp -/// \brief Declaration and definition of Kokkos::pair. -/// -/// This header file declares and defines Kokkos::pair and its related -/// nonmember functions. - -#ifndef KOKKOS_CONTAINERS_PAIR_HPP -#define KOKKOS_CONTAINERS_PAIR_HPP - -#include -#include -#include - -namespace Kokkos { -/// \struct pair -/// \brief Replacement for std::pair that works on CUDA devices. 
-/// -/// The instance methods of std::pair, including its constructors, are -/// not marked as __device__ functions. Thus, they cannot be -/// called on a CUDA device, such as an NVIDIA GPU. This struct -/// implements the same interface as std::pair, but can be used on a -/// CUDA device as well as on the host. -template -struct pair -{ - //! The first template parameter of this class. - typedef T1 first_type; - //! The second template parameter of this class. - typedef T2 second_type; - - //! The first element of the pair. - first_type first; - //! The second element of the pair. - second_type second; - - /// \brief Default constructor. - /// - /// This calls the default constructors of T1 and T2. It won't - /// compile if those default constructors are not defined and - /// public. - KOKKOS_FORCEINLINE_FUNCTION - pair() - : first(), second() - {} - - /// \brief Constructor that takes both elements of the pair. - /// - /// This calls the copy constructors of T1 and T2. It won't compile - /// if those copy constructors are not defined and public. - KOKKOS_FORCEINLINE_FUNCTION - pair(const first_type & f, const second_type & s) - : first(f), second(s) - {} - - /// \brief Copy constructor. - /// - /// This calls the copy constructors of T1 and T2. It won't compile - /// if those copy constructors are not defined and public. - template - KOKKOS_FORCEINLINE_FUNCTION - pair( const pair &p) - : first(p.first), second(p.second) - {} - - /// \brief Assignment operator. - /// - /// This calls the assignment operators of T1 and T2. It won't - /// compile if the assignment operators are not defined and public. - template - KOKKOS_FORCEINLINE_FUNCTION - pair & operator=(const pair &p) - { - first = p.first; - second = p.second; - return *this; - } - - // from std::pair - template - pair( const std::pair &p) - : first(p.first), second(p.second) - {} - - /// \brief Return the std::pair version of this object. - /// - /// This is not a device function; you may not call it on a - /// CUDA device. It is meant to be called on the host, if the user - /// wants an std::pair instead of a Kokkos::pair. - /// - /// \note This is not a conversion operator, since defining a - /// conversion operator made the relational operators have - /// ambiguous definitions. - std::pair to_std_pair() const - { return std::make_pair(first,second); } -}; - -//! Equality operator for Kokkos::pair. -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator== (const pair& lhs, const pair& rhs) -{ return lhs.first==rhs.first && lhs.second==rhs.second; } - -//! Inequality operator for Kokkos::pair. -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator!= (const pair& lhs, const pair& rhs) -{ return !(lhs==rhs); } - -//! Less-than operator for Kokkos::pair. -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator< (const pair& lhs, const pair& rhs) -{ return lhs.first -KOKKOS_FORCEINLINE_FUNCTION -bool operator<= (const pair& lhs, const pair& rhs) -{ return !(rhs -KOKKOS_FORCEINLINE_FUNCTION -bool operator> (const pair& lhs, const pair& rhs) -{ return rhs -KOKKOS_FORCEINLINE_FUNCTION -bool operator>= (const pair& lhs, const pair& rhs) -{ return !(lhs -KOKKOS_FORCEINLINE_FUNCTION -pair make_pair (T1 x, T2 y) -{ return ( pair(x,y) ); } - -/// \brief Return a pair of references to the input arguments. -/// -/// This compares to std::tie (new in C++11). You can use it to -/// assign to two variables at once, from the result of a function -/// that returns a pair. 
For example (__device__ and -/// __host__ attributes omitted for brevity): -/// \code -/// // Declaration of the function to call. -/// // First return value: operation count. -/// // Second return value: whether all operations succeeded. -/// Kokkos::pair someFunction (); -/// -/// // Code that uses Kokkos::tie. -/// int myFunction () { -/// int count = 0; -/// bool success = false; -/// -/// // This assigns to both count and success. -/// Kokkos::tie (count, success) = someFunction (); -/// -/// if (! success) { -/// // ... Some operation failed; -/// // take corrective action ... -/// } -/// return count; -/// } -/// \endcode -/// -/// The line that uses tie() could have been written like this: -/// \code -/// Kokkos::pair result = someFunction (); -/// count = result.first; -/// success = result.second; -/// \endcode -/// -/// Using tie() saves two lines of code and avoids a copy of each -/// element of the pair. The latter could be significant if one or -/// both elements of the pair are more substantial objects than \c int -/// or \c bool. -template -KOKKOS_FORCEINLINE_FUNCTION -pair tie (T1 & x, T2 & y) -{ return ( pair(x,y) ); } - -// -// Specialization of Kokkos::pair for a \c void second argument. This -// is not actually a "pair"; it only contains one element, the first. -// -template -struct pair -{ - typedef T1 first_type; - typedef void second_type; - - first_type first; - enum { second = 0 }; - - KOKKOS_FORCEINLINE_FUNCTION - pair() - : first() - {} - - KOKKOS_FORCEINLINE_FUNCTION - pair(const first_type & f) - : first(f) - {} - - KOKKOS_FORCEINLINE_FUNCTION - pair(const first_type & f, int) - : first(f) - {} - - template - KOKKOS_FORCEINLINE_FUNCTION - pair( const pair &p) - : first(p.first) - {} - - template - KOKKOS_FORCEINLINE_FUNCTION - pair & operator=(const pair &p) - { - first = p.first; - return *this; - } -}; - -// -// Specialization of relational operators for Kokkos::pair. 
-// - -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator== (const pair& lhs, const pair& rhs) -{ return lhs.first==rhs.first; } - -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator!= (const pair& lhs, const pair& rhs) -{ return !(lhs==rhs); } - -template -KOKKOS_FORCEINLINE_FUNCTION -bool operator< (const pair& lhs, const pair& rhs) -{ return lhs.first -KOKKOS_FORCEINLINE_FUNCTION -bool operator<= (const pair& lhs, const pair& rhs) -{ return !(rhs -KOKKOS_FORCEINLINE_FUNCTION -bool operator> (const pair& lhs, const pair& rhs) -{ return rhs -KOKKOS_FORCEINLINE_FUNCTION -bool operator>= (const pair& lhs, const pair& rhs) -{ return !(lhs -struct hash< pair > -{ - typedef pair argument_type; - typedef pair first_argument_type; - typedef uint32_t second_argument_type; - typedef uint32_t result_type; - - KOKKOS_FORCEINLINE_FUNCTION - uint32_t operator()( const pair & p, uint32_t seed = 0u) const - { - typedef hash hash1; - typedef hash hash2; - return hash1(p.first, hash2(p.second,seed)); - } -}; - - -} // namespace Kokkos - - -#endif //KOKKOS_CONTAINERS_PAIR_HPP diff --git a/kokkos/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/kokkos/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp deleted file mode 100644 index 4c66068..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ /dev/null @@ -1,175 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_STATICCRSGRAPH_HPP -#define KOKKOS_STATICCRSGRAPH_HPP - -#include -#include - -#include - -namespace Kokkos { - -/// \class StaticCrsGraph -/// \brief Compressed row storage array. -/// -/// \tparam DataType The type of stored entries. If a StaticCrsGraph is -/// used as the graph of a sparse matrix, then this is usually an -/// integer type, the type of the column indices in the sparse -/// matrix. -/// -/// \tparam Arg1Type The second template parameter, corresponding -/// either to the Device type (if there are no more template -/// parameters) or to the Layout type (if there is at least one more -/// template parameter). -/// -/// \tparam Arg2Type The third template parameter, which if provided -/// corresponds to the Device type. -/// -/// \tparam SizeType The type of row offsets. Usually the default -/// parameter suffices. However, setting a nondefault value is -/// necessary in some cases, for example, if you want to have a -/// sparse matrices with dimensions (and therefore column indices) -/// that fit in \c int, but want to store more than INT_MAX -/// entries in the sparse matrix. -/// -/// A row has a range of entries: -///
-/// - row_map[i0] <= entry < row_map[i0+1]
-/// - 0 <= i1 < row_map[i0+1] - row_map[i0]
-/// - entries( entry , i2 , i3 , ... );
-/// - entries( row_map[i0] + i1 , i2 , i3 , ... );
-///
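
To make the row_map/entries indexing above concrete, here is a small hedged sketch of visiting one row. The visit_row helper and its GraphType parameter are hypothetical; it assumes only the row_map and entries members declared in the class below:

    // Hypothetical helper: touch every entry of row i0 of a compressed-row graph.
    template <class GraphType>
    KOKKOS_INLINE_FUNCTION
    void visit_row(const GraphType& graph, typename GraphType::size_type i0)
    {
      const typename GraphType::size_type begin = graph.row_map(i0);     // row_map[i0]
      const typename GraphType::size_type end   = graph.row_map(i0 + 1); // row_map[i0+1]
      for (typename GraphType::size_type entry = begin; entry < end; ++entry) {
        // i1 = entry - begin is the local index within the row;
        // 'entry' indexes the entries view directly.
        (void) graph.entries(entry);
      }
    }
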
-template< class DataType, - class Arg1Type, - class Arg2Type = void, - typename SizeType = typename ViewTraits::size_type> -class StaticCrsGraph { -private: - typedef ViewTraits traits; - -public: - typedef DataType data_type; - typedef typename traits::array_layout array_layout; - typedef typename traits::device_type device_type; - typedef SizeType size_type; - - typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type; - typedef StaticCrsGraph< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror; - //typedef StaticCrsGraph< DataType , array_layout , Kokkos::Threads , SizeType > HostMirror; - typedef View< const size_type* , array_layout, device_type > row_map_type; - typedef View< DataType* , array_layout, device_type > entries_type; - - entries_type entries; - row_map_type row_map; - - //! Construct an empty view. - StaticCrsGraph () : entries(), row_map() {} - - //! Copy constructor (shallow copy). - StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map) - {} - - template - StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_) - {} - - /** \brief Assign to a view of the rhs array. - * If the old view is the last view - * then allocated memory is deallocated. - */ - StaticCrsGraph& operator= (const StaticCrsGraph& rhs) { - entries = rhs.entries; - row_map = rhs.row_map; - return *this; - } - - /** \brief Destroy this view of the array. - * If the last view then allocated memory is deallocated. - */ - ~StaticCrsGraph() {} -}; - -//---------------------------------------------------------------------------- - -template< class StaticCrsGraphType , class InputSizeType > -typename StaticCrsGraphType::staticcrsgraph_type -create_staticcrsgraph( const std::string & label , - const std::vector< InputSizeType > & input ); - -template< class StaticCrsGraphType , class InputSizeType > -typename StaticCrsGraphType::staticcrsgraph_type -create_staticcrsgraph( const std::string & label , - const std::vector< std::vector< InputSizeType > > & input ); - -//---------------------------------------------------------------------------- - -template< class DataType , - class Arg1Type , - class Arg2Type , - typename SizeType > -typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const StaticCrsGraph & input ); - -template< class DataType , - class Arg1Type , - class Arg2Type , - typename SizeType > -typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror( const StaticCrsGraph & input ); - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ - diff --git a/kokkos/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/kokkos/kokkos/containers/src/Kokkos_UnorderedMap.hpp deleted file mode 100644 index 5671734..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ /dev/null @@ -1,1044 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under 
the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Kokkos_UnorderedMap.hpp -/// \brief Declaration and definition of Kokkos::UnorderedMap. -/// -/// This header file declares and defines Kokkos::UnorderedMap and its -/// related nonmember functions. - -#ifndef KOKKOS_UNORDERED_MAP_HPP -#define KOKKOS_UNORDERED_MAP_HPP - -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace Kokkos { - -/// \class UnorderedMap -/// \brief Thread-safe, performance-portable lookup table. -/// -/// This class provides a lookup table. In terms of functionality, -/// this class compares to std::unordered_map (new in C++11). -/// "Unordered" means that keys are not stored in any particular -/// order, unlike (for example) std::map. "Thread-safe" means that -/// lookups, insertion, and deletion are safe to call by multiple -/// threads in parallel. "Performance-portable" means that parallel -/// performance of these operations is reasonable, on multiple -/// hardware platforms. Platforms on which performance has been -/// tested include conventional Intel x86 multicore processors, Intel -/// Xeon Phi ("MIC"), and NVIDIA GPUs. -/// -/// Parallel performance portability entails design decisions that -/// might differ from one's expectation for a sequential interface. -/// This particularly affects insertion of single elements. In an -/// interface intended for sequential use, insertion might reallocate -/// memory if the original allocation did not suffice to hold the new -/// element. In this class, insertion does not reallocate -/// memory. This means that it might fail. insert() returns an enum -/// which indicates whether the insert failed. There are three -/// possible conditions: -///
-///   1. INSERT_FAILED: The insert failed.  This usually
-///      means that the UnorderedMap ran out of space.
-///   2. INSERT_SUCCESS: The insert succeeded, and the key
-///      did not exist in the table before.
-///   3. INSERT_EXISTING: The insert succeeded, and the key
-///      did exist in the table before.  The new value was
-///      ignored and the old value was left in place.
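To make the three outcomes concrete, here is a minimal device-side sketch of one way to react to them inside a parallel kernel, assuming the insert_result pair, the insert() signature, and the enum values declared later in this header. The functor name FillMap and the MapType/KeyView parameters are illustrative placeholders, not part of this file.

    // Hypothetical fill functor: one insert attempt per index.
    template <class MapType, class KeyView>
    struct FillMap {
      MapType map;    // e.g. Kokkos::UnorderedMap<int, int, Device>
      KeyView keys;   // e.g. Kokkos::View<const int*, Device>

      FillMap(const MapType& m, const KeyView& k) : map(m), keys(k) {}

      KOKKOS_INLINE_FUNCTION
      void operator()(const unsigned i) const {
        // insert() never reallocates, so it may fail when the table is full.
        typename MapType::insert_result r = map.insert(keys(i), static_cast<int>(i));
        if (r.first == Kokkos::INSERT_EXISTING) {
          // The key was already present and its old value was kept; r.second
          // points at the stored (key, value) pair, so the value can still be
          // overwritten in place if that is the desired behavior.
          r.second->second = static_cast<int>(i);
        }
        // INSERT_FAILED cannot be handled here; it is detected on the host
        // afterwards via failed_inserts().
      }
    };

A host-side sketch of recovering from failed inserts follows the insert-state enum further below.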
-/// -/// Users can access the number of failed insertions thus far by -/// calling failed_inserts(). This requires computation, and thus is -/// a computational kernel, not a device function. Once users -/// have the number of failed inserts, they may reserve() as much -/// space as they need and add the remaining elements (in a second -/// CUDA kernel launch, if applicable). We reiterate: users may -/// not call these methods in a parallel computational kernel. -/// They must run their parallel operation to completion, then call -/// failed_inserts(), reserve() if necessary, and run another parallel -/// kernel to add any remaining elements. -/// -/// \tparam Key Type of keys of the lookup table. If \c const, users -/// are not allowed to add or remove keys, though they are allowed -/// to change values. In that case, the implementation may make -/// optimizations specific to the Device. For example, if -/// Device is \c Cuda, it may use texture fetches to access -/// keys. -/// -/// \tparam T Type of values stored in the lookup table. You may use -/// \c void here, in which case the table will be a set of keys. If -/// \c const, users are not allowed to add, remove, or change -/// entries. In that case, the implementation may make -/// optimizations specific to the \c Device, such as using texture -/// fetches to access values. -/// -/// \tparam Device The Kokkos Device type. -/// -/// \tparam Compare Definition of the less-than comparison function -/// for instances of Key. If you rely on the default -/// template parameter for \c Hash, then there must be a -/// specialization of Kokkos::less for \c Key (without the \c const, -/// if \c Key is const). -/// -/// \tparam Hash Definition of the hash function for instances of -/// Key. If you rely on the default template parameter for -/// \c Hash, then there must be a specialization of Kokkos::hash for -/// \c Key (without the \c const, if \c Key is const). -template < typename Key - , typename T - , typename Device - , typename Compare = less::type> - , typename Hash = hash::type> - > -class UnorderedMap; - - -// Specialization of deep_copy for two UnorderedMap objects. -template < typename DKey, typename DT, typename DDevice - , typename SKey, typename ST, typename SDevice - , typename Compare, typename Hash > -inline void deep_copy( UnorderedMap & dst - , const UnorderedMap & src ) -{ - Impl::UnorderedMap::deep_copy_impl(dst, src); -} - - -/// \brief First element of the return value of UnorderedMap::insert(). -/// -/// Inserting an element into an UnorderedMap is not guaranteed to -/// succeed. There are three possible conditions: -///
-///   1. INSERT_FAILED: The insert failed.  This usually
-///      means that the UnorderedMap ran out of space.
-///   2. INSERT_SUCCESS: The insert succeeded, and the key
-///      did not exist in the table before.
-///   3. INSERT_EXISTING: The insert succeeded, and the key
-///      did exist in the table before.  The new value was
-///      ignored and the old value was left in place.
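The run-then-check-then-reserve workflow described in the class documentation above might look roughly like the following host-side driver. The map_type alias, the FillMap functor from the earlier sketch, the keys view, num_keys, Device, and the capacity-doubling heuristic are all assumed names, not part of this header.

    // Host-side driver sketch.  Device stands for any Kokkos device type.
    typedef Kokkos::UnorderedMap<int, int, Device> map_type;

    map_type map(10000);  // initial capacity guess; insert() never grows the table
    Kokkos::parallel_for(num_keys, FillMap<map_type, KeyView>(map, keys));

    // failed_inserts() runs a reduction on the host; it must not be called
    // from inside a parallel kernel.
    if (map.failed_inserts() > 0u) {
      // In this implementation reserve() only copies the existing entries when
      // there were no failed inserts, so the simplest recovery is to grow the
      // table and re-run the whole fill; keys inserted twice simply come back
      // as INSERT_EXISTING.
      map.reserve(2 * map.capacity());
      Kokkos::parallel_for(num_keys, FillMap<map_type, KeyView>(map, keys));
    }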
-enum UnorderedMap_insert_state -{ - INSERT_FAILED - , INSERT_SUCCESS - , INSERT_EXISTING -}; - - -// Specialization of UnorderedMap for nonconst Key and value (T). -template < typename Key - , typename T - , typename Device - , typename Compare - , typename Hash - > -class UnorderedMap -{ -public: - //! \name Public types and constants - //@{ - - typedef Impl::UnorderedMap::map_data map_data; - typedef Impl::UnorderedMap::node_atomic node_atomic; - - typedef typename map_data::device_type device_type; - typedef typename map_data::compare_type compare_type; - typedef typename map_data::hash_type hash_type; - typedef typename map_data::key_type key_type; - typedef typename map_data::mapped_type mapped_type; - typedef typename map_data::value_type value_type; - typedef typename map_data::pointer pointer; - typedef typename map_data::const_pointer const_pointer; - typedef typename map_data::node_type node_type; - typedef typename map_data::node_block_type node_block_type; - typedef typename map_data::size_type size_type; - - typedef pair insert_result; - - typedef UnorderedMap HostMirror; - - //@} -private: - - typedef typename Impl::if_c< map_data::has_void_mapped_type - , int - , mapped_type - >::type insert_mapped_type; - -public: - //! \name Public member functions - //@{ - - /// \brief Constructor - /// - /// \param arg_num_nodes [in] Initial requested maximum number of - /// entries in the hash table. - /// \param compare [in] Less-than comparison function for \c Key - /// instances. The default value usually suffices. - /// \param hash [in] Hash function for \c Key instances. The - /// default value usually suffices. - UnorderedMap( uint32_t arg_num_nodes = 0 - , compare_type compare = compare_type() - , hash_type hash = hash_type() - ) - : m_data( arg_num_nodes - , compare - , hash - ) - {} - - //! Clear all entries in the table. - void clear() - { - m_data = map_data(0, m_data.key_compare, m_data.key_hash); - } - - //! If the table is larger than necessary, shrink it to fit. - void shrink_to_fit() - { reserve(0); } - - /// \brief Reserve space for \c new_capacity entries. - /// - /// This is not a device function; it may not be - /// called in a parallel kernel. - void reserve(unsigned new_capacity) - { - const uint32_t curr_size = size(); - new_capacity = new_capacity < curr_size ? curr_size : new_capacity; - - UnorderedMap - tmp(new_capacity, m_data.key_compare, m_data.key_hash); - - if (new_capacity > 0u && failed_inserts() == 0u ) { - Impl::UnorderedMap::copy_map(tmp,*this); - } - *this = tmp; - } - - /// \brief Check sanity of the hash table. - /// - /// "Sanity" means integrity of data structures. Checking this is - /// useful for debugging. - void check_sanity() const - { m_data.check_sanity(); } - - /// \brief The number of entries in the table. - /// - /// Note that this is not a device function; it cannot be called in - /// a parallel kernel. The value is not stored as a variable; it - /// must be computed. - uint32_t size() const - { return m_data.size(); } - - /// \brief The number of unused entries in the table. - /// - /// This is not a device function; it may not be - /// called in a parallel kernel. The value is not stored as a - /// variable; it must be computed. - uint32_t unused() const - { return m_data.unused(); } - - /// \brief The number of entries pending deletion in the table. - /// - /// This is not a device function; it may not be - /// called in a parallel kernel. The value is not stored as a - /// variable; it must be computed. 
- uint32_t pending_delete() const - { return m_data.pending_delete(); } - - /// \brief The current number of failed insert() calls. - /// - /// This is not a device function; it may not be - /// called in a parallel kernel. The value is not stored as a - /// variable; it must be computed. - uint32_t failed_inserts() const - { return m_data.failed_inserts(); } - - /// \brief The maximum number of entries that the table can hold. - /// - /// This is a device function; it may be called in a parallel - /// kernel. - KOKKOS_INLINE_FUNCTION - uint32_t capacity() const - { return m_data.capacity(); } - - /// \brief The number of hash table "buckets." - /// - /// This is different than the number of entries that the table can - /// hold. Each key hashes to an index in [0, hash_capacity() - 1]. - /// That index can hold zero or more entries. This class decides - /// what hash_capacity() should be, given the user's upper bound on - /// the number of entries the table must be able to hold. - /// - /// This is a device function; it may be called in a parallel - /// kernel. - KOKKOS_INLINE_FUNCTION - uint32_t hash_capacity() const - { return m_data.hash_capacity(); } - - /// \brief Remove entries that are pending deletion. - /// - /// The mark_pending_delete() method marks an entry as "pending - /// deletion." This method actually removes such entries from the - /// table. - /// - /// This is not a device function; it may not be - /// called in a parallel kernel. - void remove_pending_delete() const - { return m_data.remove_pending_delete_keys(); } - - //--------------------------------------------------------------------------- - //--------------------------------------------------------------------------- - - /// \brief Attempt to insert the given (key, value) pair. - /// - /// This is a device function; it may be called in a parallel - /// kernel. As discussed in the class documentation, it need not - /// succeed. The return value tells you if it did. - /// - /// \param k [in] The key to attempt to insert. - /// \param v [in] The corresponding value to attempt to insert. If - /// using this class as a set (with T = void), then you need not - /// provide this value. 
- KOKKOS_INLINE_FUNCTION - insert_result insert(const key_type & k, const insert_mapped_type & v = insert_mapped_type()) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - m_data.set_modified(); - - insert_result result(INSERT_FAILED,NULL); - - const uint32_t hash_value = m_data.key_hash(k); - const uint32_t hash_index = hash_value % m_data.hashes.size(); - - uint32_t node_index = node_atomic::invalid_next; - - bool curr_equal = false; - uint32_t curr_index = node_atomic::invalid_next; - volatile uint64_t * prev_atomic = & m_data.hashes[hash_index].value; - uint64_t prev = 0u; - - find_previous(k,prev_atomic,prev,curr_equal,curr_index); - - do { - if (curr_equal) { - if (node_index != node_atomic::invalid_next) { - // release any node that was claimed by this thread - m_data.get_node(node_index).atomic = node_atomic::make_atomic(node_atomic::invalid_next, Impl::UnorderedMap::UNUSED); -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - volatile int * used_count = &m_data.node_blocks[node_index>>node_block_type::shift].used_count; - atomic_fetch_add(used_count, -1); - } - // Node already exist - result = insert_result(INSERT_EXISTING, &m_data.get_node(curr_index).value); - break; - } - else { - // try to insert here - if (node_index == node_atomic::invalid_next) { - node_index = find_unused_node(hash_value); - if (node_index == node_atomic::invalid_next) { - // unable to obtain an unused node - break; - } - } - // this thread has unique control of the node - // so can construct the value and set up the state and next index - node_type & n = m_data.get_node(node_index); - n.destruct_value(); - n.construct_value(value_type(k,v)); - n.atomic = node_atomic::make_atomic( curr_index, Impl::UnorderedMap::USED); - - uint64_t new_atomic = node_atomic::make_atomic( node_index, node_atomic::state(prev)); - -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - const bool ok = atomic_compare_exchange_strong( prev_atomic, prev, new_atomic); - if ( ok ) { - // successfully inserted the node - result = insert_result(INSERT_SUCCESS, &n.value); - break; - } - } - // insert failed -- find correct insertion point again - find_previous(k,prev_atomic,prev,curr_equal,curr_index); - } while (true); - return result; - } - - /// \brief Mark the given key for deletion. - /// - /// This does not actually free memory; it just marks the entry of - /// the table with the given key \c k as deleted. - /// - /// This is a device function; it may be called in a parallel - /// kernel. 
- KOKKOS_INLINE_FUNCTION - void mark_pending_delete(const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - m_data.set_modified(); - - const uint32_t hash_value = m_data.key_hash(k); - const uint32_t hash_index = hash_value % m_data.hashes.size(); - - uint32_t node_index = node_atomic::invalid_next; - - bool curr_equal = false; - uint32_t curr_index = node_atomic::invalid_next; - volatile uint64_t * prev_atomic = & m_data.hashes[hash_index].value; - uint64_t prev = 0u; - - find_previous(k,prev_atomic,prev,curr_equal,curr_index); - - do { - if (curr_equal) { - if (node_index != node_atomic::invalid_next) { - // release any node that was claimed by this thread - m_data.get_node(node_index).atomic = node_atomic::make_atomic(node_atomic::invalid_next, Impl::UnorderedMap::UNUSED); -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - volatile int * used_count = &m_data.node_blocks[node_index>>node_block_type::shift].used_count; - atomic_fetch_add(used_count, -1); - } - // mark the current node as deleted - volatile uint64_t * curr_atomic_ptr = &m_data.get_node(curr_index).atomic.value; - uint64_t curr_atomic = *curr_atomic_ptr; - while ( node_atomic::state(curr_atomic) == Impl::UnorderedMap::USED) { - uint64_t new_atomic = node_atomic::make_atomic( node_atomic::next(curr_atomic), Impl::UnorderedMap::PENDING_DELETE); - curr_atomic = atomic_compare_exchange(curr_atomic_ptr,curr_atomic,new_atomic); - } - return; - } - else { - // key does not exist - // insert a node with the given key marked as deleted - if (node_index == node_atomic::invalid_next) { - node_index = find_unused_node(hash_value); - if (node_index == node_atomic::invalid_next) { - return; - } - } - - // this thread has unique control of the node - // so can construct the value and set up the state and next index - node_type & n = m_data.get_node(node_index); - n.destruct_value(); - n.construct_value(value_type(k,insert_mapped_type())); - n.atomic = node_atomic::make_atomic( curr_index, Impl::UnorderedMap::PENDING_DELETE); - - uint64_t new_atomic = node_atomic::make_atomic( node_index, node_atomic::state(prev)); - -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - - const bool ok = atomic_compare_exchange_strong( prev_atomic, prev, new_atomic); - if ( ok ) { - return; - } - } - // insert failed -- find correct insertion point again - find_previous(k,prev_atomic,prev,curr_equal,curr_index); - } while (true); - } - - // TODO protect with enable_if - KOKKOS_INLINE_FUNCTION - void mark_pending_delete( const_pointer p ) const - { - if (p) mark_pending_delete(p->first); - } - - - /// \brief Find the given key \c k, if it exists in the table. - /// - /// \return If the key exists in the table, a (raw) pointer to the - /// value corresponding to that key; otherwise, \c NULL. - /// - /// This is a device function; it may be called in a parallel - /// kernel. - KOKKOS_INLINE_FUNCTION - pointer find( const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - const uint32_t node_index = m_data.find_node_index(k); - return (node_index != node_atomic::invalid_next) ? &m_data.get_node(node_index).value : NULL; - } - - /// \brief Get a pointer to the value with \c i as its direct index. - /// - /// \warning This method is only for expert users. - /// - /// \param i [in] Index directly into the array of entries. - /// - /// \return If the entry exists in the table, a (raw) pointer to the - /// value; otherwise, \c NULL. 
- /// - /// This is a device function; it may be called in a parallel - /// kernel. - KOKKOS_INLINE_FUNCTION - pointer get_value(uint64_t i) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - // add one to pass 0th node - const bool valid_range = i < m_data.capacity(); - const bool used_node = node_atomic::state(m_data.get_node(i).atomic) == Impl::UnorderedMap::USED; - - return valid_range && used_node ? &m_data.get_node(i).value : NULL; - } - -private: // private member functions - - KOKKOS_INLINE_FUNCTION - uint32_t find_unused_node(uint32_t hash_value) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - const uint32_t num_blocks = m_data.node_blocks.size(); - const uint32_t start_block = hash_value % num_blocks; - const uint32_t end_block = start_block + num_blocks; - - if (m_data.no_failed_inserts()) { - - for (uint32_t i = start_block; i < end_block; ++i) { - if (!m_data.no_failed_inserts()) break; - - const uint32_t block = i % num_blocks; - volatile int * used_count = &m_data.node_blocks[block].used_count; - int count = * used_count; - if (static_cast(count) < node_block_type::size) { - //stores the old value into count - const int old_count = atomic_fetch_add(used_count, 1); - if (static_cast(old_count) < node_block_type::size) { - //claimed a node in this block keep looping block utill successful at claming a node - for (uint32_t start_node = (hash_value & node_block_type::mask); true; ++start_node) { - if (!m_data.no_failed_inserts()) break; - const uint32_t n = (block*node_block_type::size) + (start_node & node_block_type::mask); - volatile uint64_t * atomic = &m_data.get_node(n).atomic.value; - uint64_t value = *atomic; - if ( (node_atomic::state(value) == Impl::UnorderedMap::UNUSED) - && atomic_compare_exchange_strong(atomic, value, node_atomic::make_atomic(node_atomic::invalid_next,Impl::UnorderedMap::PENDING_INSERT)) ) - { - return n; - } - } - } - else { - //unable to claim a node from this block - atomic_fetch_add(used_count, -1); - } - } - } - // unable to get a free node -- insert failed - m_data.set_failed_insert(); - } - // count the failed insert - volatile int * failed_inserts = &m_data.node_blocks[start_block].failed_inserts; - atomic_fetch_add(failed_inserts, 1); - return node_atomic::invalid_next; - } - - KOKKOS_INLINE_FUNCTION - void find_previous(const key_type & k, volatile uint64_t *& prev_atomic, uint64_t & prev, bool &curr_equal, uint32_t & curr_index) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - curr_equal = false; - do { - prev = *prev_atomic; - curr_index = node_atomic::next(prev); - const bool curr_invalid = curr_index == node_atomic::invalid_next; - - if (curr_invalid) break; - - // global read of the key - volatile const key_type * const key_ptr = &m_data.get_node(curr_index).value.first; - const key_type curr_key = *key_ptr; - - const bool curr_less = m_data.key_compare( curr_key, k); - const bool curr_greater = m_data.key_compare( k, curr_key); - curr_equal = !curr_less && !curr_greater; - - if (!curr_less) break; - - prev_atomic = & m_data.get_node(node_atomic::next(prev)).atomic.value; - } while (true); - } - -private: // private members - map_data m_data; - - template - friend class UnorderedMap; - - template < class MapDst, class MapSrc > - friend void Impl::UnorderedMap::deep_copy_impl( MapDst & dst, const MapSrc & src); -}; - - -//! Specialization of UnorderedMap for const Key and nonconst value (T). 
-template < typename Key - , typename T - , typename Device - , typename Compare - , typename Hash - > -class UnorderedMap< const Key, T, Device, Compare, Hash> -{ -public: // public types and constants - typedef Impl::UnorderedMap::map_data map_data; - typedef Impl::UnorderedMap::node_atomic node_atomic; - - typedef typename map_data::device_type device_type; - typedef typename map_data::compare_type compare_type; - typedef typename map_data::hash_type hash_type; - typedef typename map_data::key_type key_type; - typedef typename map_data::mapped_type mapped_type; - typedef typename map_data::value_type value_type; - typedef typename map_data::pointer pointer; - typedef typename map_data::const_pointer const_pointer; - typedef typename map_data::node_type node_type; - typedef typename map_data::size_type size_type; - - typedef UnorderedMap HostMirror; - -public: //public member functions - - UnorderedMap() - : m_data() - {} - - template - UnorderedMap( const UMap & umap ) - : m_data( umap.m_data ) - {} - - void clear() - { - m_data = map_data(0, m_data.key_compare, m_data.key_hash); - } - - void shrink_to_fit() - { reserve(0); } - - void reserve(unsigned new_capacity) - { - const uint32_t curr_size = size(); - new_capacity = new_capacity < curr_size ? curr_size : new_capacity; - - UnorderedMap - tmp(new_capacity, m_data.key_compare, m_data.key_hash); - - if (new_capacity > 0u && failed_inserts() == 0u ) { - Impl::UnorderedMap::copy_map(tmp,*this); - } - *this = tmp; - } - - void check_sanity() const - { m_data.check_sanity(); } - - uint32_t size() const - { return m_data.size(); } - - uint32_t unused() const - { return m_data.unused(); } - - uint32_t pending_delete() const - { return m_data.pending_delete(); } - - uint32_t failed_inserts() const - { return m_data.failed_inserts(); } - - KOKKOS_INLINE_FUNCTION - uint32_t capacity() const - { return m_data.capacity(); } - - KOKKOS_INLINE_FUNCTION - uint32_t hash_capacity() const - { return m_data.hash_capacity(); } - - void remove_pending_delete() const - { return m_data.remove_pending_delete_keys(); } - - KOKKOS_INLINE_FUNCTION - pointer find( const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - const uint32_t node_index = m_data.find_node_index(k); - return (node_index != node_atomic::invalid_next) ? &m_data.get_node(node_index).value : NULL; - } - - KOKKOS_INLINE_FUNCTION - pointer get_value(uint64_t i) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - // add one to pass 0th node - const bool valid_range = i < m_data.capacity(); - const bool used_node = node_atomic::state(m_data.get_node(i).atomic) == Impl::UnorderedMap::USED; - - return valid_range && used_node ? 
&m_data.get_node(i).value : NULL; - } - - - KOKKOS_INLINE_FUNCTION - void mark_pending_delete(const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - m_data.set_modified(); - - const uint32_t hash_value = m_data.key_hash(k); - const uint32_t hash_index = hash_value % m_data.hashes.size(); - - bool curr_equal = false; - uint32_t curr_index = node_atomic::invalid_next; - const volatile uint64_t * prev_atomic = & m_data.hashes[hash_index].value; - uint64_t prev = 0u; - - find_previous(k,prev_atomic,prev,curr_equal,curr_index); - - do { - if (curr_equal) { - // mark the current node as deleted - volatile uint64_t * curr_atomic_ptr = &m_data.get_node(curr_index).atomic.value; - uint64_t curr_atomic = *curr_atomic_ptr; - while ( node_atomic::state(curr_atomic) == Impl::UnorderedMap::USED) { - uint64_t new_atomic = node_atomic::make_atomic( node_atomic::next(curr_atomic), Impl::UnorderedMap::PENDING_DELETE); - curr_atomic = atomic_compare_exchange(curr_atomic_ptr,curr_atomic,new_atomic); - } - return; - } - } while (true); - } - - -private: - KOKKOS_INLINE_FUNCTION - void find_previous(const key_type & k, const volatile uint64_t *& prev_atomic, uint64_t & prev, bool &curr_equal, uint32_t & curr_index) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - curr_equal = false; - do { - prev = *prev_atomic; - curr_index = node_atomic::next(prev); - const bool curr_invalid = curr_index == node_atomic::invalid_next; - - if (curr_invalid) break; - - // global read of the key - volatile const key_type * const key_ptr = &m_data.get_node(curr_index).value.first; - const key_type curr_key = *key_ptr; - - const bool curr_less = m_data.key_compare( curr_key, k); - const bool curr_greater = m_data.key_compare( k, curr_key); - curr_equal = !curr_less && !curr_greater; - - if (!curr_less) break; - - prev_atomic = & m_data.get_node(node_atomic::next(prev)).atomic.value; - } while (true); - } - -private: // private members - map_data m_data; - - template - friend class UnorderedMap; - - template < class MapDst, class MapSrc > - friend void Impl::UnorderedMap::deep_copy_impl( MapDst & dst, const MapSrc & src); -}; - - -//! Specialization of UnorderedMap for const Key and const value (T). -template < typename Key - , typename T - , typename Device - , typename Compare - , typename Hash - > -class UnorderedMap< const Key, const T, Device, Compare, Hash> -{ -public: // public types and constants - typedef Impl::UnorderedMap::map_data map_data; - typedef Impl::UnorderedMap::node_atomic node_atomic; - - typedef typename map_data::device_type device_type; - typedef typename map_data::compare_type compare_type; - typedef typename map_data::hash_type hash_type; - typedef typename map_data::key_type key_type; - typedef typename map_data::mapped_type mapped_type; - typedef typename map_data::value_type value_type; - typedef typename map_data::pointer pointer; - typedef typename map_data::const_pointer const_pointer; - typedef typename map_data::node_type node_type; - typedef typename map_data::size_type size_type; - - typedef UnorderedMap HostMirror; - -public: //public member functions - - UnorderedMap() - : m_data() - {} - - template - UnorderedMap( const UMap & umap ) - : m_data( umap.m_data ) - {} - - void clear() - { - m_data = map_data(0, m_data.key_compare, m_data.key_hash); - } - - void shrink_to_fit() - { reserve(0); } - - void reserve(unsigned new_capacity) - { - const uint32_t curr_size = size(); - new_capacity = new_capacity < curr_size ? 
curr_size : new_capacity; - - UnorderedMap - tmp(new_capacity, m_data.key_compare, m_data.key_hash); - - if (new_capacity > 0u && failed_inserts() == 0u ) { - Impl::UnorderedMap::copy_map(tmp,*this); - } - *this = tmp; - } - - void check_sanity() const - { m_data.check_sanity(); } - - uint32_t size() const - { return m_data.size(); } - - uint32_t unused() const - { return m_data.unused(); } - - uint32_t pending_delete() const - { return m_data.pending_delete(); } - - uint32_t failed_inserts() const - { return m_data.failed_inserts(); } - - KOKKOS_INLINE_FUNCTION - uint32_t capacity() const - { return m_data.capacity(); } - - KOKKOS_INLINE_FUNCTION - uint32_t hash_capacity() const - { return m_data.hash_capacity(); } - - void remove_pending_delete() const - { return m_data.remove_pending_delete_keys(); } - - KOKKOS_INLINE_FUNCTION - const_pointer find( const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - const uint32_t node_index = m_data.find_node_index(k); - return (node_index != node_atomic::invalid_next) ? &m_data.get_node(node_index).value : NULL; - } - - KOKKOS_INLINE_FUNCTION - const_pointer get_value(uint64_t i) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - // add one to pass 0th node - const bool valid_range = i < m_data.capacity(); - const bool used_node = node_atomic::state(m_data.get_node(i).atomic) == Impl::UnorderedMap::USED; - - return valid_range && used_node ? &m_data.get_node(i).value : NULL; - } - -private: // private members - map_data m_data; - - template - friend class UnorderedMap; - - template < class MapDst, class MapSrc > - friend void Impl::UnorderedMap::deep_copy_impl( MapDst & dst, const MapSrc & src); -}; - - -//! Specialization of UnorderedMap for const Key and T=void ("set"). -template < typename Key - , typename Device - , typename Compare - , typename Hash - > -class UnorderedMap< const Key, void, Device, Compare, Hash> -{ -public: // public types and constants - typedef Impl::UnorderedMap::map_data map_data; - typedef Impl::UnorderedMap::node_atomic node_atomic; - - typedef typename map_data::device_type device_type; - typedef typename map_data::compare_type compare_type; - typedef typename map_data::hash_type hash_type; - typedef typename map_data::key_type key_type; - typedef typename map_data::mapped_type mapped_type; - typedef typename map_data::value_type value_type; - typedef typename map_data::pointer pointer; - typedef typename map_data::const_pointer const_pointer; - typedef typename map_data::node_type node_type; - typedef typename map_data::size_type size_type; - - typedef UnorderedMap HostMirror; - -public: //public member functions - - UnorderedMap() - : m_data() - {} - - template - UnorderedMap( const UMap & umap ) - : m_data( umap.m_data ) - {} - - void clear() - { - m_data = map_data(0, m_data.key_compare, m_data.key_hash); - } - - void shrink_to_fit() - { reserve(0); } - - void reserve(unsigned new_capacity) - { - const uint32_t curr_size = size(); - new_capacity = new_capacity < curr_size ? 
curr_size : new_capacity; - - UnorderedMap - tmp(new_capacity, m_data.key_compare, m_data.key_hash); - - if (new_capacity > 0u && failed_inserts() == 0u ) { - Impl::UnorderedMap::copy_map(tmp,*this); - } - *this = tmp; - } - - void check_sanity() const - { m_data.check_sanity(); } - - uint32_t size() const - { return m_data.size(); } - - uint32_t unused() const - { return m_data.unused(); } - - uint32_t pending_delete() const - { return m_data.pending_delete(); } - - uint32_t failed_inserts() const - { return m_data.failed_inserts(); } - - KOKKOS_INLINE_FUNCTION - uint32_t capacity() const - { return m_data.capacity(); } - - KOKKOS_INLINE_FUNCTION - uint32_t hash_capacity() const - { return m_data.hash_capacity(); } - - void remove_pending_delete() const - { return m_data.remove_pending_delete_keys(); } - - KOKKOS_INLINE_FUNCTION - const_pointer find( const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - const uint32_t node_index = m_data.find_node_index(k); - return (node_index != node_atomic::invalid_next) ? &m_data.get_node(node_index).value : NULL; - } - - KOKKOS_INLINE_FUNCTION - const_pointer get_value(uint64_t i) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - // add one to pass 0th node - const bool valid_range = i < m_data.capacity(); - const bool used_node = node_atomic::state(m_data.get_node(i).atomic) == Impl::UnorderedMap::USED; - - return valid_range && used_node ? &m_data.get_node(i).value : NULL; - } - -private: // private members - map_data m_data; - - template - friend class UnorderedMap; - - template < class MapDst, class MapSrc > - friend void Impl::UnorderedMap::deep_copy_impl( MapDst & dst, const MapSrc & src); -}; - - -} // namespace Kokkos - -#endif //KOKKOS_UNORDERED_MAP_HPP diff --git a/kokkos/kokkos/containers/src/Kokkos_Vector.hpp b/kokkos/kokkos/containers/src/Kokkos_Vector.hpp deleted file mode 100644 index cdcad2d..0000000 --- a/kokkos/kokkos/containers/src/Kokkos_Vector.hpp +++ /dev/null @@ -1,313 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_VECTOR_HPP -#define KOKKOS_VECTOR_HPP - -#include -#include - -/* Drop in replacement for std::vector based on Kokkos::DualView - * Most functions only work on the host (it will not compile if called from device kernel) - * - */ -#ifndef KOKKOS_HAVE_CUDA - #ifdef KOKKOS_HAVE_PTHREAD - #include - namespace Kokkos { - namespace Impl { - typedef Threads DefaultDeviceType; - } - } - #else - #ifdef KOKKOS_HAVE_OPENMP - #include - namespace Kokkos { - namespace Impl { - typedef OpenMP DefaultDeviceType; - } - } - #else - #ifdef KOKKOS_HAVE_SERIAL - #include - namespace Kokkos { - namespace Impl { - typedef Serial DefaultDeviceType; - } - } - #else - #error "No Kokkos Host Device defined" - #endif - #endif - #endif -#else - #include - namespace Kokkos { - namespace Impl { - typedef Cuda DefaultDeviceType; - } - } -#endif - namespace Kokkos { - -template -class vector : public DualView { -public: - typedef Device device_type; - typedef Scalar value_type; - typedef Scalar* pointer; - typedef const Scalar* const_pointer; - typedef Scalar* reference; - typedef const Scalar* const_reference; - typedef Scalar* iterator; - typedef const Scalar* const_iterator; - -private: - size_t _size; - typedef size_t size_type; - float _extra_storage; - typedef DualView DV; - - -public: - inline Scalar& operator() (int i) const {return DV::h_view(i);}; - inline Scalar& operator[] (int i) const {return DV::h_view(i);}; - - - /* Member functions which behave like std::vector functions */ - - vector():DV() { - _size = 0; - _extra_storage = 1.1; - DV::modified_host = 1; - }; - - - vector(int n, Scalar val=Scalar()):DualView("Vector",size_t(n*(1.1))) { - _size = n; - _extra_storage = 1.1; - DV::modified_host = 1; - - assign(n,val); - } - - - void resize(size_t n) { - if(n>=capacity()) - DV::resize(size_t (n*_extra_storage)); - _size = n; - } - - void resize(size_t n, const Scalar& val) { - assign(n,val); - } - - void assign (size_t n, const Scalar& val) { - - /* Resize if necessary (behavour of std:vector) */ - - if(n>capacity()) - DV::resize(size_t (n*_extra_storage)); - _size = n; - - /* Assign value either on host or on device */ - - if( DV::modified_host >= DV::modified_device ) { - set_functor_host f(DV::h_view,val); - parallel_for(n,f); - DV::t_host::device_type::fence(); - DV::modified_host++; - } else { - set_functor f(DV::d_view,val); - parallel_for(n,f); - DV::t_dev::device_type::fence(); - DV::modified_device++; - } - } - - void reserve(size_t n) { - DV::resize(size_t (n*_extra_storage)); - } - - void push_back(Scalar val) { - DV::modified_host++; - if(_size == capacity()) { - size_t new_size = _size*_extra_storage; - if(new_size == _size) new_size++; - DV::resize(new_size); - } - - DV::h_view(_size) = val; - _size++; - - }; - - void pop_back() { - _size--; - }; - - void clear() { - _size = 0; - } - 
- size_type size() const {return _size;}; - size_type max_size() const {return 2000000000;} - size_type capacity() const {return DV::capacity();}; - bool empty() const {return _size==0;}; - - iterator begin() const {return &DV::h_view(0);}; - - iterator end() const {return &DV::h_view(_size);}; - - - /* std::algorithms wich work originally with iterators, here they are implemented as member functions */ - - size_t lower_bound(const size_t &start, const size_t &end, const Scalar &comp_val) const { - - int lower = 0 > start ? 0 : start; - int upper = _size > end? end : _size-1; - if(upper<=lower) return end; - - - Scalar lower_val = DV::h_view(lower); - Scalar upper_val = DV::h_view(upper); - size_t idx = (upper+lower)/2; - Scalar val = DV::h_view(idx); - if(val>upper_val) return upper; - if(vallower) { - if(comp_val>val) { - lower = ++idx; - } else { - upper = idx; - } - idx = (upper+lower)/2; - val = DV::h_view(idx); - } - return idx; - } - - bool is_sorted() { - for(int i=0;i<_size-1;i++) { - if(DV::h_view(i)>DV::h_view(i+1)) return false; - } - return true; - } - - iterator find(Scalar val) const { - if(_size == 0) return end(); - - int upper,lower,current; - current = _size/2; - upper = _size-1; - lower = 0; - - if((valDV::h_view(_size-1)) ) return end(); - - while(upper>lower) - { - if(val>DV::h_view(current)) lower = current+1; - else upper = current; - current = (upper+lower)/2; - } - - if(val==DV::h_view(current)) return &DV::h_view(current); - else return end(); - } - - /* Additional functions for data management */ - - void device_to_host(){ - deep_copy(DV::h_view,DV::d_view); - } - void host_to_device() const { - deep_copy(DV::d_view,DV::h_view); - } - - void on_host() { - DV::modified_host = DV::modified_device + 1; - } - void on_device() { - DV::modified_device = DV::modified_host + 1; - } - - void set_overallocation(float extra) { - _extra_storage = 1.0 + extra; - } - - -public: - struct set_functor { - typedef typename DV::t_dev::device_type device_type; - typename DV::t_dev _data; - Scalar _val; - - set_functor(typename DV::t_dev data, Scalar val) : - _data(data),_val(val) {} - - KOKKOS_INLINE_FUNCTION - void operator() (const int &i) const { - _data(i) = _val; - } - }; - - struct set_functor_host { - typedef typename DV::t_host::device_type device_type; - typename DV::t_host _data; - Scalar _val; - - set_functor_host(typename DV::t_host data, Scalar val) : - _data(data),_val(val) {} - - KOKKOS_INLINE_FUNCTION - void operator() (const int &i) const { - _data(i) = _val; - } - }; - -}; -} -#endif diff --git a/kokkos/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/kokkos/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp deleted file mode 100644 index 15f2f0e..0000000 --- a/kokkos/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP -#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const StaticCrsGraph & view , - typename Impl::enable_if< ViewTraits::is_hostspace >::type * = 0 ) -{ - return view ; -} - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror( const StaticCrsGraph & view ) -{ - // Force copy: - typedef Impl::ViewAssignment< Impl::LayoutDefault > alloc ; - typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type ; - - typename staticcrsgraph_type::HostMirror tmp ; - typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map); - - // Allocation to match: - tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const' - tmp.entries = create_mirror( view.entries ); - - - // Deep copy: - deep_copy( tmp_row_map , view.row_map ); - deep_copy( tmp.entries , view.entries ); - - return tmp ; -} - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const StaticCrsGraph & view , - typename Impl::enable_if< ! 
ViewTraits::is_hostspace >::type * = 0 ) -{ - return create_mirror( view ); -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class StaticCrsGraphType , class InputSizeType > -inline -typename StaticCrsGraphType::staticcrsgraph_type -create_staticcrsgraph( const std::string & label , - const std::vector< InputSizeType > & input ) -{ - typedef StaticCrsGraphType output_type ; - typedef std::vector< InputSizeType > input_type ; - - typedef typename output_type::entries_type entries_type ; - - typedef View< typename output_type::size_type [] , - typename output_type::array_layout , - typename output_type::device_type > work_type ; - - output_type output ; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work( "tmp" , length + 1 ); - - typename work_type::HostMirror row_work_host = - create_mirror_view( row_work ); - - size_t sum = 0 ; - row_work_host[0] = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - row_work_host[i+1] = sum += input[i]; - } - - deep_copy( row_work , row_work_host ); - - output.entries = entries_type( label , sum ); - output.row_map = row_work ; - } - - return output ; -} - -//---------------------------------------------------------------------------- - -template< class StaticCrsGraphType , class InputSizeType > -inline -typename StaticCrsGraphType::staticcrsgraph_type -create_staticcrsgraph( const std::string & label , - const std::vector< std::vector< InputSizeType > > & input ) -{ - typedef StaticCrsGraphType output_type ; - typedef std::vector< std::vector< InputSizeType > > input_type ; - typedef typename output_type::entries_type entries_type ; - typedef typename output_type::size_type size_type ; - - typedef typename - Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type - ok_rank ; - - typedef View< typename output_type::size_type [] , - typename output_type::array_layout , - typename output_type::device_type > work_type ; - - output_type output ; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work( "tmp" , length + 1 ); - - typename work_type::HostMirror row_work_host = - create_mirror_view( row_work ); - - size_t sum = 0 ; - row_work_host[0] = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - row_work_host[i+1] = sum += input[i].size(); - } - - deep_copy( row_work , row_work_host ); - - output.entries = entries_type( label , sum ); - output.row_map = row_work ; - } - - // Fill in the entries: - { - typename entries_type::HostMirror host_entries = - create_mirror_view( output.entries ); - - size_t sum = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) { - host_entries( sum ) = input[i][j] ; - } - } - - deep_copy( output.entries , host_entries ); - } - - return output ; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ - diff --git a/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp deleted file mode 100644 index 651f66c..0000000 --- a/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* -//@HEADER -// 
************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include - -namespace Kokkos { namespace Impl { namespace UnorderedMap { - -uint32_t find_hash_size(uint32_t size) -{ - if (size == 0u) return 0u; - - // these primes try to preserve randomness of hash - static const uint32_t primes [] = { - 3, 7, 13, 23, 53, 97, 193, 389, 769, 1543 - , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539 - , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827 - , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289 - , 12967 , 13649 , 14341 , 15013 , 15727 - , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329 - , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439 - , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619 - , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963 - , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579 - , 201653 , 211741 , 221813 , 231893 , 241979 , 252079 - , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457 - , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609 - , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239 - , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869 - , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253 - , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739 - , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503 - , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469 - , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033 - , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729 - , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861 - , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661 - , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529 - , 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327 - , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099 - , 55924061 , 58161041 , 60397993 , 62634959 , 64871921 - , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427 - , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971 - , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141 - , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237 - , 232644089 , 241591943 , 250539763 , 259487603 , 268435399 - }; - - - const size_t num_primes = sizeof(primes)/sizeof(uint32_t); - - uint32_t hsize = primes[num_primes-1] ; - for (size_t i = 0; i < num_primes; ++i) { - if (size <= primes[i]) { - hsize = primes[i]; - break; - } - } - return hsize; -} - -}}} // namespace Kokkos::Impl::UnorderedMap - diff --git a/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp deleted file mode 100644 index c079a1d..0000000 --- a/kokkos/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ /dev/null @@ -1,818 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP -#define KOKKOS_UNORDERED_MAP_IMPL_HPP - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace Kokkos { namespace Impl { namespace UnorderedMap { - -uint32_t find_hash_size( uint32_t size ); - -enum node_state -{ - UNUSED // not used in a list - , USED // used in a list - , PENDING_INSERT // not used in a list, but reserved by a thread for inserting - , PENDING_DELETE // node in the list is marked deleted - , INVALID // the 0th node in the node view is set to invalid -}; - -struct node_atomic -{ - static const uint64_t word_mask = 0x00000000FFFFFFFFu; - static const uint64_t word_shift = 32u; - static const uint32_t invalid_next = 0xFFFFFFFFu; - - KOKKOS_FORCEINLINE_FUNCTION - static uint32_t next(uint64_t v) - { return static_cast(v & word_mask); } - - KOKKOS_FORCEINLINE_FUNCTION - static node_state state(uint64_t v) - { return static_cast((v >> word_shift)); } - - KOKKOS_FORCEINLINE_FUNCTION - static uint64_t make_atomic( uint32_t n, node_state s) - { return (static_cast(s) << word_shift) | static_cast(n); } - - KOKKOS_FORCEINLINE_FUNCTION - node_atomic(uint64_t v = make_atomic(invalid_next, UNUSED) ) - : value(v) - {} - - KOKKOS_FORCEINLINE_FUNCTION - operator uint64_t() const - { return value; } - - uint64_t value; -}; - -template -struct Align16 -{ - static const size_t value = (Size & 15ull); -}; - -template ::value > -struct node -{ - typedef ValueType value_type; - - // contruct a new value at the current node - KOKKOS_FORCEINLINE_FUNCTION - void construct_value( const value_type & v ) - { new (&value) value_type(v); } - - // destruct the value at the current node - KOKKOS_FORCEINLINE_FUNCTION - void destruct_value() - { value.~value_type(); } - - value_type value; - uint8_t pad[AlignPad]; - node_atomic atomic; -}; - -template -struct node -{ 
- typedef ValueType value_type; - - // contruct a new value at the current node - KOKKOS_FORCEINLINE_FUNCTION - void construct_value( const value_type & v ) - { new (&value) value_type(v); } - - // destruct the value at the current node - KOKKOS_FORCEINLINE_FUNCTION - void destruct_value() - { value.~value_type(); } - - value_type value; - node_atomic atomic; -}; - -template -struct node_block -{ - typedef Node node_type; - typedef typename StaticAssert<(sizeof(node_type) % 16u == 0u)>::type node_okay; - - static const uint32_t shift = 5; - static const uint32_t size = 1u << shift; - static const uint32_t mask = size - 1u; - - KOKKOS_FORCEINLINE_FUNCTION - node_block() - : used_count(0) - , failed_inserts(0) - , pad(0) - , nodes() - {} - - int32_t used_count; - int32_t failed_inserts; - uint64_t pad; - node_type nodes[size]; -}; - -struct hash_list_sanity_type -{ - KOKKOS_INLINE_FUNCTION - hash_list_sanity_type() - : duplicate_keys_errors(0) - , unordered_list_errors(0) - , incorrect_hash_index_errors(0) - {} - - uint32_t duplicate_keys_errors; - uint32_t unordered_list_errors; - uint32_t incorrect_hash_index_errors; -}; - -struct node_state_counts -{ - KOKKOS_INLINE_FUNCTION - node_state_counts() - : in_sync(true) - , no_failed_inserts(true) - , unused(0) - , used_count(0) - , used(0) - , pending_insert(0) - , pending_delete(0) - , invalid(0) - , failed_inserts(0) - {} - - bool in_sync; - bool no_failed_inserts; - uint32_t unused; - uint32_t used_count; - uint32_t used; - uint32_t pending_insert; - uint32_t pending_delete; - uint32_t invalid; - uint32_t failed_inserts; -}; - - -template -struct sync_node_states_functor -{ - typedef typename MapData::device_type device_type; - typedef typename device_type::size_type size_type; - typedef typename MapData::node_block_type node_block_type; - typedef typename MapData::node_type node_type; - - typedef node_state_counts value_type; - - MapData map; - - sync_node_states_functor(MapData arg_map) - : map(arg_map) - { - parallel_reduce( map.capacity(), *this); - } - - KOKKOS_INLINE_FUNCTION - static void init( value_type & dst) - { - dst = value_type(); - } - - KOKKOS_INLINE_FUNCTION - static void join( volatile value_type & dst, const volatile value_type & src) - { - dst.unused += src.unused; - dst.used_count += src.used_count; - dst.used += src.used; - dst.pending_insert += src.pending_insert; - dst.pending_delete += src.pending_delete; - dst.invalid += src.invalid; - dst.failed_inserts += src.failed_inserts; - } - - KOKKOS_INLINE_FUNCTION - void final( value_type & result ) const - { - result.in_sync = true; - result.no_failed_inserts = map.counts().no_failed_inserts; - - map.counts = result; - } - - KOKKOS_INLINE_FUNCTION - void operator()( size_type i, value_type & dst) const - { - // count block properties - if ((i%node_block_type::size) == 0u) { - dst.used_count += map.node_blocks[i>>node_block_type::shift].used_count; - dst.failed_inserts += map.node_blocks[i>>node_block_type::shift].failed_inserts; - } - - const node_state state = node_atomic::state(map.get_node(i).atomic); - - if (state == UNUSED) - ++dst.unused; - else if (state == USED) - ++dst.used; - else if (state == PENDING_INSERT) - ++dst.pending_insert; - else if (state == PENDING_DELETE) - ++dst.pending_delete; - else - ++dst.invalid; - } -}; - - -template -struct check_hash_list_functor -{ - typedef typename MapData::device_type device_type; - typedef typename device_type::size_type size_type; - typedef typename MapData::node_type node_type; - typedef hash_list_sanity_type 
value_type; - - MapData map; - - check_hash_list_functor(MapData arg_map, value_type & value) - : map(arg_map) - { - parallel_reduce( map.hashes.size(), *this, value); - } - - KOKKOS_INLINE_FUNCTION - static void init( value_type & dst) - { - dst.duplicate_keys_errors = 0; - dst.unordered_list_errors = 0; - dst.incorrect_hash_index_errors = 0; - } - - KOKKOS_INLINE_FUNCTION - static void join( volatile value_type & dst, const volatile value_type & src) - { - dst.duplicate_keys_errors += src.duplicate_keys_errors; - dst.unordered_list_errors += src.unordered_list_errors; - dst.incorrect_hash_index_errors += src.incorrect_hash_index_errors; - } - - KOKKOS_INLINE_FUNCTION - void operator()( size_type i, value_type & errors) const - { - const uint64_t * prev_atomic = &map.hashes[i].value; - - uint32_t incorrect_hash_index_errors = 0; - uint32_t duplicate_keys_errors = 0; - uint32_t unordered_list_errors = 0; - - //traverse the list - while ( node_atomic::next(*prev_atomic) != node_atomic::invalid_next) { - const uint64_t * curr_atomic = &map.get_node(node_atomic::next(*prev_atomic)).atomic.value; - - const uint32_t curr_index = node_atomic::next(*prev_atomic); - const uint32_t next_index = node_atomic::next(*curr_atomic); - - //check that the key hashes to this index - const uint32_t hash_value = map.key_hash(map.get_node(curr_index).value.first); - const uint32_t hash_index = hash_value%map.hashes.size(); - - if ( static_cast(i) != hash_index) { - ++incorrect_hash_index_errors; - } - - if (next_index != node_atomic::invalid_next) { - //check that the list is ordered and has no duplicates - const bool key_less = map.key_compare( map.get_node(curr_index).value.first, map.get_node(next_index).value.first ); - const bool key_greater = map.key_compare( map.get_node(next_index).value.first, map.get_node(curr_index).value.first ); - const bool key_equal = !key_less && !key_greater; - - if (key_equal) { - ++duplicate_keys_errors; - } - else if (key_greater) { - ++unordered_list_errors; - } - } - - prev_atomic = curr_atomic; - } - - errors.incorrect_hash_index_errors += incorrect_hash_index_errors; - errors.duplicate_keys_errors += duplicate_keys_errors; - errors.unordered_list_errors += unordered_list_errors; - } -}; - -template -struct remove_pending_delete_keys_functor -{ - typedef typename MapData::device_type device_type; - typedef typename device_type::size_type size_type; - typedef typename MapData::node_type node_type; - typedef typename MapData::node_block_type node_block_type; - - node_block_type * node_blocks; - node_atomic * hashes; - node_state_counts * counts; - - remove_pending_delete_keys_functor( MapData arg_map ) - : node_blocks( const_cast(arg_map.node_blocks.ptr_on_device()) ) - , hashes( const_cast(arg_map.hashes.ptr_on_device()) ) - , counts( const_cast(arg_map.counts.ptr_on_device()) ) - { - parallel_for( arg_map.hashes.size(), *this); - device_type::fence(); - } - - KOKKOS_FORCEINLINE_FUNCTION - node_type & get_node(uint32_t i) const - { - return node_blocks[i>>node_block_type::shift].nodes[i&node_block_type::mask]; - } - - KOKKOS_INLINE_FUNCTION - void operator()( size_type i) const - { - if (i == static_cast(0)) { - counts->in_sync = false; - } - - uint64_t * prev_atomic = &hashes[i].value; - - while (node_atomic::next(*prev_atomic) != node_atomic::invalid_next) { - uint64_t * curr_atomic = &get_node( node_atomic::next(*prev_atomic)).atomic.value; - uint64_t prev = *prev_atomic; - uint64_t curr = *curr_atomic; - if (node_atomic::state(curr) == PENDING_DELETE) { - const 
uint32_t curr_index = node_atomic::next(prev); - const uint32_t curr_block = curr_index >> node_block_type::shift; - - //remove the node - *prev_atomic = node_atomic::make_atomic( node_atomic::next(curr), node_atomic::state(prev) ); - *curr_atomic = node_atomic::make_atomic( node_atomic::invalid_next, UNUSED ); - volatile int * used_count = &node_blocks[curr_block].used_count; - atomic_fetch_add(used_count, -1); - } - else { - prev_atomic = curr_atomic; - } - } - } -}; - -template -struct map_data -{ - typedef map_data self_type; - - typedef typename remove_const::type key_type; - typedef typename add_const::type const_key_type; - - typedef typename remove_const::type mapped_type; - typedef typename add_const::type const_mapped_type; - - typedef Device device_type; - typedef Compare compare_type; - typedef Hash hash_type; - - typedef map_data< key_type, mapped_type, Device, Compare, Hash> insertable_map_type; - typedef map_data< const_key_type, mapped_type, Device, Compare, Hash> modifiable_map_type; - typedef map_data< const_key_type, const_mapped_type, Device, Compare, Hash> const_map_type; - - static const bool has_const_key_type = is_const::value; - static const bool has_void_mapped_type = is_same::value; - static const bool has_const_mapped_type = has_void_mapped_type || is_const::value; - static const bool is_const_map = has_const_key_type && has_const_mapped_type; - - - typedef pair value_type; - - typedef typename if_c< is_const_map, value_type const *, value_type *>::type pointer; - typedef value_type const * const_pointer; - - typedef node node_type; - typedef node_block node_block_type; - - - typedef uint32_t size_type; - - typedef typename if_c< has_const_key_type - , View< const node_atomic *, device_type, MemoryTraits > - , View< node_atomic *, device_type > - >::type hash_view; - - typedef typename if_c< is_const_map - , View< const node_block_type *, device_type, MemoryTraits > - , View< node_block_type *, device_type > - >::type node_block_view; - - - typedef View< node_state_counts, device_type > counts_view; - - map_data() - : node_blocks() - , hashes() - , counts() - , key_compare() - , key_hash() - {} - - map_data( uint32_t num_nodes - , compare_type compare - , hash_type hash - ) - : node_blocks("UnorderedMap_nodes", (static_cast((num_nodes+node_block_type::size-1u)/node_block_type::size))) - , hashes("UnorderedMap_hashes", find_hash_size(capacity()) ) - , counts("UnorderedMap_counts") - , key_compare(compare) - , key_hash(hash) - {} - - template - KOKKOS_INLINE_FUNCTION - map_data( const MMapType & m) - : node_blocks(m.node_blocks) - , hashes(m.hashes) - , counts(m.counts) - , key_compare(m.key_compare) - , key_hash(m.key_hash) - {} - - template - KOKKOS_INLINE_FUNCTION - map_data & operator=( const MMapType & m) - { - node_blocks = m.node_blocks; - hashes = m.hashes; - counts = m.counts; - key_compare = m.key_compare; - key_hash = m.key_hash; - - return *this; - } - - KOKKOS_INLINE_FUNCTION - uint32_t capacity() const - { - return node_blocks.size() * node_block_type::size; - } - - KOKKOS_INLINE_FUNCTION - uint32_t hash_capacity() const - { - return static_cast(hashes.size()); - } - - bool in_sync() const - { - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - bool result = false; - deep_copy(&result, &counts.ptr_on_device()->in_sync, sizeof(bool) ); - return result; - } - - void sync_node_states() const - { - if (!in_sync()) { - sync_node_states_functor(*this); - device_type::fence(); - } - } - - uint32_t size() 
const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->used, sizeof(uint32_t) ); - return result; - } - - uint32_t unused() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->unused, sizeof(uint32_t) ); - return result; - } - - uint32_t pending_insert() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->pending_insert, sizeof(uint32_t) ); - return result; - } - - uint32_t pending_delete() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->pending_delete, sizeof(uint32_t) ); - return result; - } - - uint32_t failed_inserts() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->failed_inserts, sizeof(uint32_t) ); - return result; - } - - uint32_t used_count() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->used_count, sizeof(uint32_t) ); - return result; - } - - uint32_t invalid_count() const - { - sync_node_states(); - typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > deep_copy; - uint32_t result = 0; - deep_copy(&result, &counts.ptr_on_device()->invalid, sizeof(uint32_t) ); - return result; - } - - hash_list_sanity_type check_hash_sanity() const - { - hash_list_sanity_type result; - check_hash_list_functor(*this, result); - device_type::fence(); - return result; - } - - void check_sanity() const - { - sync_node_states(); - - hash_list_sanity_type list_check; - - check_hash_list_functor(*this, list_check); - - device_type::fence(); - - std::ostringstream out; - - int total_errors = 0; - - if (failed_inserts() > 0u) { - out << "Error: " << failed_inserts() << " failed insertions\n"; - total_errors += failed_inserts(); - } - - if (list_check.duplicate_keys_errors > 0u) { - out << "Error: found " << list_check.duplicate_keys_errors << " duplicate keys found in lists\n"; - ++total_errors; - } - - if (list_check.unordered_list_errors > 0u) { - out << "Error: found " << list_check.unordered_list_errors << " unsorted lists\n"; - ++total_errors; - } - - if (list_check.incorrect_hash_index_errors > 0u) { - out << "Error: found " << list_check.incorrect_hash_index_errors << " keys incorrectly hashed\n"; - ++total_errors; - } - - if (invalid_count() > 0u) { - out << "Error: found " << invalid_count() << " invalid nodes \n"; - ++total_errors; - } - - if (pending_insert() > 0u) { - out << "Error: found " << pending_insert() << " pending insert nodes (should always be 0)\n"; - ++total_errors; - } - - if (used_count() != size() + pending_delete()) { - out << "Error: used_count(" << used_count() << ") != size(" << size() << ") + pending_delete(" - << pending_delete() << ") = " << size() + pending_delete() << "\n"; - ++total_errors; - } - - if (total_errors > 0) { - out << "Total 
Errors: " << total_errors << std::endl; - throw std::runtime_error( out.str() ); - } - } - - void remove_pending_delete_keys() const - { - remove_pending_delete_keys_functor remove_keys(*this); - } - - KOKKOS_INLINE_FUNCTION - uint32_t find_node_index( const key_type & k) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - - const uint32_t hash_value = key_hash(k); - const uint32_t hash_index = hash_value % hashes.size(); - - uint64_t prev = hashes[hash_index]; - - uint32_t index = node_atomic::invalid_next; - do { - const uint32_t curr_index = node_atomic::next(prev); - - if ( curr_index != node_atomic::invalid_next ) { - const node_type & curr_node = get_node(curr_index); - const uint64_t curr = get_node(curr_index).atomic; - - const bool curr_greater = key_compare( k, curr_node.value.first); - const bool curr_less = key_compare( curr_node.value.first, k); - const bool curr_equal = !curr_less && !curr_greater; - - if (curr_greater) { - index = node_atomic::invalid_next; - break; - } else if (curr_equal) { - // return existing node - index = curr_index; - break; - } - else { - // Current is less -- advance to next node - prev = curr; - } - } - else { - break; - } - } while (true); - - return index; - } - - KOKKOS_FORCEINLINE_FUNCTION - typename if_c< is_const_map, const node_type, node_type>::type & get_node(uint32_t i) const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - return node_blocks[i>>node_block_type::shift].nodes[i&node_block_type::mask]; - } - - KOKKOS_FORCEINLINE_FUNCTION - void set_modified() const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - if (counts().in_sync) { - counts().in_sync = false; -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - } - } - - KOKKOS_FORCEINLINE_FUNCTION - bool no_failed_inserts() const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - return counts().no_failed_inserts; - } - - KOKKOS_FORCEINLINE_FUNCTION - void set_failed_insert() const - { - //KOKKOS_RESTRICT_EXECUTION_TO( typename Device::memory_space ); - if (counts().no_failed_inserts) { - counts().no_failed_inserts = false; -#if defined( __CUDA_ARCH__ ) - __threadfence(); -#endif - } - } - - // Data members - node_block_view node_blocks; - hash_view hashes; - counts_view counts; - compare_type key_compare; - hash_type key_hash; -}; - - -template < class MapDst, class MapSrc > -inline void deep_copy_impl( MapDst & dst, const MapSrc & src ) -{ - deep_copy_data_impl(dst.m_data, src.m_data); -} - -template < class MapDst, class MapSrc > -struct copy_map_functor -{ - typedef typename MapDst::device_type device_type; - typedef typename device_type::size_type size_type; - typedef typename MapDst::const_pointer const_pointer; - - - MapDst dst; - MapSrc src; - - copy_map_functor( const MapDst & arg_dst, const MapSrc & arg_src ) - : dst(arg_dst), src(arg_src) - { - parallel_for(src.capacity(), *this); - } - - KOKKOS_INLINE_FUNCTION - void operator()(size_type i) const - { - const_pointer ptr = src.get_value(i); - - if (ptr != NULL) { - dst.insert(ptr->first,ptr->second); - } - } -}; - -template < class MapDst, class MapSrc > -void copy_map(MapDst & dst, const MapSrc & src) -{ - copy_map_functor func(dst,src); -} - -template < class MapDst, class MapSrc > -inline void deep_copy_data_impl( MapDst & dst, const MapSrc & src ) -{ - typedef typename MapDst::node_block_type node_block_type; - typedef Kokkos::Impl::DeepCopy< typename MapDst::device_type::memory_space, typename 
MapSrc::device_type::memory_space > raw_deep_copy; - dst.node_blocks = typename MapDst::node_block_view("UnorderedMap_nodes", src.node_blocks.size()); - dst.hashes = typename MapDst::hash_view("UnorderedMap_hashes", src.hashes.size()); - - raw_deep_copy(const_cast(dst.node_blocks.ptr_on_device()), src.node_blocks.ptr_on_device(), sizeof(node_block_type) * src.node_blocks.size()); - raw_deep_copy(const_cast(dst.hashes.ptr_on_device()), src.hashes.ptr_on_device(), sizeof(node_atomic) * src.hashes.size()); - raw_deep_copy(const_cast(dst.counts.ptr_on_device()), src.counts.ptr_on_device(), sizeof(node_state_counts)); - - dst.key_compare = src.key_compare; - dst.key_hash = src.key_hash; -} - -}}} // namespace Kokkos::Impl::UnorderedMap - -#endif //KOKKOS_UNORDERED_MAP_IMPL_HPP - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp deleted file mode 100644 index 569f000..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp +++ /dev/null @@ -1,282 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDAEXEC_HPP -#define KOKKOS_CUDAEXEC_HPP - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -#if defined( __CUDACC__ ) - -namespace Kokkos { -namespace Impl { - -class CudaExec { -public: - - __device__ inline - CudaExec( const int shmem_begin , const int shmem_end ) - : m_shmem_end( shmem_end ) - , m_shmem_iter( shmem_begin ) - {} - - __device__ inline - void * get_shmem( const int size ) - { - extern __shared__ int sh[]; - - // m_shmem_iter is in bytes, convert to integer offsets - const int offset = m_shmem_iter >> power_of_two::value ; - - m_shmem_iter += size ; - - if ( m_shmem_end < m_shmem_iter ) { - cuda_abort("Cuda::get_shmem out of memory"); - } - - return sh + offset ; - } - -private: - - const int m_shmem_end ; - int m_shmem_iter ; -}; - -} // namespace Impl -} // namespace Kokkos - -#if defined( __CUDA_ARCH__ ) - -namespace Kokkos { - -inline __device__ -void * Cuda::get_shmem( const int size ) { return m_exec.get_shmem( size ); } - -} // namespace Kokkos - -#endif /* defined( __CUDA_ARCH__ ) */ -#endif /* defined( __CUDACC__ ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -struct CudaTraits { - enum { WarpSize = 32 /* 0x0020 */ }; - enum { WarpIndexMask = 0x001f /* Mask for warpindex */ }; - enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ }; - - enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ }; - enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ }; - enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ }; - - enum { UpperBoundGridCount = 65535 /* Hard upper bound */ }; - enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ }; - enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ }; - enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ }; - - typedef unsigned long - ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ]; - - enum { ConstantMemoryUseThreshold = 0x000100 /* 256 bytes */ }; - - KOKKOS_INLINE_FUNCTION static - CudaSpace::size_type warp_count( CudaSpace::size_type i ) - { return ( i + WarpIndexMask ) >> WarpIndexShift ; } - - KOKKOS_INLINE_FUNCTION static - CudaSpace::size_type warp_align( CudaSpace::size_type i ) - { - enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) }; - return ( i + WarpIndexMask ) & Mask ; - } -}; - -//---------------------------------------------------------------------------- - -CudaSpace::size_type cuda_internal_maximum_warp_count(); -CudaSpace::size_type cuda_internal_maximum_grid_count(); -CudaSpace::size_type cuda_internal_maximum_shared_words(); - -CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size ); -CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size ); -CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size ); - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) - -/** \brief Access to constant memory on the device */ -__device__ __constant__ -Kokkos::Impl::CudaTraits::ConstantGlobalBufferType 
-kokkos_impl_cuda_constant_memory_buffer ; - -template< typename T > -inline -__device__ -T * kokkos_impl_cuda_shared_memory() -{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; } - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -// See section B.17 of Cuda C Programming Guide Version 3.2 -// for discussion of -// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) -// function qualifier which could be used to improve performance. -//---------------------------------------------------------------------------- -// Maximize L1 cache and minimize shared memory: -// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 ); -// For 2.0 capability: 48 KB L1 and 16 KB shared -//---------------------------------------------------------------------------- - -template< class DriverType > -__global__ -static void cuda_parallel_launch_constant_memory() -{ - const DriverType & driver = - *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer ); - - driver(); -} - -template< class DriverType > -__global__ -static void cuda_parallel_launch_local_memory( const DriverType driver ) -{ - driver(); -} - -template < class DriverType , - bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > -struct CudaParallelLaunch ; - -template < class DriverType > -struct CudaParallelLaunch< DriverType , true > { - - inline - CudaParallelLaunch( const DriverType & driver , - const dim3 & grid , - const dim3 & block , - const int shmem ) - { - if ( grid.x && block.x ) { - - if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < - sizeof( DriverType ) ) { - Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); - } - - if ( CudaTraits::SharedMemoryCapacity < shmem ) { - Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); - } - else if ( shmem ) { - cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); - } else { - cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ); - } - - // Copy functor to constant memory on the device - cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); - - // Invoke the driver function on the device - cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem >>>(); - -#if defined( KOKKOS_EXPRESSION_CHECK ) - Kokkos::Cuda::fence(); -#endif - } - } -}; - -template < class DriverType > -struct CudaParallelLaunch< DriverType , false > { - - inline - CudaParallelLaunch( const DriverType & driver , - const dim3 & grid , - const dim3 & block , - const int shmem ) - { - if ( grid.x && block.x ) { - - if ( CudaTraits::SharedMemoryCapacity < shmem ) { - Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); - } - else if ( shmem ) { - cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ); - } else { - cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ); - } - - cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem >>>( driver ); - -#if defined( KOKKOS_EXPRESSION_CHECK ) - Kokkos::Cuda::fence(); -#endif - } - } -}; - -//---------------------------------------------------------------------------- - -} // 
namespace Impl -} // namespace Kokkos - -#endif /* defined( __CUDACC__ ) */ - -#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */ diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu b/kokkos/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu deleted file mode 100644 index 908ad0f..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_CudaSpace.cu +++ /dev/null @@ -1,300 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include - -#include -#include - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -DeepCopy - ::DeepCopy( void * dst , const void * src , size_t n ) -{ - CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); -} - -DeepCopy - ::DeepCopy( void * dst , const void * src , size_t n ) -{ - CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); -} - -DeepCopy - ::DeepCopy( void * dst , const void * src , size_t n ) -{ - CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); -} - -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace { - -class CudaMemoryTrackingEntry : public Impl::MemoryTrackingEntry -{ -public: - - void * const ptr_alloc ; - const unsigned size ; - const unsigned count ; - Impl::cuda_texture_object_type tex_obj ; - - CudaMemoryTrackingEntry( const std::string & arg_label , - const std::type_info & arg_info , - void * const arg_ptr , - const unsigned arg_size , - const unsigned arg_count ) - : Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size * arg_count ) - , ptr_alloc( arg_ptr ) - , size( arg_size ) - , count( arg_count ) - , tex_obj( 0 ) - {} - - ~CudaMemoryTrackingEntry(); -}; - -CudaMemoryTrackingEntry::~CudaMemoryTrackingEntry() -{ - cudaError_t sync_err = cudaDeviceSynchronize(); - - if ( tex_obj ) { - - } - - cudaError_t free_err = cudaFree( ptr_alloc ); - - if ( cudaSuccess != sync_err || cudaSuccess != free_err ) { - std::cerr << "cudaFree( " << ptr_alloc << " ) FAILED for " ; - Impl::MemoryTrackingEntry::print( std::cerr ); - } -} - -Impl::MemoryTracking & cuda_space_singleton() -{ - static Impl::MemoryTracking self("Kokkos::CudaSpace"); - return self ; -} - -} - -/*--------------------------------------------------------------------------*/ - -/*--------------------------------------------------------------------------*/ - -void * CudaSpace::allocate( - const std::string & label , - const std::type_info & scalar_type , - const size_t scalar_size , - const size_t scalar_count ) -{ - if ( HostSpace::in_parallel() ) { - Kokkos::Impl::throw_runtime_exception( "Kokkos::CudaSpace::allocate ERROR : Called with HostSpace::in_parallel" ); - } - - const size_t size = scalar_size * scalar_count ; - - void * ptr = 0 ; - - if ( 0 < scalar_size * scalar_count ) { - - try { - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - CUDA_SAFE_CALL( cudaMalloc( (void**) &ptr, size) ); - CUDA_SAFE_CALL( cudaThreadSynchronize() ); - } - catch( std::runtime_error & err) { - std::ostringstream msg ; - msg << "Kokkos::Impl::CudaSpace::allocate( " - << label - << " , " << scalar_type.name() - << " , " << scalar_size - << " , " << scalar_count - << " ) FAILED memory allocation\n" - << err.what(); - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - cuda_space_singleton().insert( - new CudaMemoryTrackingEntry( label , scalar_type , ptr , scalar_size , scalar_count ) ); - } - - return ptr ; -} - -void CudaSpace::increment( const void * ptr ) -{ - if ( HostSpace::in_parallel() ) { - Kokkos::Impl::throw_runtime_exception( "Kokkos::CudaSpace::increment ERROR : Called with HostSpace::in_parallel" ); 
- } - - cuda_space_singleton().increment( ptr ); -} - -void CudaSpace::decrement( const void * ptr ) -{ - if ( HostSpace::in_parallel() ) { - Kokkos::Impl::throw_runtime_exception( "Kokkos::CudaSpace::decrement ERROR : Called with HostSpace::in_parallel" ); - } - - cuda_space_singleton().decrement( ptr ); -} - -void CudaSpace::print_memory_view( std::ostream & o ) -{ - cuda_space_singleton().print( o , std::string(" ") ); -} - -//---------------------------------------------------------------------------- - -std::string CudaSpace::query_label( const void * p ) -{ - const Impl::MemoryTrackingEntry * entry = - cuda_space_singleton().query( p ); - - return entry ? entry->label : std::string("ERROR NOT FOUND"); -} - -void CudaSpace::access_error() -{ - const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" ); - - Kokkos::Impl::throw_runtime_exception( msg ); -} - -void CudaSpace::access_error( const void * const ptr ) -{ - std::ostringstream msg ; - msg << "Kokkos::CudaSpace::access_error:" ; - msg << " attempt to access Cuda-data labeled(" ; - msg << query_label( ptr ) ; - msg << ") from non-Cuda execution" ; - Kokkos::Impl::throw_runtime_exception( msg.str() ); -} - -/*--------------------------------------------------------------------------*/ - -} // namespace Kokkos - -#if defined( CUDA_VERSION ) && ( 500 <= CUDA_VERSION ) - -namespace Kokkos { -namespace Impl { - -::cudaTextureObject_t -cuda_texture_object_attach( - const cudaChannelFormatDesc & desc , - const void * const ptr ) -{ - if ( 0 == ptr ) return 0 ; - - const unsigned max_count = 1 << 28 ; - - CudaMemoryTrackingEntry * entry = - dynamic_cast( cuda_space_singleton().query( ptr ) ); - - const bool ok_found = 0 != entry ; - const bool ok_ptr = ok_found && ptr == entry->ptr_alloc ; - const bool ok_count = ok_found && entry->count < max_count ; - - if ( ok_found && ok_ptr && ok_count ) { - - // Can only create texture object on device architure 3.0 or better - - if ( 0 == entry->tex_obj && 300 <= Cuda::device_arch() ) { - - struct cudaResourceDesc resDesc ; - struct cudaTextureDesc texDesc ; - - memset( & resDesc , 0 , sizeof(resDesc) ); - memset( & texDesc , 0 , sizeof(texDesc) ); - - resDesc.resType = cudaResourceTypeLinear ; - resDesc.res.linear.desc = desc ; - resDesc.res.linear.sizeInBytes = entry->size * entry->count ; - resDesc.res.linear.devPtr = entry->ptr_alloc ; - - cudaCreateTextureObject( & entry->tex_obj, & resDesc, & texDesc, NULL); - } - } - else { - std::ostringstream msg ; - msg << "CudaSpace::texture_object_attach( " << ptr << " ) FAILED: " ; - - if ( ! ok_found ) { - msg << "Not View allocated" ; - } - else if ( ! ok_ptr ) { - msg << "Not the originally allocated View \"" << entry->label << "\"" ; - } - else if ( ! 
ok_count ) { - msg << "Cuda texture object limit exceeded " - << max_count << " <= " << entry->count ; - } - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - return entry->tex_obj ; -} - -} // namespace Impl -} // namespace Kokkos - -#endif - - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu deleted file mode 100644 index 6a7d0cd..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cu +++ /dev/null @@ -1,587 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/*--------------------------------------------------------------------------*/ -/* Kokkos interfaces */ - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ -/* Standard 'C' libraries */ -#include - -/* Standard 'C++' libraries */ -#include -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - - -void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line ) -{ - std::ostringstream out ; - out << name << " error: " << cudaGetErrorString(e); - if (file) { - out << " " << file << ":" << line; - } - throw_runtime_exception( out.str() ); -} - -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - - -namespace { - - - -class CudaInternalDevices { -public: - enum { MAXIMUM_DEVICE_COUNT = 8 }; - struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ; - int m_cudaDevCount ; - - CudaInternalDevices(); - - static const CudaInternalDevices & singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() -{ - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) ); - - for ( int i = 0 ; i < m_cudaDevCount ; ++i ) { - CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) ); - } -} - -const CudaInternalDevices & CudaInternalDevices::singleton() -{ - static CudaInternalDevices self ; return self ; -} - -} - -//---------------------------------------------------------------------------- - -class CudaInternal { -private: - - CudaInternal( const CudaInternal & ); - CudaInternal & operator = ( const CudaInternal & ); - -public: - - typedef Cuda::size_type size_type ; - - int m_cudaDev ; - unsigned m_maxWarpCount ; - unsigned m_maxBlock ; - unsigned m_maxSharedWords ; - size_type m_scratchSpaceCount ; - size_type 
m_scratchFlagsCount ; - size_type m_scratchUnifiedCount ; - size_type m_scratchUnifiedSupported ; - size_type * m_scratchSpace ; - size_type * m_scratchFlags ; - size_type * m_scratchUnified ; - - static CudaInternal & raw_singleton(); - static CudaInternal & singleton(); - - const CudaInternal & assert_initialized() const ; - - int is_initialized() const - { return 0 != m_scratchSpace && 0 != m_scratchFlags ; } - - void initialize( int cuda_device_id ); - void finalize(); - - void print_configuration( std::ostream & ) const ; - - ~CudaInternal(); - - CudaInternal() - : m_cudaDev( -1 ) - , m_maxWarpCount( 0 ) - , m_maxBlock( 0 ) - , m_maxSharedWords( 0 ) - , m_scratchSpaceCount( 0 ) - , m_scratchFlagsCount( 0 ) - , m_scratchUnifiedCount( 0 ) - , m_scratchUnifiedSupported( 0 ) - , m_scratchSpace( 0 ) - , m_scratchFlags( 0 ) - , m_scratchUnified( 0 ) - {} - - size_type * scratch_space( const size_type size ); - size_type * scratch_flags( const size_type size ); - size_type * scratch_unified( const size_type size ); -}; - -//---------------------------------------------------------------------------- - - -void CudaInternal::print_configuration( std::ostream & s ) const -{ - const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); - -#if defined( KOKKOS_HAVE_CUDA ) - s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ; -#endif -#if defined( KOKKOS_HAVE_CUDA_ARCH ) - s << "macro KOKKOS_HAVE_CUDA_ARCH = " << KOKKOS_HAVE_CUDA_ARCH - << " = capability " << KOKKOS_HAVE_CUDA_ARCH / 100 - << "." << ( KOKKOS_HAVE_CUDA_ARCH % 100 ) / 10 - << std::endl ; -#endif -#if defined( CUDA_VERSION ) - s << "macro CUDA_VERSION = " << CUDA_VERSION - << " = version " << CUDA_VERSION / 1000 - << "." << ( CUDA_VERSION % 1000 ) / 10 - << std::endl ; -#endif - - for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) { - s << "Kokkos::Cuda[ " << i << " ] " - << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor - << ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) - << ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); - if ( m_cudaDev == i ) s << " : Selected" ; - s << std::endl ; - } -} - -//---------------------------------------------------------------------------- - -CudaInternal::~CudaInternal() -{ - if ( m_scratchSpace || - m_scratchFlags || - m_scratchUnified ) { - std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" - << std::endl ; - std::cerr.flush(); - } -} - -CudaInternal & CudaInternal::raw_singleton() -{ static CudaInternal self ; return self ; } - -const CudaInternal & CudaInternal::assert_initialized() const -{ - if ( m_cudaDev == -1 ) { - const std::string msg("CATASTROPHIC FAILURE: Using Kokkos::Cuda before calling Kokkos::Cuda::initialize(...)"); - throw_runtime_exception( msg ); - } - return *this ; -} - -CudaInternal & CudaInternal::singleton() -{ - CudaInternal & s = raw_singleton(); - s.assert_initialized(); - return s ; -} - -void CudaInternal::initialize( int cuda_device_id ) -{ - enum { WordSize = sizeof(size_type) }; - - if ( ! 
Cuda::host_mirror_device_type::is_initialized() ) { - const std::string msg("Cuda::initialize ERROR : Cuda::host_mirror_device_type is not initialized"); - throw_runtime_exception( msg ); - } - - const CudaInternalDevices & dev_info = CudaInternalDevices::singleton(); - - const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ; - - const bool ok_id = 0 <= cuda_device_id && - cuda_device_id < dev_info.m_cudaDevCount ; - - // Need device capability 2.0 or better - - const bool ok_dev = ok_id && - ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major && - 0 <= dev_info.m_cudaProp[ cuda_device_id ].minor ); - - if ( ok_init && ok_dev ) { - - const struct cudaDeviceProp & cudaProp = - dev_info.m_cudaProp[ cuda_device_id ]; - - m_cudaDev = cuda_device_id ; - - CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) ); - CUDA_SAFE_CALL( cudaDeviceReset() ); - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - - // HCE 2012-February : - // Found bug in CUDA 4.1 that sometimes a kernel launch would fail - // if the thread count == 1024 and a functor is passed to the kernel. - // Copying the kernel to constant memory and then launching with - // thread count == 1024 would work fine. - // - // HCE 2012-October : - // All compute capabilities support at least 16 warps (512 threads). - // However, we have found that 8 warps typically gives better performance. - - m_maxWarpCount = 8 ; - - // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ; - - if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) { - m_maxWarpCount = Impl::CudaTraits::WarpSize ; - } - - m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ; - - //---------------------------------- - - m_maxBlock = cudaProp.maxGridSize[0] ; - - //---------------------------------- - - m_scratchUnifiedSupported = cudaProp.unifiedAddressing ; - - if ( ! m_scratchUnifiedSupported ) { - std::cout << "Kokkos::Cuda device " - << cudaProp.name << " capability " - << cudaProp.major << "." << cudaProp.minor - << " does not support unified virtual address space" - << std::endl ; - } - - //---------------------------------- - // Multiblock reduction uses scratch flags for counters - // and scratch space for partial reduction values. - // Allocate some initial space. This will grow as needed. - - { - const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ; - - (void) scratch_unified( 16 * sizeof(size_type) ); - (void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) ); - (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) ); - } - } - else { - - std::ostringstream msg ; - msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ; - - if ( ! ok_init ) { - msg << " : Already initialized" ; - } - if ( ! ok_id ) { - msg << " : Device identifier out of range " - << "[0.." << dev_info.m_cudaDevCount << "]" ; - } - else if ( ! ok_dev ) { - msg << " : Device " ; - msg << dev_info.m_cudaProp[ cuda_device_id ].major ; - msg << "." 
; - msg << dev_info.m_cudaProp[ cuda_device_id ].minor ; - msg << " has insufficient capability, required 2.0 or better" ; - } - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } -} - -//---------------------------------------------------------------------------- - -typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ; -enum { sizeScratchGrain = sizeof(ScratchGrain) }; - - -Cuda::size_type * -CudaInternal::scratch_flags( const Cuda::size_type size ) -{ - assert_initialized(); - - if ( m_scratchFlagsCount * sizeScratchGrain < size ) { - - Cuda::memory_space::decrement( m_scratchFlags ); - - m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; - - m_scratchFlags = (size_type *) - Cuda::memory_space::allocate( - std::string("InternalScratchFlags") , - typeid( ScratchGrain ), - sizeof( ScratchGrain ), - m_scratchFlagsCount ); - - CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) ); - } - - return m_scratchFlags ; -} - -Cuda::size_type * -CudaInternal::scratch_space( const Cuda::size_type size ) -{ - assert_initialized(); - - if ( m_scratchSpaceCount * sizeScratchGrain < size ) { - - Cuda::memory_space::decrement( m_scratchSpace ); - - m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; - - m_scratchSpace = (size_type *) - Cuda::memory_space::allocate( - std::string("InternalScratchSpace") , - typeid( ScratchGrain ), - sizeof( ScratchGrain ), - m_scratchSpaceCount ); - } - - return m_scratchSpace ; -} - -Cuda::size_type * -CudaInternal::scratch_unified( const Cuda::size_type size ) -{ - assert_initialized(); - - if ( m_scratchUnifiedSupported ) { - - const bool allocate = m_scratchUnifiedCount * sizeScratchGrain < size ; - const bool deallocate = m_scratchUnified && ( 0 == size || allocate ); - - if ( allocate || deallocate ) { - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); - } - - if ( deallocate ) { - - CUDA_SAFE_CALL( cudaFreeHost( m_scratchUnified ) ); - - m_scratchUnified = 0 ; - m_scratchUnifiedCount = 0 ; - } - - if ( allocate ) { - - m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ; - - CUDA_SAFE_CALL( cudaHostAlloc( (void **)( & m_scratchUnified ) , - m_scratchUnifiedCount * sizeScratchGrain , - cudaHostAllocDefault ) ); - } - } - - return m_scratchUnified ; -} - -//---------------------------------------------------------------------------- - -void CudaInternal::finalize() -{ - if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) { - - Cuda::memory_space::decrement( m_scratchSpace ); - Cuda::memory_space::decrement( m_scratchFlags ); - (void) scratch_unified( 0 ); - - m_cudaDev = -1 ; - m_maxWarpCount = 0 ; - m_maxBlock = 0 ; - m_maxSharedWords = 0 ; - m_scratchSpaceCount = 0 ; - m_scratchFlagsCount = 0 ; - m_scratchSpace = 0 ; - m_scratchFlags = 0 ; - } -} - -//---------------------------------------------------------------------------- - -Cuda::size_type cuda_internal_maximum_warp_count() -{ return CudaInternal::singleton().m_maxWarpCount ; } - -Cuda::size_type cuda_internal_maximum_grid_count() -{ return CudaInternal::singleton().m_maxBlock ; } - -Cuda::size_type cuda_internal_maximum_shared_words() -{ return CudaInternal::singleton().m_maxSharedWords ; } - -Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size ) -{ return CudaInternal::singleton().scratch_space( size ); } - -Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size ) -{ return CudaInternal::singleton().scratch_flags( size ); } - -Cuda::size_type * 
cuda_internal_scratch_unified( const Cuda::size_type size ) -{ return CudaInternal::singleton().scratch_unified( size ); } - - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -Cuda::size_type Cuda::detect_device_count() -{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; } - -int Cuda::is_initialized() -{ return Impl::CudaInternal::raw_singleton().is_initialized(); } - -void Cuda::initialize( const Cuda::SelectDevice config ) -{ Impl::CudaInternal::raw_singleton().initialize( config.cuda_device_id ); } - -std::vector -Cuda::detect_device_arch() -{ - const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton(); - - std::vector output( s.m_cudaDevCount ); - - for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ; - } - - return output ; -} - -Cuda::size_type Cuda::device_arch() -{ - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ; - - const struct cudaDeviceProp & cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ; - - return cudaProp.major * 100 + cudaProp.minor ; -} - -void Cuda::finalize() -{ Impl::CudaInternal::raw_singleton().finalize(); } - -void Cuda::print_configuration( std::ostream & s , const bool ) -{ Impl::CudaInternal::raw_singleton().print_configuration( s ); } - -bool Cuda::sleep() { return false ; } - -bool Cuda::wake() { return true ; } - -void Cuda::fence() -{ - CUDA_SAFE_CALL( cudaDeviceSynchronize() ); -} - -unsigned Cuda::team_max() -{ - return Impl::CudaInternal::singleton().m_maxWarpCount << Impl::CudaTraits::WarpIndexShift ; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp deleted file mode 100644 index f386075..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp +++ /dev/null @@ -1,65 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_INTERNAL_HPP -#define KOKKOS_CUDA_INTERNAL_HPP - -namespace Kokkos { -namespace Impl { - -void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 ); - -inline -void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0) -{ - if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); } -} - -} -} - -#define CUDA_SAFE_CALL( call ) \ - Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ ) - -#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */ - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp deleted file mode 100644 index cf4cfb1..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ /dev/null @@ -1,829 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_PARALLEL_HPP -#define KOKKOS_CUDA_PARALLEL_HPP - -#include -#include - -#if defined( __CUDACC__ ) - -#include -#include - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelFor< FunctorType , WorkSpec /* size_t */ , Cuda > { -private: - - const FunctorType m_functor ; - const Cuda::size_type m_work ; - - ParallelFor(); - ParallelFor & operator = ( const ParallelFor & ); - -public: - - inline - __device__ - void operator()(void) const - { - const Cuda::size_type work_stride = blockDim.x * gridDim.x ; - - for ( Cuda::size_type - iwork = threadIdx.x + blockDim.x * blockIdx.x ; - iwork < m_work ; - iwork += work_stride ) { - m_functor( iwork ); - } - } - - ParallelFor( const FunctorType & functor , - const size_t work ) - : m_functor( functor ) - , m_work( work ) - { - const dim3 block( CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1, 1); - const dim3 grid( std::min( ( m_work + block.x - 1 ) / block.x , cuda_internal_maximum_grid_count() ) , 1 , 1 ); - - CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 ); - } -}; - -template< class FunctorType > -class ParallelFor< FunctorType , ParallelWorkRequest , Cuda > { -private: - - const FunctorType m_functor ; - const ParallelWorkRequest m_work ; - const int m_shmem ; - - ParallelFor(); - ParallelFor & operator = ( const ParallelFor & ); - -public: - - inline - __device__ - void operator()(void) const - { - CudaExec exec( 0 , m_shmem ); - m_functor( Cuda( exec ) ); - } - - ParallelFor( const FunctorType & functor , - const ParallelWorkRequest & work ) - : m_functor( functor ) - , m_work( std::min( work.league_size , size_t(cuda_internal_maximum_grid_count()) ) , - std::min( work.team_size , size_t(CudaTraits::WarpSize * cuda_internal_maximum_warp_count()) ) ) - , m_shmem( FunctorShmemSize< FunctorType >::value( functor ) ) - { - const dim3 grid( m_work.league_size , 1 , 1 ); - const dim3 block( m_work.team_size , 1, 1 ); - - CudaParallelLaunch< ParallelFor >( *this , grid , block , m_shmem ); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType > -class ParallelFor< FunctorType , CudaWorkConfig , Cuda > { -public: - - const FunctorType m_work_functor ; - - inline - __device__ - void operator()(void) const - { - Cuda::size_type iwork = threadIdx.x + blockDim.x * ( - threadIdx.y + blockDim.y * ( - threadIdx.z + blockDim.z * ( - blockIdx.x + gridDim.x * ( - blockIdx.y + gridDim.y * ( - blockIdx.z ))))); - - m_work_functor( iwork ); - } - - ParallelFor( const FunctorType & functor , - const CudaWorkConfig & work_config ) - : m_work_functor( functor ) - { - const dim3 grid( work_config.grid[0] , - work_config.grid[1] , - work_config.grid[2] ); - - const dim3 block( work_config.block[0] , - work_config.block[1] , - work_config.block[2] ); - - CudaParallelLaunch< ParallelFor >( *this , grid , block , work_config.shared ); - } -}; - -} // namespace Impl -} // namespace Kokkos - 
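For reference, the range-based ParallelFor specialization above relies on a grid-stride loop: each thread starts at its global index and advances by the total thread count of the grid, so a bounded grid can cover an arbitrary work count. Below is a minimal standalone sketch of that pattern, not part of the deleted header; the kernel, functor, and launch-helper names are illustrative, the 256-thread block is an assumed choice, and the 65535-block cap mirrors CudaTraits::UpperBoundGridCount from the deleted Kokkos_CudaExec.hpp.

// Minimal sketch (illustrative only) of the grid-stride loop used by
// ParallelFor< FunctorType , size_t , Cuda >: the functor is passed by
// value to the kernel, as in cuda_parallel_launch_local_memory above.
#include <cuda_runtime.h>
#include <algorithm>

template< class Functor >
__global__ void grid_stride_for( const Functor functor , const unsigned work_count )
{
  // Total number of threads in the grid is the stride between iterations.
  const unsigned stride = blockDim.x * gridDim.x ;

  for ( unsigned iwork = threadIdx.x + blockDim.x * blockIdx.x ;
        iwork < work_count ; iwork += stride ) {
    functor( iwork );
  }
}

// Example functor with a __device__ call operator, analogous to what a
// user would hand to Kokkos::parallel_for.
struct FillFunctor {
  double * data ;
  __device__ void operator()( unsigned i ) const { data[i] = 2.0 * i ; }
};

// Host-side launch mirroring the sizing logic of the deleted code: enough
// blocks to cover the work, capped by a hard upper bound on the grid size
// (65535 here, matching CudaTraits::UpperBoundGridCount).
inline void launch_fill( double * data , const unsigned work_count )
{
  const unsigned block = 256 ;
  const unsigned grid  = std::min( ( work_count + block - 1 ) / block , 65535u );

  FillFunctor f = { data };
  grid_stride_for<<< grid , block >>>( f , work_count );
  cudaDeviceSynchronize();
}

Because the loop strides by the full grid, the same kernel handles work counts far larger than the launched thread count; the deleted ParallelReduce below reuses the same idea per block, with each block assigned an approximately equal slice of the iteration range.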
-//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelReduce< FunctorType , WorkSpec , Cuda > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - typedef typename Reduce::reference_type reference_type ; - typedef Cuda::size_type size_type ; - - // Algorithmic constraints: - // (a) blockSize is a power of two - // (b) blockDim.x == BlockSize == 1 << BlockSizeShift - // (c) blockDim.y == blockDim.z == 1 - - enum { WarpCount = 8 }; - enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; - enum { BlockSizeShift = power_of_two< BlockSize >::value }; - enum { BlockSizeMask = BlockSize - 1 }; - - enum { GridMaxComputeCapability_2x = 0x0ffff }; - enum { GridMax = BlockSize }; - - const FunctorType m_functor ; - size_type * m_scratch_space ; - size_type * m_scratch_flags ; - size_type * m_unified_space ; - pointer_type m_host_pointer ; - size_type m_work ; - size_type m_work_per_block ; - size_type m_local_block_count ; - size_type m_global_block_begin ; - size_type m_global_block_count ; - - - __device__ inline - void operator()(void) const - { - extern __shared__ size_type shared_data[]; - - const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > - word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); - - { - reference_type value = Reduce::reference( shared_data + threadIdx.x * word_count.value ); - - m_functor.init( value ); - - // Number of blocks is bounded so that the reduction can be limited to two passes. - // Each thread block is given an approximately equal amount of work to perform. - // Accumulate the values for this block. - // The accumulation ordering does not match the final pass, but is arithmatically equivalent. - - const size_type iwork_beg = blockIdx.x * m_work_per_block ; - const size_type iwork_end = iwork_beg + m_work_per_block < m_work - ? iwork_beg + m_work_per_block : m_work ; - - for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) { - m_functor( iwork , value ); - } - } - - // Reduce with final value at BlockSize - 1 location. - if ( cuda_single_inter_block_reduce_scan( - m_functor , m_global_block_begin + blockIdx.x , m_global_block_count , - shared_data , m_scratch_space , m_scratch_flags ) ) { - - // This is the final block with the final result at the final threads' location - - size_type * const shared = shared_data + BlockSizeMask * word_count.value ; - size_type * const global = m_unified_space ? 
m_unified_space : m_scratch_space ; - - if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); } - - if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } - - for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; } - } - } - - ParallelReduce( const FunctorType & functor , - const size_t nwork , - const pointer_type result = 0 , - const bool execute_immediately = true ) - : m_functor( functor ) - , m_scratch_space( 0 ) - , m_scratch_flags( 0 ) - , m_unified_space( 0 ) - , m_host_pointer( result ) - , m_work( nwork ) - , m_work_per_block( 0 ) - , m_local_block_count( 0 ) - , m_global_block_begin( 0 ) - , m_global_block_count( 0 ) - { - // At most 'max_grid' blocks: - const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize )); - - // How much work per block: - m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ; - - // How many block are really needed for this much work: - m_local_block_count = ( nwork + m_work_per_block - 1 ) / m_work_per_block ; - m_global_block_count = m_local_block_count ; - - m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count ); - m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); - m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) ); - - if ( execute_immediately ) { execute(); } - } - - inline - void execute() const - { - const dim3 grid( m_local_block_count , 1 , 1 ); - const dim3 block( BlockSize , 1 , 1 ); - const int shmem = cuda_single_inter_block_reduce_scan_shmem( m_functor ); - - CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute - } - - void wait() const - { - Cuda::fence(); - - if ( m_host_pointer ) { - if ( m_unified_space ) { - const int count = Reduce::value_count( m_functor ); - for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; } - } - else { - const int size = Reduce::value_size( m_functor ); - DeepCopy( m_host_pointer , m_scratch_space , size ); - } - } - } -}; - - -template< class FunctorType > -class ParallelReduce< FunctorType , ParallelWorkRequest , Cuda > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - typedef typename Reduce::reference_type reference_type ; - typedef Cuda::size_type size_type ; - - // Algorithmic constraints: - // (a) blockSize is a power of two - // (b) blockDim.x == BlockSize == 1 << BlockSizeShift - // (b) blockDim.y == blockDim.z == 1 - - enum { WarpCount = 8 }; - enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; - enum { BlockSizeShift = power_of_two< BlockSize >::value }; - enum { BlockSizeMask = BlockSize - 1 }; - - enum { GridMaxComputeCapability_2x = 0x0ffff }; - enum { GridMax = BlockSize }; - - const FunctorType m_functor ; - size_type * m_scratch_space ; - size_type * m_scratch_flags ; - size_type * m_unified_space ; - pointer_type m_host_pointer ; - size_type m_shmem_begin ; - size_type m_shmem_end ; - size_type m_local_block_count ; - size_type m_global_block_begin ; - size_type m_global_block_count ; - - __device__ inline - void operator()(void) const - { - extern __shared__ size_type shared_data[]; - - const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > - word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); - - { - reference_type value = Reduce::reference( shared_data + threadIdx.x * 
word_count.value ); - - m_functor.init( value ); - - CudaExec exec( m_shmem_begin , m_shmem_end ); - - m_functor( Cuda( exec ) , value ); - } - - // Reduce with final value at BlockSize - 1 location. - if ( cuda_single_inter_block_reduce_scan( - m_functor , m_global_block_begin + blockIdx.x , m_global_block_count , - shared_data , m_scratch_space , m_scratch_flags ) ) { - - // This is the final block with the final result at the final threads' location - - size_type * const shared = shared_data + BlockSizeMask * word_count.value ; - size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; - - if ( threadIdx.x == 0 ) { Reduce::final( m_functor , shared ); } - - if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } - - for ( unsigned i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i]; } - } - } - - - ParallelReduce( const FunctorType & functor , - const ParallelWorkRequest & work , - const pointer_type result = 0 , - const bool execute_immediately = true ) - : m_functor( functor ) - , m_scratch_space( 0 ) - , m_scratch_flags( 0 ) - , m_unified_space( 0 ) - , m_host_pointer( result ) - , m_shmem_begin( cuda_single_inter_block_reduce_scan_shmem( functor ) ) - , m_shmem_end( cuda_single_inter_block_reduce_scan_shmem( functor ) - + FunctorShmemSize< FunctorType >::value( functor ) ) - , m_local_block_count( 0 ) - , m_global_block_begin( 0 ) - , m_global_block_count( 0 ) - { - m_local_block_count = std::min( int(GridMax) , int(work.league_size) ); - m_global_block_count = std::min( int(GridMax) , int(work.league_size) ); - m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * m_local_block_count ); - m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) ); - m_unified_space = cuda_internal_scratch_unified( Reduce::value_size( functor ) ); - - if ( execute_immediately ) { execute(); } - } - - inline - void execute() const - { - const dim3 grid( m_local_block_count , 1 , 1 ); - const dim3 block( BlockSize , 1 , 1 ); - - CudaParallelLaunch< ParallelReduce >( *this, grid, block, m_shmem_end ); // copy to device and execute - } - - void wait() const - { - Cuda::fence(); - - if ( m_host_pointer ) { - if ( m_unified_space ) { - const int count = Reduce::value_count( m_functor ); - for ( int i = 0 ; i < count ; ++i ) { m_host_pointer[i] = pointer_type(m_unified_space)[i] ; } - } - else { - const int size = Reduce::value_size( m_functor ); - DeepCopy( m_host_pointer , m_scratch_space , size ); - } - } - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class Functor > -class MultiFunctorParallelReduceMember ; - -template<> -class MultiFunctorParallelReduceMember -{ -private: - - MultiFunctorParallelReduceMember( const MultiFunctorParallelReduceMember & ); - MultiFunctorParallelReduceMember & operator = ( const MultiFunctorParallelReduceMember & ); - -protected: - - MultiFunctorParallelReduceMember() {} - -public: - - virtual unsigned block_count() const = 0 ; - - virtual ~MultiFunctorParallelReduceMember() {} - - virtual void execute( void * const host_pointer , - const unsigned global_block_begin , - const unsigned global_block_count ) = 0 ; - - virtual void wait() const = 0 ; -}; - -template< class Functor > -class MultiFunctorParallelReduceMember : public 
MultiFunctorParallelReduceMember { -public: - ParallelReduce< Functor , size_t , Cuda > m_functor ; - - MultiFunctorParallelReduceMember( const Functor & f , size_t nwork ) - : MultiFunctorParallelReduceMember() - , m_functor( f , nwork , 0 , false ) - {} - - virtual unsigned block_count() const { return m_functor.m_local_block_count ; } - - virtual void execute( void * const host_pointer , - const unsigned global_block_begin , - const unsigned global_block_count ) - { - m_functor.m_host_pointer = typename ReduceAdapter< Functor >::pointer_type(host_pointer); - m_functor.m_global_block_begin = global_block_begin ; - m_functor.m_global_block_count = global_block_count ; - m_functor.execute(); - } - - virtual void wait() const { m_functor.wait(); } -}; - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -template<> -class MultiFunctorParallelReduce< Cuda > -{ -private: - - typedef std::vector< Impl::MultiFunctorParallelReduceMember * > MemberVector ; - - MemberVector m_functors ; - -public: - - MultiFunctorParallelReduce() - : m_functors() - {} - - ~MultiFunctorParallelReduce() - { - while ( ! m_functors.empty() ) { - delete m_functors.back(); - m_functors.pop_back(); - } - } - - template< class FunctorType > - void push_back( const size_t work_count , const FunctorType & f ) - { - m_functors.push_back( new Impl::MultiFunctorParallelReduceMember( f , work_count ) ); - } - - void execute( void * host_pointer ) - { - typename MemberVector::iterator m ; - - Cuda::size_type block_count = 0 ; - - for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) { - block_count += (*m)->block_count(); - } - - Cuda::size_type block_offset = 0 ; - - for ( m = m_functors.begin() ; m != m_functors.end() ; ++m ) { - (*m)->execute( host_pointer , block_offset , block_count ); - block_offset += (*m)->block_count(); - } - } - - void wait() const - { - if ( ! m_functors.empty() ) { (m_functors.back())->wait(); } - } -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelScan< FunctorType , WorkSpec , Cuda > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - typedef typename Reduce::reference_type reference_type ; - typedef Cuda::size_type size_type ; - - // Algorithmic constraints: - // (a) blockSize is a power of two - // (b) blockDim.x == BlockSize == 1 << BlockSizeShift - // (b) blockDim.y == blockDim.z == 1 - // (c) gridDim.x <= blockDim.x * blockDim.x - // (d) gridDim.y == gridDim.z == 1 - - // blockDim.x must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps) - // gridDim.x <= blockDim.x * blockDim.x - // - // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing - - enum { WarpCount = 4 }; - enum { BlockSize = CudaTraits::WarpSize << power_of_two< WarpCount >::value }; - enum { BlockSizeShift = power_of_two< BlockSize >::value }; - enum { BlockSizeMask = BlockSize - 1 }; - - enum { GridMaxComputeCapability_2x = 0x0ffff }; - enum { GridMax = ( BlockSize * BlockSize ) < GridMaxComputeCapability_2x - ? 
( BlockSize * BlockSize ) : GridMaxComputeCapability_2x }; - - const FunctorType m_functor ; - size_type * m_scratch_space ; - size_type * m_scratch_flags ; - const size_type m_work ; - size_type m_work_per_block ; - size_type m_final ; - - //---------------------------------------- - - __device__ inline - void initial(void) const - { - extern __shared__ size_type shared_data[]; - - const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > - word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); - - size_type * const shared_value = shared_data + word_count.value * threadIdx.x ; - - m_functor.init( Reduce::reference( shared_value ) ); - - // Number of blocks is bounded so that the reduction can be limited to two passes. - // Each thread block is given an approximately equal amount of work to perform. - // Accumulate the values for this block. - // The accumulation ordering does not match the final pass, but is arithmatically equivalent. - - const size_type iwork_beg = blockIdx.x * m_work_per_block ; - const size_type iwork_end = iwork_beg + m_work_per_block < m_work - ? iwork_beg + m_work_per_block : m_work ; - - for ( size_type iwork = threadIdx.x + iwork_beg ; iwork < iwork_end ; iwork += BlockSize ) { - m_functor( iwork , Reduce::reference( shared_value ) , false ); - } - - // Reduce and scan, writing out scan of blocks' totals and block-groups' totals. - // Blocks' scan values are written to 'blockIdx.x' location. - // Block-groups' scan values are at: i = ( j * BlockSize - 1 ) for i < gridDim.x - cuda_single_inter_block_reduce_scan( m_functor , blockIdx.x , gridDim.x , shared_data , m_scratch_space , m_scratch_flags ); - } - - //---------------------------------------- - - __device__ inline - void final(void) const - { - extern __shared__ size_type shared_data[]; - - const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > - word_count( Reduce::value_size( m_functor ) / sizeof(size_type) ); - - // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... } - size_type * const shared_prefix = shared_data + word_count.value * threadIdx.x ; - size_type * const shared_accum = shared_data + word_count.value * ( BlockSize + 1 ); - - // Starting value for this thread block is the previous block's total. - if ( blockIdx.x ) { - size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 ); - for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; } - } - else if ( 0 == threadIdx.x ) { - m_functor.init( Reduce::reference( shared_accum ) ); - } - - unsigned iwork_beg = blockIdx.x * m_work_per_block ; - const unsigned iwork_end = iwork_beg + m_work_per_block ; - - for ( ; iwork_beg < iwork_end ; iwork_beg += BlockSize ) { - - const unsigned iwork = threadIdx.x + iwork_beg ; - - __syncthreads(); // Don't overwrite previous iteration values until they are used - - m_functor.init( Reduce::reference( shared_prefix + word_count.value ) ); - - // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block - for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { - shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ; - } - - if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values. 
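// Illustrative host-side sketch, not from the Kokkos sources above: it models the
// two-pass structure of ParallelScan, where the "initial" pass produces per-block
// totals, those totals are scanned, and the "final" pass (the code around this point)
// seeds each block with the sum of all preceding blocks and emits an exclusive scan.
// The block size and input values are arbitrary.
#include <cstdio>
#include <vector>

int main()
{
  const int block = 4;                                   // stand-in for the work per block
  const std::vector<int> in = {3, 1, 4, 1, 5, 9, 2, 6, 5, 3};
  const int n = int(in.size());
  const int nblocks = (n + block - 1) / block;

  // Pass 1: per-block totals (what the "initial" kernel writes to scratch space).
  std::vector<int> block_total(nblocks, 0);
  for (int b = 0; b < nblocks; ++b)
    for (int i = b * block; i < n && i < (b + 1) * block; ++i)
      block_total[b] += in[i];

  // Exclusive scan of the block totals (the inter-block portion of the algorithm).
  std::vector<int> block_offset(nblocks, 0);
  for (int b = 1; b < nblocks; ++b)
    block_offset[b] = block_offset[b - 1] + block_total[b - 1];

  // Pass 2: each block emits an exclusive scan starting from its offset, the role
  // played above by seeding shared_accum with the previous block's total.
  std::vector<int> out(n, 0);
  for (int b = 0; b < nblocks; ++b) {
    int running = block_offset[b];
    for (int i = b * block; i < n && i < (b + 1) * block; ++i) {
      out[i] = running;                                  // exclusive: value before adding in[i]
      running += in[i];
    }
  }

  for (int v : out) std::printf("%d ", v);               // 0 3 4 8 9 14 23 25 31 36
  std::printf("\n");
  return 0;
}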
- - // Call functor to accumulate inclusive scan value for this work item - if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix + word_count.value ) , false ); } - - // Scan block values into locations shared_data[1..BlockSize] - cuda_intra_block_reduce_scan( m_functor , Reduce::pointer_type(shared_data+word_count.value) ); - - { - size_type * const block_total = shared_data + word_count.value * blockDim.x ; - for ( unsigned i = threadIdx.x ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; } - } - - // Call functor with exclusive scan value - if ( iwork < m_work ) { m_functor( iwork , Reduce::reference( shared_prefix ) , true ); } - } - } - - //---------------------------------------- - - __device__ inline - void operator()(void) const - { - if ( ! m_final ) { - initial(); - } - else { - final(); - } - } - - ParallelScan( const FunctorType & functor , - const size_t nwork ) - : m_functor( functor ) - , m_scratch_space( 0 ) - , m_scratch_flags( 0 ) - , m_work( nwork ) - , m_work_per_block( 0 ) - , m_final( false ) - { - // At most 'max_grid' blocks: - const int max_grid = std::min( int(GridMax) , int(( nwork + BlockSizeMask ) / BlockSize )); - - // How much work per block: - m_work_per_block = ( nwork + max_grid - 1 ) / max_grid ; - - // How many block are really needed for this much work: - const dim3 grid( ( nwork + m_work_per_block - 1 ) / m_work_per_block , 1 , 1 ); - const dim3 block( BlockSize , 1 , 1 ); - const int shmem = Reduce::value_size( functor ) * ( BlockSize + 2 ); - - m_scratch_space = cuda_internal_scratch_space( Reduce::value_size( functor ) * grid.x ); - m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 ); - - m_final = false ; - CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute - - m_final = true ; - CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute - } - - void wait() const { Cuda::fence(); } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDA_ARCH__ ) - -namespace Kokkos { -namespace Impl { - -template< typename Type > -struct CudaJoinFunctor { - typedef Type value_type ; - - KOKKOS_INLINE_FUNCTION - static void join( volatile value_type & update , - volatile const value_type & input ) - { update += input ; } -}; - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -template< typename TypeLocal , typename TypeGlobal > -__device__ inline TypeGlobal Cuda::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) -{ - enum { BlockSizeMax = 512 }; - - __shared__ TypeGlobal base_data[ BlockSizeMax + 1 ]; - - __syncthreads(); // Don't write in to shared data until all threads have entered this function - - if ( 0 == threadIdx.x ) { base_data[0] = 0 ; } - - base_data[ threadIdx.x + 1 ] = value ; - - Impl::cuda_intra_block_reduce_scan( Impl::CudaJoinFunctor() , base_data + 1 ); - - if ( global_accum ) { - if ( blockDim.x == threadIdx.x + 1 ) { - base_data[ blockDim.x ] = atomic_fetch_add( global_accum , base_data[ blockDim.x ] ); - } - __syncthreads(); // Wait for atomic - base_data[ threadIdx.x ] += base_data[ blockDim.x ] ; - } - - return base_data[ threadIdx.x ]; -} - -template< typename Type > -__device__ inline Type Cuda::team_scan( const Type & value ) -{ return team_scan( value , (Type*) 0 ); } - -} // namespace 
Kokkos - -#else /* ! defined( __CUDA_ARCH__ ) */ - -namespace Kokkos { - -template< typename Type > inline Type Cuda::team_scan( const Type & ) { return 0 ; } - -template< typename TypeLocal , typename TypeGlobal > -inline TypeGlobal Cuda::team_scan( const TypeLocal & , TypeGlobal * const ) { return 0 ; } - -} // namespace Kokkos - -#endif /* ! defined( __CUDA_ARCH__ ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* defined( __CUDACC__ ) */ - -#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */ - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp deleted file mode 100644 index d9f2d8f..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ /dev/null @@ -1,267 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_REDUCESCAN_HPP -#define KOKKOS_CUDA_REDUCESCAN_HPP - -#if defined( __CUDACC__ ) - -#include - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -// See section B.17 of Cuda C Programming Guide Version 3.2 -// for discussion of -// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor) -// function qualifier which could be used to improve performance. 
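// Illustrative host-side sketch, not from the Kokkos sources: a power-of-two pairwise
// reduction tree in the spirit of cuda_intra_block_reduce_scan defined below. At step s,
// entries whose index has its low s+1 bits set absorb the partner 2^s positions earlier,
// so the block total lands in the last slot (the real kernel's indexing and multi-word
// values differ in detail; this only shows the shape of the tree). Values are arbitrary.
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> data = {1, 2, 3, 4, 5, 6, 7, 8};        // size must be a power of two

  for (std::size_t step = 1; step < data.size(); step <<= 1) {
    for (std::size_t i = 0; i < data.size(); ++i) {
      if ((i & (2 * step - 1)) == (2 * step - 1)) {        // analogue of the BLOCK_REDUCE_STEP test
        data[i] += data[i - step];
      }
    }
  }

  std::printf("block total in the last slot: %d\n", data.back());   // 36
  return 0;
}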
-//---------------------------------------------------------------------------- -// Maximize shared memory and minimize L1 cache: -// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared ); -// For 2.0 capability: 48 KB shared and 16 KB L1 -//---------------------------------------------------------------------------- -// Must have consistent '__shared__' statement across all device kernels. -// Since there may be more than one kernel in a file then have to make this -// a simple array of words. -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -/* - * Algorithmic constraints: - * (a) blockDim.x is a power of two - * (b) blockDim.x <= 512 - * (c) blockDim.y == blockDim.z == 1 - */ -template< bool DoScan , class FunctorType > -__device__ -void cuda_intra_block_reduce_scan( const FunctorType & functor , - const typename ReduceAdapter< FunctorType >::pointer_type base_data ) -{ - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - const unsigned value_count = Reduce::value_count( functor ); - const unsigned BlockSizeMask = blockDim.x - 1 ; - - // Must have power of two thread count - - if ( BlockSizeMask & blockDim.x ) { cuda_abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); } - -#define BLOCK_REDUCE_STEP( R , TD , S ) \ - if ( ! ( R & ((1<<(S+1))-1) ) ) \ - { functor.join( Reduce::reference(TD) , Reduce::reference(TD - (value_count< -__device__ -bool cuda_single_inter_block_reduce_scan( const FunctorType & functor , - const Cuda::size_type block_id , - const Cuda::size_type block_count , - Cuda::size_type * const shared_data , - Cuda::size_type * const global_data , - Cuda::size_type * const global_flags ) -{ - typedef Cuda::size_type size_type ; - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - typedef typename Reduce::reference_type reference_type ; - - enum { BlockSize = ArgBlockSize }; - enum { BlockSizeMask = BlockSize - 1 }; - enum { BlockSizeShift = power_of_two< BlockSize >::value }; - - const integral_nonzero_constant< size_type , Reduce::StaticValueSize / sizeof(size_type) > - word_count( Reduce::value_size( functor ) / sizeof(size_type) ); - - // Must have power of two thread count - if ( BlockSize != blockDim.x ) { cuda_abort("Cuda::cuda_inter_block_scan wrong blockDim.x"); } - - // Reduce the accumulation for the entire block. - cuda_intra_block_reduce_scan( functor , pointer_type(shared_data) ); - - { - // Write accumulation total to global scratch space. - // Accumulation total is the last thread's data. - size_type * const shared = shared_data + word_count.value * BlockSizeMask ; - size_type * const global = global_data + word_count.value * block_id ; - - for ( size_type i = threadIdx.x ; i < word_count.value ; i += BlockSize ) { global[i] = shared[i] ; } - } - - // Contributing blocks note that their contribution has been completed via an atomic-increment flag - // If this block is not the last block to contribute to this group then the block is done. - const bool is_last_block = - ! __syncthreads_or( threadIdx.x ? 
0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ); - - if ( is_last_block ) { - - const size_type b = ( long(block_count) * long(threadIdx.x) ) >> BlockSizeShift ; - const size_type e = ( long(block_count) * long( threadIdx.x + 1 ) ) >> BlockSizeShift ; - - { - reference_type shared_value = Reduce::reference( shared_data + word_count.value * threadIdx.x ); - - functor.init( shared_value ); - - for ( size_type i = b ; i < e ; ++i ) { - functor.join( shared_value , Reduce::reference( global_data + word_count.value * i ) ); - } - } - - cuda_intra_block_reduce_scan( functor , pointer_type(shared_data) ); - - if ( DoScan ) { - - size_type * const shared_value = shared_data + word_count.value * ( threadIdx.x ? threadIdx.x - 1 : BlockSize ); - - if ( ! threadIdx.x ) { functor.init( Reduce::reference( shared_value ) ); } - - // Join previous inclusive scan value to each member - for ( size_type i = b ; i < e ; ++i ) { - size_type * const global_value = global_data + word_count.value * i ; - functor.join( Reduce::reference( shared_value ) , Reduce::reference( global_value ) ); - Reduce::copy( functor , global_value , shared_value ); - } - } - } - - return is_last_block ; -} - -template< bool DoScan , unsigned ArgBlockSize , class FunctorType > -inline -unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor ) -{ - return ( ArgBlockSize + 2 ) * ReduceAdapter< FunctorType >::value_size( functor ); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( __CUDACC__ ) */ -#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */ - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp deleted file mode 100644 index 2e7ada6..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ /dev/null @@ -1,928 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
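// Illustrative host-side sketch, not from the Kokkos sources above: the "last block"
// handshake used by cuda_single_inter_block_reduce_scan, where every block publishes its
// partial result, atomically bumps a shared counter, and the block that sees the counter
// reach the block count performs the final combine. std::thread and std::atomic stand in
// for CUDA blocks and atomicInc (ignoring atomicInc's wrap-around); the counts are made up.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
  const unsigned block_count = 8;
  std::atomic<unsigned> flag{0};                 // plays the role of global_flags
  std::vector<int> partial(block_count, 0);      // plays the role of the global scratch space
  std::atomic<int> final_total{0};

  std::vector<std::thread> blocks;
  for (unsigned b = 0; b < block_count; ++b) {
    blocks.emplace_back([&, b] {
      partial[b] = int(b) + 1;                   // publish this block's partial reduction
      const unsigned prior = flag.fetch_add(1);  // atomic arrival counter
      if (prior + 1 == block_count) {            // only the last arriver combines
        int total = 0;
        for (int p : partial) { total += p; }
        final_total.store(total);
      }
    });
  }
  for (auto& t : blocks) { t.join(); }

  std::printf("total combined by the last block: %d\n", final_total.load());   // 36
  return 0;
}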
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_VIEW_HPP -#define KOKKOS_CUDA_VIEW_HPP - -#include - -#if defined( __CUDACC__ ) -#include -#endif - -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template<> -struct AssertShapeBoundsAbort< CudaSpace > -{ - KOKKOS_INLINE_FUNCTION - static void apply( const size_t /* rank */ , - const size_t /* n0 */ , const size_t /* n1 */ , - const size_t /* n2 */ , const size_t /* n3 */ , - const size_t /* n4 */ , const size_t /* n5 */ , - const size_t /* n6 */ , const size_t /* n7 */ , - - const size_t /* arg_rank */ , - const size_t /* i0 */ , const size_t /* i1 */ , - const size_t /* i2 */ , const size_t /* i3 */ , - const size_t /* i4 */ , const size_t /* i5 */ , - const size_t /* i6 */ , const size_t /* i7 */ ) - { - Kokkos::cuda_abort("Kokkos::View array bounds violation"); - } -}; - -} -} - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// Cuda 5.0 defines 'cudaTextureObject_t' -// to be an 'unsigned long long'. This chould change with -// future version of Cuda and this typedef would have to -// change accordingly. - -#if defined( CUDA_VERSION ) && ( 500 <= CUDA_VERSION ) - -typedef enable_if< - sizeof(::cudaTextureObject_t) == sizeof(const void *) , - ::cudaTextureObject_t >::type cuda_texture_object_type ; - -cuda_texture_object_type -cuda_texture_object_attach( - const cudaChannelFormatDesc & , - const void * const ); - -template< typename TextureType > -inline -cuda_texture_object_type -cuda_texture_object_attach( const void * const base_view_ptr ) -{ - return cuda_texture_object_attach( cudaCreateChannelDesc() , base_view_ptr ); -} - -#else - -typedef const void * cuda_texture_object_type ; - -template< typename TextureType > -inline -cuda_texture_object_type -cuda_texture_object_attach( const void * const ) -{ return 0 ; } - -#endif - -//---------------------------------------------------------------------------- - -template< typename ValueType > -struct CudaTextureFetch ; - -/** \brief Cuda texture fetch is limited to a subset of Cuda types. - * Map commonly used types to the required subset of Cuda types. 
- */ - -template< typename ValueType > -struct CudaTextureFetch< const ValueType > { -private: - - cuda_texture_object_type obj ; - -public: - - const ValueType * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( ValueType * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - KOKKOS_INLINE_FUNCTION - ValueType operator[]( const iType & i ) const - { - return ptr[ i ]; - } -}; - -template<> -struct CudaTextureFetch< const int > { -private: - - cuda_texture_object_type obj ; - -public: - - const int * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( const int * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - KOKKOS_INLINE_FUNCTION - int operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) -#ifdef KOKKOS_USE_LDG_INTRINSIC - return _ldg(&ptr[i]); -#else - return tex1Dfetch( obj , i ); -#endif -#else - return ptr[ i ]; -#endif - } -}; - -template<> -struct CudaTextureFetch< const unsigned int > { -private: - - cuda_texture_object_type obj ; - -public: - - const unsigned int * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( const unsigned int * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - KOKKOS_INLINE_FUNCTION - unsigned int operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) -#ifdef KOKKOS_USE_LDG_INTRINSIC - return _ldg(&ptr[i]); -#else - return tex1Dfetch( obj , i ); -#endif -#else - return ptr[ i ]; -#endif - } -}; - -template<> -struct CudaTextureFetch< const float > { -private: - - cuda_texture_object_type obj ; - -public: - - const float * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( const float * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - 
KOKKOS_INLINE_FUNCTION - float operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) -#ifdef KOKKOS_USE_LDG_INTRINSIC - return _ldg(&ptr[i]); -#else - return tex1Dfetch( obj , i ); -#endif -#else - return ptr[ i ]; -#endif - } -}; - -template<> -struct CudaTextureFetch< const double > { -private: - - cuda_texture_object_type obj ; - -public: - - const double * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( const double * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - KOKKOS_INLINE_FUNCTION - double operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) -#ifdef KOKKOS_USE_LDG_INTRINSIC - return _ldg(&ptr[i]); -#else - int2 v = tex1Dfetch( obj , i ); - return __hiloint2double(v.y, v.x); -#endif -#else - return ptr[ i ]; -#endif - } -}; - -template<> -struct CudaTextureFetch< const double2 > { -private: - - cuda_texture_object_type obj ; - -public: - - const double2 * ptr ; - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch() : obj( 0 ) , ptr( 0 ) {} - - KOKKOS_INLINE_FUNCTION - ~CudaTextureFetch() {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch( const CudaTextureFetch & rhs ) - : obj( rhs.obj ) , ptr( rhs.ptr ) {} - - KOKKOS_INLINE_FUNCTION - CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) - { obj = rhs.obj ; ptr = rhs.ptr ; return *this ; } - - explicit - CudaTextureFetch( const double2 * const base_view_ptr ) - : obj( cuda_texture_object_attach( base_view_ptr ) ) - , ptr( base_view_ptr ) {} - - template< typename iType > - KOKKOS_INLINE_FUNCTION - double2 operator[]( const iType & i ) const - { -#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ ) -#ifdef KOKKOS_USE_LDG_INTRINSIC - return _ldg(&ptr[i]); -#else - int4 v = tex1Dfetch(tex_obj , idx); - double2 retval = { __hiloint2double(v.y, v.x) , __hiloint2double(v.w, v.z) }; - return retval ; -#endif -#else - return ptr[ i ]; -#endif - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -struct CudaTexture {}; - -#if defined( CUDA_VERSION ) && ( 500 <= CUDA_VERSION ) - -/** \brief Replace LayoutDefault specialization */ -template< typename ScalarType , class Rank , class RankDynamic > -struct ViewSpecialize< const ScalarType , const ScalarType , - LayoutLeft , Rank , RankDynamic , - CudaSpace , MemoryRandomRead > -{ typedef CudaTexture type ; }; - -template< typename ScalarType , class Rank , class RankDynamic > -struct ViewSpecialize< const ScalarType , const ScalarType , - LayoutRight , Rank , RankDynamic , - CudaSpace , MemoryRandomRead > -{ typedef CudaTexture type ; }; - -/** \brief Scalar View matching **/ -template< typename ScalarType > -struct ViewSpecialize< const ScalarType , const ScalarType , - LayoutLeft , unsigned_<0> , unsigned_<0> , - CudaSpace , MemoryRandomRead > -{ typedef CudaTexture type ; }; - -template< typename ScalarType > -struct ViewSpecialize< const 
ScalarType , const ScalarType , - LayoutRight , unsigned_<0> , unsigned_<0> , - CudaSpace , MemoryRandomRead > -{ typedef CudaTexture type ; }; - -#endif - -//---------------------------------------------------------------------------- - -template<> -struct ViewAssignment< CudaTexture , CudaTexture , void > -{ - /** \brief Assign compatible views */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if<( - ViewAssignable< ViewTraits , ViewTraits >::value - ) >::type * = 0 ) - { - typedef View DstViewType ; - - typedef typename DstViewType::shape_type shape_type ; - typedef typename DstViewType::memory_space memory_space ; - typedef typename DstViewType::memory_traits memory_traits ; - - dst.m_texture = src.m_texture ; - dst.m_stride = src.m_stride ; - - shape_type::assign( dst.m_shape, - src.m_shape.N0 , src.m_shape.N1 , src.m_shape.N2 , src.m_shape.N3 , - src.m_shape.N4 , src.m_shape.N5 , src.m_shape.N6 , src.m_shape.N7 ); - } -}; - - -template<> -struct ViewAssignment< CudaTexture , LayoutDefault , void > -{ - /** \brief Assign compatible views */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - inline - ViewAssignment( View & dst , - const View & src , - const typename enable_if<( - ViewAssignable< ViewTraits , ViewTraits >::value - )>::type * = 0 ) - { - typedef View DstViewType ; - - typedef typename DstViewType::shape_type shape_type ; - typedef typename DstViewType::scalar_type scalar_type ; - typedef typename DstViewType::stride_type stride_type ; - - dst.m_texture = CudaTextureFetch< scalar_type >( src.m_ptr_on_device ); - - shape_type::assign( dst.m_shape, - src.m_shape.N0 , src.m_shape.N1 , src.m_shape.N2 , src.m_shape.N3 , - src.m_shape.N4 , src.m_shape.N5 , src.m_shape.N6 , src.m_shape.N7 ); - - stride_type::assign( dst.m_stride , src.m_stride.value ); - } -}; - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -template< class T , class L, class D , class M > -class View< T , L , D , M , Impl::CudaTexture > - : public ViewTraits< T , L , D , M > -{ -public: - - typedef ViewTraits< T , L , D , M > traits ; - -private: - - template< class , class , class > friend struct Impl::ViewAssignment ; - - typedef Impl::LayoutStride< typename traits::shape_type , - typename traits::array_layout > stride_type ; - - Impl::CudaTextureFetch m_texture ; - typename traits::shape_type m_shape ; - stride_type m_stride ; - -public: - - typedef Impl::CudaTexture specialize ; - - typedef View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > const_type ; - - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type::host_mirror_device_type , - void > HostMirror ; - - enum { Rank = traits::rank }; - - KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_shape ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_shape.N0 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_shape.N1 ; } - 
KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_shape.N2 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_shape.N3 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_shape.N4 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_shape.N5 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_shape.N6 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_shape.N7 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type size() const - { - return m_shape.N0 - * m_shape.N1 - * m_shape.N2 - * m_shape.N3 - * m_shape.N4 - * m_shape.N5 - * m_shape.N6 - * m_shape.N7 - ; - } - - template< typename iType > - KOKKOS_INLINE_FUNCTION - typename traits::size_type dimension( const iType & i ) const - { return Impl::dimension( m_shape , i ); } - - //------------------------------------ - - View() : m_texture() - { - traits::shape_type::assign(m_shape,0,0,0,0,0,0,0,0); - stride_type::assign( m_stride , 0 ); - } - - ~View() {} - - View( const View & rhs ) - : m_texture( rhs.m_texture ) - , m_stride( rhs.m_stride ) - { m_shape = rhs.m_shape ; } - - View & operator = ( const View & rhs ) - { - (void)Impl::ViewAssignment< Impl::CudaTexture , Impl::CudaTexture >( *this , rhs ); - return *this ; - } - - template< class RT , class RL, class RD , class RM , class RS > - View( const View & rhs ) - : m_texture(0) - { - Impl::ViewAssignment< Impl::CudaTexture , RS >( *this , rhs ); - } - - template< class RT , class RL, class RD, class RM , class RS > - View & operator = ( const View & rhs ) - { - Impl::ViewAssignment< Impl::CudaTexture , RS >( *this , rhs ); - return *this ; - } - - //------------------------------------ - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_texture.ptr ; } - - //------------------------------------ - // Rank = 1 access operators: - - template < typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , traits , LayoutLeft , 1 , iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - return m_texture[ i0 ]; - } - - template < typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , traits , LayoutRight , 1 , iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - return m_texture[ i0 ]; - } - - template < typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , traits , LayoutLeft , 1 , iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - return m_texture[ i0 ]; - } - - template < typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , traits , LayoutRight , 1 , iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - return m_texture[ i0 ]; - } - - //------------------------------------ - // Layout left: - - - 
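// Illustrative host-side sketch, not from the Kokkos sources: the index linearization
// used by the rank-3 access operators that follow. LayoutLeft runs fastest in the first
// index with a padded leading stride (i0 + stride * (i1 + N1 * i2)); LayoutRight runs
// fastest in the last index with a padded row stride (i2 + N2 * i1 + i0 * stride).
// The extents, stride and indices below are made-up values.
#include <cstdio>

int main()
{
  const int N1 = 3, N2 = 4;          // second and third extents (m_shape.N1, m_shape.N2)
  const int stride = 8;              // padded stride, the role of m_stride.value
  const int i0 = 1, i1 = 2, i2 = 3;

  const int left_offset  = i0 + stride * (i1 + N1 * i2);   // LayoutLeft, rank 3
  const int right_offset = i2 + N2 * i1 + i0 * stride;     // LayoutRight, rank 3

  std::printf("LayoutLeft offset = %d, LayoutRight offset = %d\n", left_offset, right_offset);
  return 0;
}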
template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , traits, LayoutLeft, 2, iType0, iType1 >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * i1 ]; - } - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 3, iType0, iType1, iType2 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * i2 ) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 4, iType0, iType1, iType2, iType3 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * i3 )) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 5, iType0, iType1, iType2, iType3, iType4 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * i4 ))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * i5 )))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( 
typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * i6 ))))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutLeft, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * ( - i6 + m_shape.N6 * i7 )))))) ]; - } - - - //------------------------------------ - // Layout right: - - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 2, iType0, iType1 >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i1 + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 3, iType0, iType1, iType2 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i2 + m_shape.N2 * i1 + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 4, iType0, iType1, iType2, iType3 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 5, iType0, iType1, iType2, iType3, iType4 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename 
Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type , - traits, LayoutRight, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_texture.ptr ); - - return m_texture[ i7 + m_shape.N7 * ( - i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))))) + i0 * m_stride.value ]; - } - - //------------------------------------ - - KOKKOS_INLINE_FUNCTION - typename traits::scalar_type * ptr_on_device() const { return m_texture.ptr ; } - - // Stride of physical storage, dimensioned to at least Rank - template< typename iType > - KOKKOS_INLINE_FUNCTION - void stride( iType * const s ) const - { - enum { is_left = Impl::is_same< typename traits::array_layout , LayoutLeft >::value }; - - if ( 1 == Rank ) { - s[0] = 1 ; - } - else if ( is_left ) { - s[0] = 1 ; - s[1] = m_stride.value ; - for ( int i = 2 ; i < Rank ; ++i ) { s[i] = s[i-1] * dimension(i-1); } - } - else { - s[0] = m_stride.value ; - s[Rank-1] = 1 ; - for ( int i = Rank - 2 ; 0 < i ; --i ) { s[i] = s[i+1] * dimension(i+1); } - } - } -}; - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */ - diff --git a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp deleted file mode 100644 index e0d2fcc..0000000 --- a/kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/* -//@HEADER -// 
************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_ABORT_HPP -#define KOKKOS_CUDA_ABORT_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) - -#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 401 ) -#errof "Cuda version 4.1 or greater required" -#endif - -#if ( __CUDA_ARCH__ < 200 ) -#error "Cuda device capability 2.0 or greater required" -#endif - -extern "C" { -/* Cuda runtime function, declared in - * Requires capability 2.x or better. 
- */ -extern __device__ void __assertfail( - const void *message, - const void *file, - unsigned int line, - const void *function, - size_t charsize); -} - -namespace Kokkos { - -__device__ inline -void cuda_abort( const char * const message ) -{ - const char empty[] = "" ; - - __assertfail( (const void *) message , - (const void *) empty , - (unsigned int) 0 , - (const void *) empty , - sizeof(char) ); -} - -} // namespace Kokkos - -#else - -namespace Kokkos { -KOKKOS_INLINE_FUNCTION -void cuda_abort( const char * const ) {} -} - -#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_Atomic.hpp b/kokkos/kokkos/core/src/Kokkos_Atomic.hpp deleted file mode 100644 index 407d425..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Atomic.hpp +++ /dev/null @@ -1,159 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Kokkos_Atomic.hpp -/// \brief Atomic functions -/// -/// This header file defines prototypes for the following atomic functions: -/// - exchange -/// - compare and exchange -/// - add -/// -/// Supported types include: -/// - signed and unsigned 4 and 8 byte integers -/// - float -/// - double -/// -/// They are implemented through GCC compatible intrinsics, OpenMP -/// directives and native CUDA intrinsics. 
-/// -/// Including this header file requires one of the following -/// compilers: -/// - NVCC (for CUDA device code only) -/// - GCC (for host code only) -/// - Intel (for host code only) -/// - A compiler that supports OpenMP 3.1 (for host code only) - -#ifndef KOKKOS_ATOMIC_HPP -#define KOKKOS_ATOMIC_HPP - -#include -#include - -//---------------------------------------------------------------------------- - -#if defined( __CUDA_ARCH__ ) - -// Compiling NVIDIA device code, must use Cuda atomics: - -#define KOKKOS_ATOMICS_USE_CUDA - -#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \ - ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \ - ! defined( KOKKOS_ATOMICS_USE_OMP31 ) - -// Compiling for non-Cuda atomic implementation has not been pre-selected. -// Choose the best implementation for the detected compiler. -// Preference: GCC, INTEL, OMP31 - -#if defined( __GNUC__ ) || defined( __GNUG__ ) - -#define KOKKOS_ATOMICS_USE_GCC - -#elif defined( __INTEL_COMPILER ) || defined( _CRAYC) - -#define KOKKOS_ATOMICS_USE_INTEL - -#elif defined( _OPENMP ) && ( 201107 <= _OPENMP ) - -#define KOKKOS_ATOMICS_USE_OMP31 - -#else - -#error "KOKKOS_ATOMICS_USE : Unsupported compiler" - -#endif - -#endif /* Not pre-selected atomic implementation */ - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -inline -const char * atomic_query_version() -{ -#if defined( KOKKOS_ATOMICS_USE_CUDA ) - return "KOKKOS_ATOMICS_USE_CUDA" ; -#elif defined( KOKKOS_ATOMICS_USE_GCC ) - return "KOKKOS_ATOMICS_USE_GCC" ; -#elif defined( KOKKOS_ATOMICS_USE_INTEL ) - return "KOKKOS_ATOMICS_USE_INTEL" ; -#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) - return "KOKKOS_ATOMICS_USE_OMP31" ; -#endif -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -// Atomic exchange -// -// template< typename T > -// T atomic_exchange( volatile T* const dest , const T val ) -// { T tmp = *dest ; *dest = val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Exchange.hpp" - -//---------------------------------------------------------------------------- -// Atomic compare-and-exchange -// -// template -// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val) -// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; } - -#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp" - -//---------------------------------------------------------------------------- -// Atomic fetch and add -// -// template -// T atomic_fetch_add(volatile T* const dest, const T val) -// { T tmp = *dest ; *dest += val ; return tmp ; } - -#include "impl/Kokkos_Atomic_Fetch_Add.hpp" - -//---------------------------------------------------------------------------- - -#endif /* KOKKOS_ATOMIC_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_CrsArray.hpp b/kokkos/kokkos/core/src/Kokkos_CrsArray.hpp deleted file mode 100644 index 8f1b838..0000000 --- a/kokkos/kokkos/core/src/Kokkos_CrsArray.hpp +++ /dev/null @@ -1,170 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
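The deleted Kokkos_Atomic.hpp above documents each atomic operation by its sequential semantics (the commented pseudo-code for atomic_exchange, atomic_compare_exchange_strong and atomic_fetch_add). The following stand-alone sketch simply writes those documented semantics out in plain host C++ so the contract is explicit; the helper names are hypothetical and this is not the Kokkos implementation, which dispatches to GCC/Intel/OpenMP 3.1 or CUDA intrinsics.

    #include <cassert>

    // Sequential meaning of atomic_fetch_add: return the old value, add val.
    template< typename T >
    T fetch_add_semantics( volatile T * const dest , const T val )
    { T tmp = *dest ; *dest += val ; return tmp ; }

    // Sequential meaning of atomic_compare_exchange_strong:
    // store val only if *dest equals compare; report whether it matched.
    template< typename T >
    bool compare_exchange_semantics( volatile T * const dest ,
                                     const T compare , const T val )
    { const bool equal = ( compare == *dest ); if ( equal ) { *dest = val ; } return equal ; }

    int main()
    {
      int counter = 0 ;
      assert( fetch_add_semantics( &counter , 5 ) == 0 );      // returns old value
      assert( counter == 5 );
      assert(  compare_exchange_semantics( &counter , 5 , 7 ) ); // matches, stores 7
      assert( !compare_exchange_semantics( &counter , 5 , 9 ) ); // no match, unchanged
      assert( counter == 7 );
      return 0 ;
    }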
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CRSARRAY_HPP -#define KOKKOS_CRSARRAY_HPP - -#include -#include - -#include - -namespace Kokkos { - -/// \class CrsArray -/// \brief Compressed row storage array. -/// -/// \tparam DataType The type of stored entries. If a CrsArray is -/// used as the graph of a sparse matrix, then this is usually an -/// integer type, the type of the column indices in the sparse -/// matrix. -/// -/// \tparam Arg1Type The second template parameter, corresponding -/// either to the Device type (if there are no more template -/// parameters) or to the Layout type (if there is at least one more -/// template parameter). -/// -/// \tparam Arg2Type The third template parameter, which if provided -/// corresponds to the Device type. -/// -/// \tparam SizeType The type of row offsets. Usually the default -/// parameter suffices. However, setting a nondefault value is -/// necessary in some cases, for example, if you want to have a -/// sparse matrices with dimensions (and therefore column indices) -/// that fit in \c int, but want to store more than INT_MAX -/// entries in the sparse matrix. -/// -/// A row has a range of entries: -///
-///   - row_map[i0] <= entry < row_map[i0+1]
-///   - 0 <= i1 < row_map[i0+1] - row_map[i0]
-///   - entries( entry , i2 , i3 , ... );
-///   - entries( row_map[i0] + i1 , i2 , i3 , ... );
-///
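The row_map/entries contract documented in the comment above is the usual compressed-row traversal: row i0 owns entries[ row_map[i0] ] through entries[ row_map[i0+1] - 1 ]. A minimal stand-alone sketch of that indexing, using std::vector in place of the Kokkos Views so it runs on its own:

    #include <cstdio>
    #include <vector>

    int main()
    {
      const std::vector<int> row_map = { 0 , 2 , 5 , 6 };        // 3 rows
      const std::vector<int> entries = { 4 , 7 , 1 , 3 , 9 , 2 };

      for ( std::size_t i0 = 0 ; i0 + 1 < row_map.size() ; ++i0 ) {
        std::printf( "row %d:" , int(i0) );
        for ( int e = row_map[i0] ; e < row_map[i0+1] ; ++e ) {  // 'e' is the entry index
          std::printf( " %d" , entries[e] );
        }
        std::printf( "\n" );
      }
      return 0 ;
    }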
-template< class DataType, - class Arg1Type, - class Arg2Type = void, - typename SizeType = typename ViewTraits::size_type> -class CrsArray { -private: - typedef ViewTraits traits; - -public: - typedef DataType data_type; - typedef typename traits::array_layout array_layout; - typedef typename traits::device_type device_type; - typedef SizeType size_type; - - typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type; - typedef CrsArray< DataType , array_layout , typename device_type::host_mirror_device_type , SizeType > HostMirror; - typedef View< const size_type* , array_layout, device_type > row_map_type; - typedef View< DataType* , array_layout, device_type > entries_type; - - entries_type entries; - row_map_type row_map; - - //! Construct an empty view. - CrsArray () : entries(), row_map() {} - - //! Copy constructor (shallow copy). - CrsArray (const CrsArray& rhs) : entries (rhs.entries), row_map (rhs.row_map) - {} - - /** \brief Assign to a view of the rhs array. - * If the old view is the last view - * then allocated memory is deallocated. - */ - CrsArray& operator= (const CrsArray& rhs) { - entries = rhs.entries; - row_map = rhs.row_map; - return *this; - } - - /** \brief Destroy this view of the array. - * If the last view then allocated memory is deallocated. - */ - ~CrsArray() {} -}; - -//---------------------------------------------------------------------------- - -template< class CrsArrayType , class InputSizeType > -typename CrsArrayType::crsarray_type -create_crsarray( const std::string & label , - const std::vector< InputSizeType > & input ); - -template< class CrsArrayType , class InputSizeType > -typename CrsArrayType::crsarray_type -create_crsarray( const std::string & label , - const std::vector< std::vector< InputSizeType > > & input ); - -//---------------------------------------------------------------------------- - -template< class DataType , - class Arg1Type , - class Arg2Type , - typename SizeType > -typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const CrsArray & input ); - -template< class DataType , - class Arg1Type , - class Arg2Type , - typename SizeType > -typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror( const CrsArray & input ); - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_Cuda.hpp b/kokkos/kokkos/core/src/Kokkos_Cuda.hpp deleted file mode 100644 index 7434be2..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Cuda.hpp +++ /dev/null @@ -1,323 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. 
Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDA_HPP -#define KOKKOS_CUDA_HPP - -#include -#include - -#include -#ifdef KOKKOS_HAVE_OPENMP -#include -#else -#ifdef KOKKOS_HAVE_PTHREAD -#include -#else -#include -#endif -#endif -#include -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { -class CudaExec ; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/// \class Cuda -/// \brief Kokkos device that uses CUDA to run on GPUs. -/// -/// A "device" represents a parallel execution model. It tells Kokkos -/// how to parallelize the execution of kernels in a parallel_for or -/// parallel_reduce. For example, the Threads device uses Pthreads or -/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language -/// extensions, and the Serial device executes "parallel" kernels -/// sequentially. The Cuda device uses NVIDIA's CUDA programming -/// model to execute kernels in parallel on GPUs. -class Cuda { -public: - //! \name Type declarations that all Kokkos devices must provide. - //@{ - - //! The device type (same as this class). - typedef Cuda device_type ; - //! This device's preferred memory space. - typedef CudaSpace memory_space ; - //! The size_type typedef best suited for this device. - typedef CudaSpace::size_type size_type ; - //! This device's preferred array layout. - typedef LayoutLeft array_layout ; - //! This device's host mirror type. -#ifdef KOKKOS_HAVE_OPENMP - typedef Kokkos::OpenMP host_mirror_device_type ; -#else -#ifdef KOKKOS_HAVE_PTHREAD - typedef Kokkos::Threads host_mirror_device_type ; -#else - typedef Kokkos::Serial host_mirror_device_type ; -#endif -#endif - //@} - //! \name Functions that all Kokkos devices must implement. - //@{ - - /// \brief True if and only if this method is being called in a - /// thread-parallel function. 
- KOKKOS_INLINE_FUNCTION static int in_parallel() { -#if defined( __CUDA_ARCH__ ) - return true; -#else - return false; -#endif - } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); - - /// \brief Wait until all dispatched functors complete. - /// - /// The parallel_for or parallel_reduce dispatch of a functor may - /// return asynchronously, before the functor completes. This - /// method does not return until all dispatched functors on this - /// device have completed. - static void fence(); - - //! Free any resources being consumed by the device. - static void finalize(); - - //! Print configuration information to the given output stream. - static void print_configuration( std::ostream & , const bool detail = false ); - - //@} - //-------------------------------------------------------------------------- - //! \name Device-specific functions - //@{ - - struct SelectDevice { - int cuda_device_id ; - SelectDevice() : cuda_device_id(0) {} - explicit SelectDevice( int id ) : cuda_device_id( id ) {} - }; - - //! Initialize, telling the CUDA run-time library which device to use. - static void initialize( const SelectDevice = SelectDevice() ); - - static int is_initialized(); - - /// \brief Cuda device architecture of the selected device. - /// - /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); - - //! Query device count. - static size_type detect_device_count(); - - /** \brief Detect the available devices and their architecture - * as defined by the __CUDA_ARCH__ specification. - */ - static std::vector detect_device_arch(); - - static unsigned team_max(); - - //@} - //-------------------------------------------------------------------------- -#if defined( __CUDA_ARCH__ ) - //! \name Functions for the functor device interface - //@{ - - __device__ inline int league_size() const { return gridDim.x ; } - __device__ inline int league_rank() const { return blockIdx.x ; } - - __device__ inline int team_size() const { return blockDim.x ; } - __device__ inline int team_rank() const { return threadIdx.x ; } - - __device__ inline void team_barrier() const { __syncthreads(); } - __device__ inline unsigned int team_barrier_count(bool value) const - { return __syncthreads_count(value); } - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering. - * - * The highest rank thread can compute the reduction total as - * reduction_total = dev.team_scan( value ) + value ; - */ - template< typename Type > - __device__ inline Type team_scan( const Type & value ); - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering - * with intra-team non-deterministic ordering accumulation. - * - * The global inter-team accumulation value will, at the end of the - * league's parallel execution, be the scan's total. 
- * Parallel execution ordering of the league's teams is non-deterministic. - * As such the base value for each team's scan operation is similarly - * non-deterministic. - */ - template< typename TypeLocal , typename TypeGlobal > - __device__ inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); - - - //! Get a pointer to shared memory for this team. - __device__ inline void * get_shmem( const int size ); - - __device__ inline Cuda( Impl::CudaExec & exec ) : m_exec(exec) {} - __device__ inline Cuda( const Cuda & rhs ) : m_exec(rhs.m_exec) {} - - //@} - //-------------------------------------------------------------------------- - -private: - - Impl::CudaExec & m_exec ; - - //-------------------------------------------------------------------------- -#else - - int league_size() const ; - int league_rank() const ; - - int team_size() const ; - int team_rank() const ; - - void team_barrier() const ; - unsigned int team_barrier_count(bool) const ; - - template< typename T > - inline T team_scan(const T& value); - - template< typename TypeLocal , typename TypeGlobal > - inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); - - void * get_shmem( const int size ); - - Cuda( Impl::CudaExec & ); - -#endif - -}; - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/** \brief Cuda-specific parallel work configuration */ - -struct CudaWorkConfig { - Cuda::size_type grid[3] ; //< Grid dimensions - Cuda::size_type block[3] ; //< Block dimensions - Cuda::size_type shared ; //< Shared memory size - - CudaWorkConfig() - { - enum { WarpSize = 32 }; - grid[0] = grid[1] = grid[2] = 1 ; - block[1] = block[2] = 1 ; - block[0] = 8 * WarpSize ; - shared = 0 ; - } -}; - -template< class FunctorType > -inline -void parallel_for( const CudaWorkConfig & work_config , - const FunctorType & functor ) -{ - Impl::ParallelFor< FunctorType , CudaWorkConfig , Cuda > - ( work_config , functor ); -} - -template< class FunctorType , class FinalizeType > -inline -void parallel_reduce( const CudaWorkConfig & work_config , - const FunctorType & functor , - const FinalizeType & finalize ); - -template< class FunctorType > -inline -typename FunctorType::value_type -parallel_reduce( const CudaWorkConfig & work_config , - const FunctorType & functor ); - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -#include -#include -#include - -#endif /* #ifndef KOKKOS_CUDA_HPP */ - -//---------------------------------------------------------------------------- - - diff --git a/kokkos/kokkos/core/src/Kokkos_CudaSpace.hpp b/kokkos/kokkos/core/src/Kokkos_CudaSpace.hpp deleted file mode 100644 index e89ac20..0000000 --- a/kokkos/kokkos/core/src/Kokkos_CudaSpace.hpp +++ /dev/null @@ -1,179 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDASPACE_HPP -#define KOKKOS_CUDASPACE_HPP - -#if defined( __CUDACC__ ) -#include -#endif - -#include -#include -#include - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/** \brief Cuda memory management */ - -class CudaSpace { -public: - - typedef CudaSpace memory_space ; - typedef unsigned int size_type ; - - /** \brief Allocate a contiguous block of memory on the Cuda device - * with size = scalar_size * scalar_count. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - * - * Allocation may only occur on the master thread of the process. - */ - static void * allocate( const std::string & label , - const std::type_info & scalar_type , - const size_t scalar_size , - const size_t scalar_count ); - - /** \brief Increment the reference count of the block of memory - * in which the input pointer resides. - * - * Reference counting only occurs on the master thread. - */ - static void increment( const void * ); - - /** \brief Decrement the reference count of the block of memory - * in which the input pointer resides. If the reference - * count falls to zero the memory is deallocated. - * - * Reference counting only occurs on the master thread. - */ - static void decrement( const void * ); - - /** \brief Print all tracked memory to the output stream. 
*/ - static void print_memory_view( std::ostream & ); - - /** \brief Retrieve label associated with the input pointer */ - static std::string query_label( const void * ); - - /*--------------------------------*/ - - static void access_error(); - static void access_error( const void * const ); - - /*--------------------------------*/ -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template<> -struct DeepCopy { - DeepCopy( void * dst , const void * src , size_t ); -}; - -template<> -struct DeepCopy { - DeepCopy( void * dst , const void * src , size_t ); -}; - -template<> -struct DeepCopy { - DeepCopy( void * dst , const void * src , size_t ); -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \brief Cuda code accessing Cuda data is good. */ -template<> -struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , CudaSpace > -{ - KOKKOS_INLINE_FUNCTION static void verify( void ) {} - KOKKOS_INLINE_FUNCTION static void verify( const void * ) {} -}; - -/** \brief Cuda code accessing non-Cuda data is bad. */ -template<> -struct VerifyExecutionSpaceCanAccessDataSpace< CudaSpace , HostSpace > -{ - KOKKOS_INLINE_FUNCTION static void verify(void) - { Kokkos::cuda_abort("Cuda code called function restricted to HostSpace"); } - - KOKKOS_INLINE_FUNCTION static void verify( const void * ) - { Kokkos::cuda_abort("Cuda code attempted to access HostSpace memory"); } -}; - -/** \brief Produce error message when trying to access Cuda - * memory on the host. - */ -template<> -struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , CudaSpace > -{ - inline static void verify( void ) { CudaSpace::access_error(); } - inline static void verify( const void * p ) { CudaSpace::access_error(p); } -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #define KOKKOS_CUDASPACE_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_CudaTypes.hpp b/kokkos/kokkos/core/src/Kokkos_CudaTypes.hpp deleted file mode 100644 index 899e7e1..0000000 --- a/kokkos/kokkos/core/src/Kokkos_CudaTypes.hpp +++ /dev/null @@ -1,139 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_CUDATYPES_HPP -#define KOKKOS_CUDATYPES_HPP - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) - -namespace Kokkos { - -typedef ::int2 int2 ; -typedef ::int3 int3 ; -typedef ::int4 int4 ; - -typedef ::float2 float2 ; -typedef ::float3 float3 ; -typedef ::float4 float4 ; - -typedef ::double2 double2 ; -typedef ::double3 double3 ; -typedef ::double4 double4 ; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#else /* NOT #if defined( __CUDACC__ ) */ - -namespace Kokkos { - -struct int2 { - int x; - int y; -}; - -struct int3 { - int x; - int y; - int z; -}; - -struct int4 { - int x; - int y; - int z; - int w; -}; - -struct float2 { - float x; - float y; -}; - -struct float3 { - float x; - float y; - float z; -}; - -struct float4 { - float x; - float y; - float z; - float w; -}; - -struct double2 { - double x; - double y; -}; - -struct double3 { - double x; - double y; - double z; -}; - -struct double4 { - double x; - double y; - double z; - double w; -}; - -} // namespace Kokkos - -#endif - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #define KOKKOS_CUDATYPES_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_HostSpace.hpp b/kokkos/kokkos/core/src/Kokkos_HostSpace.hpp deleted file mode 100644 index 028a403..0000000 --- a/kokkos/kokkos/core/src/Kokkos_HostSpace.hpp +++ /dev/null @@ -1,143 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_HOSTSPACE_HPP -#define KOKKOS_HOSTSPACE_HPP - -#include -#include -#include - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/** \brief Memory management on the host for devices */ - -class HostSpace { -public: - - typedef HostSpace memory_space ; - typedef size_t size_type ; - - /** \brief Allocate a contiguous block of memory on the Cuda device - * with size = scalar_size * scalar_count. - * - * The input label is associated with the block of memory. - * The block of memory is tracked via reference counting where - * allocation gives it a reference count of one. - * - * Allocation may only occur on the master thread of the process. - */ - static void * allocate( const std::string & label , - const std::type_info & scalar_type , - const size_t scalar_size , - const size_t scalar_count ); - - /** \brief Increment the reference count of the block of memory - * in which the input pointer resides. - * - * Reference counting only occurs on the master thread. - */ - static void increment( const void * ); - - /** \brief Decrement the reference count of the block of memory - * in which the input pointer resides. If the reference - * count falls to zero the memory is deallocated. - * - * Reference counting only occurs on the master thread. - */ - static void decrement( const void * ); - - /*--------------------------------*/ - - /** \brief Print all tracked memory to the output stream. 
*/ - static void print_memory_view( std::ostream & ); - - /** \brief Retrieve label associated with the input pointer */ - static std::string query_label( const void * ); - - /*--------------------------------*/ - /* Functions unique to the HostSpace */ - - static int in_parallel(); - - static void register_in_parallel( int (*)() ); -}; - -//---------------------------------------------------------------------------- - -template< class ExecutionSpace , class DataSpace > -struct VerifyExecutionSpaceCanAccessDataSpace ; - -template<> -struct VerifyExecutionSpaceCanAccessDataSpace< HostSpace , HostSpace > -{ - inline static void verify(void) {} - inline static void verify(const void *) {} -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class , class > struct DeepCopy ; - -template<> -struct DeepCopy { - DeepCopy( void * dst , const void * src , size_t n ); -}; - -} // namespace Impl -} // namespace Kokkos - -#endif /* #define KOKKOS_HOSTSPACE_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_Layout.hpp b/kokkos/kokkos/core/src/Kokkos_Layout.hpp deleted file mode 100644 index f026806..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Layout.hpp +++ /dev/null @@ -1,116 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Kokkos_Layout.hpp -/// \brief Declaration of various \c MemoryLayout options. 
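The LayoutLeft and LayoutRight tags declared below describe how a multi-index maps to a memory offset: LayoutLeft makes the leftmost index stride-1 (Fortran / column major), LayoutRight makes the rightmost index stride-1 (C / row major). A small sketch of the two mappings for a hypothetical unpadded N0 x N1 array (the deleted View code additionally folds a padded stride, m_stride, into the non-contiguous dimension):

    #include <cstdio>

    int main()
    {
      const int N0 = 3 , N1 = 4 ;
      const int i0 = 2 , i1 = 1 ;

      // LayoutLeft ("column major"): i0 is the stride-1 index.
      const int offset_left  = i0 + N0 * i1 ;
      // LayoutRight ("row major"): i1 is the stride-1 index.
      const int offset_right = i1 + N1 * i0 ;

      std::printf( "left=%d right=%d\n" , offset_left , offset_right ); // left=5 right=9
      return 0 ;
    }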
- -#ifndef KOKKOS_LAYOUT_HPP -#define KOKKOS_LAYOUT_HPP - -#include - -namespace Kokkos { - -/// \struct LayoutLeft -/// \brief Memory layout tag indicating left-to-right (Fortran scheme) -/// striding of multi-indices. -/// -/// This is an example of a \c MemoryLayout template parameter of -/// View. The memory layout describes how View maps from a -/// multi-index (i0, i1, ..., ik) to a memory location. -/// -/// "Layout left" indicates a mapping where the leftmost index i0 -/// refers to contiguous access, and strides increase for dimensions -/// going right from there (i1, i2, ...). This layout imitates how -/// Fortran stores multi-dimensional arrays. For the special case of -/// a two-dimensional array, "layout left" is also called "column -/// major." -struct LayoutLeft { typedef LayoutLeft array_layout ; }; - -/// \struct LayoutRight -/// \brief Memory layout tag indicating right-to-left (C or -/// lexigraphical scheme) striding of multi-indices. -/// -/// This is an example of a \c MemoryLayout template parameter of -/// View. The memory layout describes how View maps from a -/// multi-index (i0, i1, ..., ik) to a memory location. -/// -/// "Right layout" indicates a mapping where the rightmost index ik -/// refers to contiguous access, and strides increase for dimensions -/// going left from there. This layout imitates how C stores -/// multi-dimensional arrays. For the special case of a -/// two-dimensional array, "layout right" is also called "row major." -struct LayoutRight { typedef LayoutRight array_layout ; }; - -/// \struct LayoutTileLeft -/// \brief Memory layout tag indicating left-to-right (Fortran scheme) -/// striding of multi-indices by tiles. -/// -/// This is an example of a \c MemoryLayout template parameter of -/// View. The memory layout describes how View maps from a -/// multi-index (i0, i1, ..., ik) to a memory location. -/// -/// "Tiled layout" indicates a mapping to contiguously stored -/// ArgN0 by ArgN1 tiles for the rightmost two -/// dimensions. Indices are LayoutLeft within each tile, and the -/// tiles themselves are arranged using LayoutLeft. Note that the -/// dimensions ArgN0 and ArgN1 of the tiles must be -/// compile-time constants. This speeds up index calculations. If -/// both tile dimensions are powers of two, Kokkos can optimize -/// further. -template < unsigned ArgN0 , unsigned ArgN1 , - bool IsPowerOfTwo = ( Impl::is_power_of_two::value && - Impl::is_power_of_two::value ) - > -struct LayoutTileLeft { - typedef LayoutTileLeft array_layout ; - enum { N0 = ArgN0 }; - enum { N1 = ArgN1 }; -}; - -} // namespace Kokkos - -#endif // #ifndef KOKKOS_LAYOUT_HPP - diff --git a/kokkos/kokkos/core/src/Kokkos_Macros.hpp b/kokkos/kokkos/core/src/Kokkos_Macros.hpp deleted file mode 100644 index b46ce32..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Macros.hpp +++ /dev/null @@ -1,220 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_MACROS_HPP -#define KOKKOS_MACROS_HPP - -#include - -namespace Kokkos { -class HostSpace ; -class CudaSpace ; -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) && ! defined( KOKKOS_HAVE_CUDA ) -#error "Compiling Kokkos with Cuda compiler but KOKKOS_HAVE_CUDA is undefined" -#endif - -#if defined( _OPENMP ) && ! defined( KOKKOS_HAVE_OPENMP ) -#error "Compiling Kokkos for OpenMP but KOKKOS_HAVE_OPENMP is undefined" -#endif - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) - -#include - -/* Compiling with a CUDA compiler for device code. - * - * Include to pick up the CUDA_VERSION macro defined as: - * CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 ) - * - * When generating device code the __CUDA_ARCH__ macro is defined as: - * __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 ) - */ -#if ! defined( CUDA_VERSION ) -#error "#include did not define CUDA_VERSION" -#endif - -#if ( CUDA_VERSION < 4010 ) -#error "Cuda version 4.1 or greater required" -#endif - -#endif /* #if defined( __CUDACC__ ) */ - -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) - -/* Compiling with CUDA compiler for device code. */ - -#if ( __CUDA_ARCH__ < 200 ) -#error "Cuda device capability >= 2.0 is required" -#endif - -#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__ -#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline -#define KOKKOS_FUNCTION __device__ __host__ - -#endif /* #if defined( __CUDACC__ ) && #if defined( __CUDA_ARCH__ ) */ - -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ ) - -/* Compiling with CUDA compiler for host code. */ - -#define KOKKOS_FORCEINLINE_FUNCTION __forceinline__ - -#endif /* #if defined( __CUDACC__ ) && ! 
defined( __CUDA_ARCH__ ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __INTEL_COMPILER ) - -/* Compiling with Intel compiler */ -/* TBD: Version testing */ - -#ifndef KOKKOS_FORCEINLINE_FUNCTION -#define KOKKOS_FORCEINLINE_FUNCTION __forceinline -#endif - -#if defined( __MIC__ ) - -/* Compiling with Intel compiler for execution on an Intel MIC device. - * These devices are used in no-offload mode so the HostSpace is the MIC space. - */ - -#else - -#ifndef KOKKOS_USE_PRAGMA_SIMD -#define KOKKOS_USE_PRAGMA_SIMD -#endif - -/* - #pragma simd vectorlength(N) - #pragma ivdep -*/ - -#endif /* #if defined( __MIC__ ) */ - -#endif /* #if defined( __INTEL_COMPILER ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __GNUC__ ) /* GNU C */ || \ - defined( __GNUG__ ) /* GNU C++ */ - -/* Compiling with GNU compiler */ - -#ifndef KOKKOS_FORCEINLINE_FUNCTION -#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) -#endif - -/* Compiling with GNU compatible compiler. */ - -#endif - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( _OPENMP ) - -/* Compiling with in OpenMP mode. - * The value of _OPENMP is an integer value YYYYMM - * where YYYY and MM are the year and month designation - * of the supported OpenMP API version. - */ - -#endif /* END: #if defined( _OPENMP ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#ifndef KOKKOS_FUNCTION -#define KOKKOS_FUNCTION /* */ -#endif - -#ifndef KOKKOS_INLINE_FUNCTION -#define KOKKOS_INLINE_FUNCTION inline -#endif - -#ifndef KOKKOS_FORCEINLINE_FUNCTION -#define KOKKOS_FORCEINLINE_FUNCTION inline -#endif - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) - -namespace Kokkos { typedef CudaSpace ExecutionSpace ; } - -#else - -namespace Kokkos { typedef HostSpace ExecutionSpace ; } - -#endif - -#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \ - Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \ - Kokkos::ExecutionSpace , DATA_SPACE >::verify( DATA_PTR ) - -#define KOKKOS_RESTRICT_EXECUTION_TO( DATA_SPACE ) \ - Kokkos::VerifyExecutionSpaceCanAccessDataSpace< \ - Kokkos::ExecutionSpace , DATA_SPACE >::verify() - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_MACROS_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_MemoryTraits.hpp b/kokkos/kokkos/core/src/Kokkos_MemoryTraits.hpp deleted file mode 100644 index e1bbc35..0000000 --- a/kokkos/kokkos/core/src/Kokkos_MemoryTraits.hpp +++ /dev/null @@ -1,111 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. 
Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_MEMORYTRAITS_HPP -#define KOKKOS_MEMORYTRAITS_HPP - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \brief Memory access traits for views, an extension point. - * - * These traits should be orthogonal. If there are dependencies then - * the MemoryTraits template must detect and enforce dependencies. - * - * A zero value is the default for a View, indicating that none of - * these traits are present. - */ -enum MemoryTraitsFlags - { Unmanaged = 0x01 - , RandomRead = 0x02 - }; - -template < unsigned T > -struct MemoryTraits { - enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) }; - enum { RandomRead = T & unsigned(Kokkos::RandomRead) }; - - typedef MemoryTraits memory_traits ; -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -typedef Kokkos::MemoryTraits<0> MemoryManaged ; -typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ; -typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomRead > MemoryRandomRead ; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Memory alignment settings - * - * Sets global value for memory alignment. - * Enable compatibility of views from different devices with static stride. - * Use compiler flag to enable overwrites. 
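The trait flags defined above occupy distinct bits, so traits compose with bitwise-or, and MemoryTraits<T> recovers each flag with a mask. A stand-alone sketch that mirrors the deleted header (placed in a hypothetical namespace ks rather than including Kokkos) to show the composition:

    #include <cstdio>

    namespace ks {   // hypothetical stand-in for the Kokkos namespace

    enum MemoryTraitsFlags { Unmanaged = 0x01 , RandomRead = 0x02 };

    template< unsigned T >
    struct MemoryTraits {
      enum { Unmanaged  = T & unsigned(ks::Unmanaged)  };
      enum { RandomRead = T & unsigned(ks::RandomRead) };
    };

    typedef MemoryTraits<0>                        MemoryManaged ;
    typedef MemoryTraits< Unmanaged >              MemoryUnmanaged ;
    typedef MemoryTraits< Unmanaged | RandomRead > MemoryRandomRead ;

    } // namespace ks

    int main()
    {
      // MemoryRandomRead carries both flags; each mask is nonzero.
      std::printf( "unmanaged=%d random-read=%d\n" ,
                   int( ks::MemoryRandomRead::Unmanaged  != 0 ) ,
                   int( ks::MemoryRandomRead::RandomRead != 0 ) );
      return 0 ;
    }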
- */ -enum { MEMORY_ALIGNMENT = -#if defined( KOKKOS_MEMORY_ALIGNMENT ) - KOKKOS_MEMORY_ALIGNMENT -#else - 128 -#endif - }; - -enum { MEMORY_ALIGNMENT_THRESHOLD = 4 }; - -} //namespace Impl -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_OpenMP.hpp b/kokkos/kokkos/core/src/Kokkos_OpenMP.hpp deleted file mode 100644 index 3b5ffed..0000000 --- a/kokkos/kokkos/core/src/Kokkos_OpenMP.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_OPENMP_HPP -#define KOKKOS_OPENMP_HPP - -#include - -#if defined(KOKKOS_HAVE_OPENMP) - -#include -#include -#include -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { -class OpenMPexec ; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/// \class OpenMP -/// \brief Kokkos device for multicore processors in the host memory space. -class OpenMP { -public: - //------------------------------------ - //! \name Type declarations that all Kokkos devices must provide. - //@{ - - typedef OpenMP device_type ; - typedef HostSpace::size_type size_type ; - typedef HostSpace memory_space ; - typedef LayoutRight array_layout ; - typedef OpenMP host_mirror_device_type ; - - //@} - //------------------------------------ - //! \name Functions that all Kokkos devices must implement. 
- //@{ - - inline static bool in_parallel() { return omp_in_parallel(); } - - /** \brief Set the device in a "sleep" state. A noop for OpenMP. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); - - /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ - static void fence() {} - - /// \brief Print configuration information to the given output stream. - static void print_configuration( std::ostream & , const bool detail = false ); - - /// \brief Free any resources being consumed by the device. - static void finalize(); - - /** \brief Initialize the device. - * - * 1) If the hardware locality library is enabled and OpenMP has not - * already bound threads then bind OpenMP threads to maximize - * core utilization and group for memory hierarchy locality. - * - * 2) Allocate a HostThread for each OpenMP thread to hold its - * topology and fan in/out data. - */ - static void initialize( const unsigned team_count = 1 , - const unsigned threads_per_team = 1 , - const unsigned use_numa_count = 0 , - const unsigned use_cores_per_numa = 0 ); - - static int is_initialized(); - - static unsigned league_max(); - static unsigned team_max(); - //@} - //------------------------------------ - //! \name Function for the functor device interface */ - //@{ - - inline int league_rank() const ; - inline int league_size() const ; - inline int team_rank() const ; - inline int team_size() const ; - - inline void team_barrier(); - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering. - * - * The highest rank thread can compute the reduction total as - * reduction_total = dev.team_scan( value ) + value ; - */ - template< typename Type > - inline Type team_scan( const Type & value ); - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering - * with intra-team non-deterministic ordering accumulation. - * - * The global inter-team accumulation value will, at the end of the - * league's parallel execution, be the scan's total. - * Parallel execution ordering of the league's teams is non-deterministic. - * As such the base value for each team's scan operation is similarly - * non-deterministic. - */ - template< typename TypeLocal , typename TypeGlobal > - inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); - - - inline void * get_shmem( const int size ); - - explicit inline OpenMP( Impl::OpenMPexec & ); - - //------------------------------------ - -private: - - Impl::OpenMPexec & m_exec ; - -}; - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#endif /* #if defined(KOKKOS_HAVE_OPENMP) */ -#endif /* #ifndef KOKKOS_OPENMP_HPP */ - - diff --git a/kokkos/kokkos/core/src/Kokkos_Parallel.hpp b/kokkos/kokkos/core/src/Kokkos_Parallel.hpp deleted file mode 100644 index 06cc14c..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Parallel.hpp +++ /dev/null @@ -1,529 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Kokkos_Parallel.hpp -/// \brief Declaration of parallel operators - -#ifndef KOKKOS_PARALLEL_HPP -#define KOKKOS_PARALLEL_HPP - -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/// \class ParallelFor -/// \brief Implementation of the ParallelFor operator that has a -/// partial specialization for the device. -/// -/// This is an implementation detail of parallel_for. Users should -/// skip this and go directly to the nonmember function parallel_for. -template< class FunctorType , - class WorkSpec , - class DeviceType = typename FunctorType::device_type > -class ParallelFor ; - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -/// \class VectorParallel -/// \brief Request for parallel_for to attempt thread+vector parallelism. -struct VectorParallel -{ - const size_t nwork ; - VectorParallel( const size_t n ) : nwork(n) {} - operator size_t () const { return nwork ; } -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \brief Execute \c functor \c work_count times in parallel. - * - * A "functor" is a class containing the function to execute in - * parallel, any data needed for that execution, and a \c device_type - * typedef. Here is an example functor for parallel_for: - * - * \code - * class FunctorType { - * public: - * typedef ... device_type ; - * void operator() (IntType iwork) const ; - * }; - * \endcode - * - * In the above example, \c IntType is any integer type for which a - * valid conversion from \c size_t to \c IntType exists. 
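 *
 * A concrete sketch of such a functor (the SquareKernel name, the Serial
 * device choice, and the View member are illustrative assumptions, not part
 * of this header):
 * \code
 * struct SquareKernel {
 *   typedef Kokkos::Serial device_type ;
 *   Kokkos::View< double * , device_type > x ;
 *   KOKKOS_INLINE_FUNCTION
 *   void operator()( const size_t iwork ) const { x(iwork) *= x(iwork); }
 * };
 * // Dispatch: Kokkos::parallel_for( x.dimension_0() , kernel );
 * \endcode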
Its - * operator() method defines the operation to parallelize, - * over the range of integer indices iwork=[0,work_count-1]. - * This compares to a single iteration \c iwork of a \c for loop. - */ -template< class FunctorType > -inline -void parallel_for( const size_t work_count , - const FunctorType & functor ) -{ - Impl::ParallelFor< FunctorType , size_t > tmp( functor , work_count ); -} - - -/** \brief Execute \c functor \c work_count times in parallel, with vectorization. - * - * This is like parallel_for, except that it mandates - * vectorization as well as parallelization of the given functor. We - * emphasize "mandates": this means that the user asserts that - * vectorization is correct, and insists that the compiler vectorize. - * Mandating vectorization is not always desirable, for example if the - * body of the functor is complicated. In some cases, users might - * want to parallelize over threads, and use vectorization inside the - * parallel operation. Furthermore, the compiler might still be able - * to vectorize through a parallel_for. Thus, users should take care - * not to use this execution option arbitrarily. - */ -template< class FunctorType > -inline -void vector_parallel_for( const size_t work_count , - const FunctorType & functor ) -{ - Impl::ParallelFor< FunctorType , VectorParallel > tmp( functor , work_count ); -} - -template< class DeviceType > -class MultiFunctorParallelFor ; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/// \class ParallelReduce -/// \brief Implementation detail of parallel_reduce. -/// -/// This is an implementation detail of parallel_reduce. Users should -/// skip this and go directly to the nonmember function parallel_reduce. -template< class FunctorType , - class WorkSpec , - class DeviceType = typename FunctorType::device_type > -class ParallelReduce ; - -/// \class ReduceAdapter -/// \brief Implementation detail of parallel_reduce. -/// -/// This is an implementation detail of parallel_reduce. Users should -/// skip this and go directly to the nonmember function parallel_reduce. -template< class FunctorType , - class ValueType = typename FunctorType::value_type > -struct ReduceAdapter ; - -} // namespace Impl -} // namespace Kokkos - - -namespace Kokkos { - -/** \brief Parallel reduction - * - * Example of a parallel_reduce functor for a POD (plain old data) value type: - * \code - * class FunctorType { // For POD value type - * public: - * typedef ... device_type ; - * typedef value_type ; - * void operator()( iwork , & update ) const ; - * void init( & update ) const ; - * void join( volatile & update , - * volatile const & input ) const ; - * - * typedef true_type has_final ; - * void final( & update ) const ; - * }; - * \endcode - * - * Example of a parallel_reduce functor for an array of POD (plain old data) values: - * \code - * class FunctorType { // For array of POD value - * public: - * typedef ... 
device_type ; - * typedef value_type[] ; - * void operator()( , update[] ) const ; - * void init( update[] ) const ; - * void join( volatile update[] , - * volatile const input[] ) const ; - * - * typedef true_type has_final ; - * void final( update[] ) const ; - * }; - * \endcode - */ -template< class FunctorType > -inline -void parallel_reduce( const size_t work_count , - const FunctorType & functor ) -{ - Impl::ParallelReduce< FunctorType , size_t > reduce( functor , work_count ); -} - -/** \brief Parallel reduction and output to host. - * - * If FunctorType::value_type is - * - \c PodType, then \c reference_type is PodType & . - * - PodType[], then \c reference_type is PodType * . - */ -template< class FunctorType > -inline -void parallel_reduce( const size_t work_count , - const FunctorType & functor , - typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) -{ - Impl::ParallelReduce< FunctorType, size_t > - reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); - - reduce.wait(); -} - -template< class FunctorType > -inline -void parallel_reduce( const VectorParallel & work_count , - const FunctorType & functor , - typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) -{ - Impl::ParallelReduce< FunctorType, VectorParallel > - reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); - - reduce.wait(); -} - -template< class DeviceType > -class MultiFunctorParallelReduce ; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/// \class ParallelReduce -/// \brief Implementation detail of parallel_reduce. -/// -/// This is an implementation detail of parallel_reduce. Users should -/// skip this and go directly to the nonmember function parallel_reduce. -template< class FunctorType , - class WorkSpec , - class DeviceType = typename FunctorType::device_type > -class ParallelScan ; - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -template< class FunctorType > -inline -void parallel_scan( const size_t work_count , - const FunctorType & functor ) -{ - Impl::ParallelScan< FunctorType , size_t > scan( functor , work_count ); -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \brief Parallel work request for shared memory, league size, and team size. - * - * If the shared size is too large then slow (global) memory will be used. - * If the league or team size are too large then they will be reduced. - */ -struct ParallelWorkRequest { - size_t league_size ; ///< Size of league (number of teams in a league) - size_t team_size ; ///< Size of team (number of threads in a team) - - KOKKOS_INLINE_FUNCTION - ParallelWorkRequest() : league_size(0), team_size(0) {} - - KOKKOS_INLINE_FUNCTION - ParallelWorkRequest( size_t s0 , size_t s1 ) : league_size(s0), team_size(s1) {} -}; - -/** \brief Execute functor in parallel with work request, - * the actual league_size and team_size may be smaller. - * - * class FunctorType { - * public: - * typedef ... 
device_type ; - * void operator()( device_type ) const ; - * }; - */ -template< class FunctorType > -inline -void parallel_for( const ParallelWorkRequest & request , - const FunctorType & functor ) -{ - Kokkos::Impl::ParallelFor< FunctorType , ParallelWorkRequest >( functor , request ); -} - -} // namespace Kokkos - -namespace Kokkos { - -/** \brief Parallel reduction. - * - * class FunctorType { - * public: - * typedef ... device_type ; - * typedef value_type ; // POD type - * void operator()( device_type , & ) const ; - * void init( & ) const ; - * void join( volatile & update , - * volatile const & input ) const ; - * - * typedef true_type has_final ; - * void final( & update ) const ; - * }; - * - * class FunctorType { // For array of POD value - * public: - * typedef ... device_type ; - * typedef value_type[] ; - * void operator()( device_type , update[] ) const ; - * void init( update[] ) const ; - * void join( volatile update[] , - * volatile const input[] ) const ; - * - * typedef true_type has_final ; - * void final( update[] ) const ; - * }; - */ -template< class FunctorType > -inline -void parallel_reduce( const Kokkos::ParallelWorkRequest & request , - const FunctorType & functor ) -{ - Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest > reduce( functor , request ); -} - -template< class FunctorType > -inline -void parallel_reduce( const Kokkos::ParallelWorkRequest & request , - const FunctorType & functor , - typename Kokkos::Impl::ReduceAdapter< FunctorType >::reference_type result ) -{ - Impl::ParallelReduce< FunctorType , Kokkos::ParallelWorkRequest > - reduce( functor , request , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); - - reduce.wait(); // Wait for reduce to complete and output result -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class Enable = void > -struct FunctorHasJoin : public false_type {}; - -template< class FunctorType > -struct FunctorHasJoin< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::join ) >::type > - : public true_type {}; - -template< class FunctorType , class Enable = void > -struct FunctorHasFinal : public false_type {}; - -template< class FunctorType > -struct FunctorHasFinal< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::final ) >::type > - : public true_type {}; - -template< class FunctorType , class Enable = void > -struct FunctorShmemSize -{ - static inline size_t value( const FunctorType & ) { return 0 ; } -}; - -template< class FunctorType > -struct FunctorShmemSize< FunctorType , typename enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type > -{ - static inline size_t value( const FunctorType & f ) { return f.shmem_size() ; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class ScalarType > -struct ReduceAdapter -{ - enum { StaticValueSize = sizeof(ScalarType) }; - - typedef ScalarType & reference_type ; - typedef ScalarType * pointer_type ; - typedef ScalarType scalar_type ; - - KOKKOS_INLINE_FUNCTION static - reference_type reference( void * p ) { return *((ScalarType*) p); } - - KOKKOS_INLINE_FUNCTION static - 
reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)[i]; } - - KOKKOS_INLINE_FUNCTION static - pointer_type pointer( reference_type p ) { return & p ; } - - KOKKOS_INLINE_FUNCTION static - unsigned value_count( const FunctorType & ) { return 1 ; } - - KOKKOS_INLINE_FUNCTION static - unsigned value_size( const FunctorType & ) { return sizeof(ScalarType); } - - KOKKOS_INLINE_FUNCTION static - void copy( const FunctorType & , void * const dst , const void * const src ) - { *((scalar_type*)dst) = *((const scalar_type*)src); } - - KOKKOS_INLINE_FUNCTION static - void join( const FunctorType & f , volatile void * update , volatile const void * input ) - { f.join( *((volatile ScalarType*)update) , *((volatile const ScalarType*)input) ); } - - template< class F > - KOKKOS_INLINE_FUNCTION static - void final( const F & f , - typename enable_if< ( is_same::value && - FunctorHasFinal::value ) - >::type * p ) - { f.final( *((ScalarType *) p ) ); } - - template< class F > - KOKKOS_INLINE_FUNCTION static - void final( const F & , - typename enable_if< ( is_same::value && - ! FunctorHasFinal::value ) - >::type * ) - {} -}; - -template< class FunctorType , class ScalarType > -struct ReduceAdapter< FunctorType , ScalarType[] > -{ - enum { StaticValueSize = 0 }; - - typedef ScalarType * reference_type ; - typedef ScalarType * pointer_type ; - typedef ScalarType scalar_type ; - - KOKKOS_INLINE_FUNCTION static - ScalarType * reference( void * p ) { return (ScalarType*) p ; } - - KOKKOS_INLINE_FUNCTION static - reference_type reference( void * p , unsigned i ) { return ((ScalarType*) p)+i; } - - KOKKOS_INLINE_FUNCTION static - pointer_type pointer( reference_type p ) { return p ; } - - KOKKOS_INLINE_FUNCTION static - unsigned value_count( const FunctorType & f ) { return f.value_count ; } - - KOKKOS_INLINE_FUNCTION static - unsigned value_size( const FunctorType & f ) { return f.value_count * sizeof(ScalarType); } - - KOKKOS_INLINE_FUNCTION static - void copy( const FunctorType & f , void * const dst , const void * const src ) - { - for ( int i = 0 ; i < int(f.value_count) ; ++i ) { - ((scalar_type*)dst)[i] = ((const scalar_type*)src)[i]; - } - } - - KOKKOS_INLINE_FUNCTION static - void join( const FunctorType & f , volatile void * update , volatile const void * input ) - { f.join( ((volatile ScalarType*)update) , ((volatile const ScalarType*)input) ); } - - template< class F > - KOKKOS_INLINE_FUNCTION static - void final( const F & f , - typename enable_if< ( is_same::value && - FunctorHasFinal::value ) - >::type * p ) - { f.final( ((ScalarType *) p ) ); } - - template< class F > - KOKKOS_INLINE_FUNCTION static - void final( const F & , - typename enable_if< ( is_same::value && - ! 
FunctorHasFinal::value ) - >::type * ) - {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* KOKKOS_PARALLEL_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_ParallelReduce.hpp b/kokkos/kokkos/core/src/Kokkos_ParallelReduce.hpp deleted file mode 100644 index c6d929e..0000000 --- a/kokkos/kokkos/core/src/Kokkos_ParallelReduce.hpp +++ /dev/null @@ -1,75 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_PARALLELREDUCE_HPP -#define KOKKOS_PARALLELREDUCE_HPP - -#include -#include -#include -#include - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -template< class FunctorType > -void vector_parallel_reduce( const size_t work_count , - const FunctorType & functor , - typename Impl::ReduceAdapter< FunctorType >::reference_type result ) - -{ - Impl::ParallelReduce< FunctorType, VectorParallel > - reduce( functor , work_count , Kokkos::Impl::ReduceAdapter< FunctorType >::pointer( result ) ); - - reduce.wait(); -} - -//---------------------------------------------------------------------------- - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* KOKKOS_PARALLELREDUCE_HPP */ - diff --git a/kokkos/kokkos/core/src/Kokkos_Serial.hpp b/kokkos/kokkos/core/src/Kokkos_Serial.hpp deleted file mode 100644 index bce8cbc..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Serial.hpp +++ /dev/null @@ -1,215 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Kokkos_Serial.hpp -/// \brief Declaration and definition of Kokkos::Serial device. 
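///
/// A hedged usage sketch (the functor below and its data are illustrative
/// assumptions, not part of this header): any functor whose device_type is
/// Kokkos::Serial may be handed to parallel_for or parallel_reduce, and the
/// specializations at the end of this file simply run the work sequentially.
/// \code
/// struct SumFunctor {
///   typedef Kokkos::Serial device_type ;
///   typedef double         value_type ;
///   const double * x ;
///   KOKKOS_INLINE_FUNCTION
///   void operator()( const size_t i , double & update ) const { update += x[i]; }
///   KOKKOS_INLINE_FUNCTION
///   void init( double & update ) const { update = 0 ; }
///   KOKKOS_INLINE_FUNCTION
///   void join( volatile double & update , volatile const double & input ) const
///   { update += input ; }
/// };
/// // double sum = 0 ;
/// // Kokkos::parallel_reduce( n , f , sum );   // f is a SumFunctor instance
/// \endcode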
- -#ifndef KOKKOS_SERIAL_HPP -#define KOKKOS_SERIAL_HPP - -#include -#include -#include -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/// \class Serial -/// \brief Kokkos device for non-parallel execution -/// -/// A "device" represents a parallel execution model. It tells Kokkos -/// how to parallelize the execution of kernels in a parallel_for or -/// parallel_reduce. For example, the Threads device uses Pthreads or -/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language -/// extensions, and the Cuda device uses NVIDIA's CUDA programming -/// model. The Serial device executes "parallel" kernels -/// sequentially. This is useful if you really do not want to use -/// threads, or if you want to explore different combinations of MPI -/// and shared-memory parallel programming models. -class Serial { -public: - //! \name Type declarations that all Kokkos devices must provide. - //@{ - - //! The device type (same as this class). - typedef Serial device_type ; - //! The size_type typedef best suited for this device. - typedef HostSpace::size_type size_type ; - //! This device's preferred memory space. - typedef HostSpace memory_space ; - //! This device's preferred array layout. - typedef LayoutRight array_layout ; - /// \brief This device's host mirror type. - /// - /// Serial is a host device, so the host mirror type is the same as - /// the device type itself. - typedef Serial host_mirror_device_type ; - - //@} - - /// \brief True if and only if this method is being called in a - /// thread-parallel function. - /// - /// For the Serial device, this method always returns false, - /// because parallel_for or parallel_reduce with the Serial device - /// always execute sequentially. - inline static int in_parallel() { return false ; } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); - - /// \brief Wait until all dispatched functors complete. - /// - /// The parallel_for or parallel_reduce dispatch of a functor may - /// return asynchronously, before the functor completes. This - /// method does not return until all dispatched functors on this - /// device have completed. - static void fence() {} - - static void initialize() {} - - static int is_initialized() { return 1 ; } - - //! Free any resources being consumed by the device. - static void finalize() {} - - //! Print configuration information to the given output stream. 
- static void print_configuration( std::ostream & , const bool detail = false ); - - inline int league_rank() const { return 0 ; } - inline int league_size() const { return 1 ; } - inline int team_rank() const { return 0 ; } - inline int team_size() const { return 1 ; } - - inline void team_barrier() {} - - inline std::pair work_range( size_t n ) const - { return std::pair(0,n); } - - template< typename T > - inline T * get_shmem( const int count ); - - static void * resize_reduce_scratch( const unsigned ); -}; - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -//TODO: Needs constructor for Kokkos::ParallelWorkRequest CRT - -template< class FunctorType , class WorkSpec > -class ParallelFor< FunctorType , WorkSpec , Serial > { -public: - - ParallelFor( const FunctorType & functor , const size_t work_count ) - { - for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) { - functor( iwork ); - } - } -}; - -template< class FunctorType , class WorkSpec > -class ParallelReduce< FunctorType , WorkSpec , Serial > { -public: - - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - ParallelReduce( const FunctorType & functor , - const size_t work_count , - pointer_type result = 0 ) - { - if ( 0 == result ) { - result = (pointer_type ) Serial::resize_reduce_scratch( Reduce::value_size( functor ) ); - } - - functor.init( Reduce::reference( result ) ); - - for ( size_t iwork = 0 ; iwork < work_count ; ++iwork ) { - functor( iwork , Reduce::reference( result ) ); - } - - Reduce::final( functor , result ); - } - - void wait() {} -}; - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -#endif /* #define KOKKOS_SERIAL_HPP */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - diff --git a/kokkos/kokkos/core/src/Kokkos_Threads.hpp b/kokkos/kokkos/core/src/Kokkos_Threads.hpp deleted file mode 100644 index d553f15..0000000 --- a/kokkos/kokkos/core/src/Kokkos_Threads.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_THREADS_HPP -#define KOKKOS_THREADS_HPP - -#include -#include -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { -class ThreadsExec ; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -/** \brief Device for a pool of Pthreads or C11 threads on a CPU. */ -class Threads { -public: - //! \name Type declarations that all Kokkos devices must provide. - //@{ - - typedef Threads device_type ; - typedef Kokkos::HostSpace memory_space ; - typedef memory_space::size_type size_type ; - typedef Kokkos::LayoutRight array_layout ; - typedef Kokkos::Threads host_mirror_device_type ; - - //@} - /*------------------------------------------------------------------------*/ - //! \name Static functions that all Kokkos devices must implement. - //@{ - - /// \brief True if and only if this method is being called in a - /// thread-parallel function. - static int in_parallel(); - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); - - /// \brief Wait until all dispatched functors complete. - /// - /// The parallel_for or parallel_reduce dispatch of a functor may - /// return asynchronously, before the functor completes. This - /// method does not return until all dispatched functors on this - /// device have completed. - static void fence(); - - /// \brief Free any resources being consumed by the device. - /// - /// For the Threads device, this terminates spawned worker threads. - static void finalize(); - - /// \brief Print configuration information to the given output stream. - static void print_configuration( std::ostream & , const bool detail = false ); - - //@} - //! 
\name Function for the functor device interface */ - //@{ - - inline int league_rank() const ; - inline int league_size() const ; - inline int team_rank() const ; - inline int team_size() const ; - - inline void team_barrier(); - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering. - * - * The highest rank thread can compute the reduction total as - * reduction_total = dev.team_scan( value ) + value ; - */ - template< typename Type > - inline Type team_scan( const Type & value ); - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering - * with intra-team non-deterministic ordering accumulation. - * - * The global inter-team accumulation value will, at the end of the - * league's parallel execution, be the scan's total. - * Parallel execution ordering of the league's teams is non-deterministic. - * As such the base value for each team's scan operation is similarly - * non-deterministic. - */ - template< typename TypeLocal , typename TypeGlobal > - inline TypeGlobal team_scan( const TypeLocal & value , TypeGlobal * const global_accum ); - - inline void * get_shmem( const int size ); - - explicit inline Threads( Impl::ThreadsExec & ); - - /**@} */ - /*------------------------------------------------------------------------*/ - //! \name Device-specific functions - //@{ - - /** \brief Initialize the device in the "ready to work" state. - * - * The device is initialized in a "ready to work" or "awake" state. - * This state reduces latency and thus improves performance when - * dispatching work. However, the "awake" state consumes resources - * even when no work is being done. You may call sleep() to put - * the device in a "sleeping" state that does not consume as many - * resources, but it will take time (latency) to awaken the device - * again (via the wake()) method so that it is ready for work. - * - * Teams of threads are distributed as evenly as possible across - * the requested number of numa regions and cores per numa region. - * A team will not be split across a numa region. - * - * If the 'use_' arguments are not supplied the hwloc is queried - * to use all available cores. - */ - static void initialize( unsigned team_count = 1 , - unsigned threads_per_team = 1 , - unsigned use_numa_count = 0 , - unsigned use_cores_per_numa = 0 ); - - static int is_initialized(); - - static unsigned league_max(); - static unsigned team_max(); - - //@} - /*------------------------------------------------------------------------*/ - -private: - - friend class Impl::ThreadsExec ; - - Impl::ThreadsExec & m_exec ; -}; - -/*--------------------------------------------------------------------------*/ - -} // namespace Kokkos - -#include -#include -#include - -#endif /* #define KOKKOS_THREADS_HPP */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - diff --git a/kokkos/kokkos/core/src/Kokkos_View.hpp b/kokkos/kokkos/core/src/Kokkos_View.hpp deleted file mode 100644 index db18f17..0000000 --- a/kokkos/kokkos/core/src/Kokkos_View.hpp +++ /dev/null @@ -1,1693 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_VIEW_HPP -#define KOKKOS_VIEW_HPP - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View specialization mapping of view traits to a specialization tag */ -template< typename ScalarType , class ValueType , - class ArrayLayout , class uRank , class uRankDynamic , - class MemorySpace , class MemoryTraits > -struct ViewSpecialize ; - -template< class DstViewSpecialize , class SrcViewSpecialize = void , class Enable = void > -struct ViewAssignment ; - -} /* namespace Impl */ -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument permutations: - * - View< DataType , Device , void , void > - * - View< DataType , Device , MemoryTraits , void > - * - View< DataType , Device , void , MemoryTraits > - * - View< DataType , ArrayLayout , Device , void > - * - View< DataType , ArrayLayout , Device , MemoryTraits > - */ -template< class DataType , - class Arg1 , - class Arg2 , - class Arg3 > -class ViewTraits { -private: - - // Arg1 is either Device or Layout, both of which must have 'typedef ... array_layout'. - // If Arg1 is not Layout then Arg1 must be Device - enum { Arg1IsDevice = ! Impl::is_same< Arg1 , typename Arg1::array_layout >::value }; - enum { Arg2IsDevice = ! 
Arg1IsDevice }; - - // If Arg1 is device and Arg2 is not void then Arg2 is MemoryTraits. - // If Arg1 is device and Arg2 is void and Arg3 is not void then Arg3 is MemoryTraits. - // If Arg2 is device and Arg3 is not void then Arg3 is MemoryTraits. - enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value }; - enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value }; - enum { Arg2IsMemory = ! Arg2IsVoid && Arg1IsDevice && Arg3IsVoid }; - enum { Arg3IsMemory = ! Arg3IsVoid && ( ( Arg1IsDevice && Arg2IsVoid ) || Arg2IsDevice ) }; - - - typedef typename Arg1::array_layout ArrayLayout ; - typedef typename Impl::if_c< Arg1IsDevice , Arg1 , Arg2 >::type::device_type DeviceType ; - - typedef typename Impl::if_c< Arg2IsMemory , Arg2 , - typename Impl::if_c< Arg3IsMemory , Arg3 , MemoryManaged - >::type >::type::memory_traits MemoryTraits ; - - typedef Impl::AnalyzeShape analysis ; - -public: - - //------------------------------------ - // Data type traits: - - typedef DataType data_type ; - typedef typename analysis::const_type const_data_type ; - typedef typename analysis::non_const_type non_const_data_type ; - - //------------------------------------ - // Scalar type traits: - - typedef typename analysis::scalar_type scalar_type ; - typedef typename analysis::const_scalar_type const_scalar_type ; - typedef typename analysis::non_const_scalar_type non_const_scalar_type ; - - //------------------------------------ - // Value type traits: - - typedef typename analysis::value_type value_type ; - typedef typename analysis::const_value_type const_value_type ; - typedef typename analysis::non_const_value_type non_const_value_type ; - - //------------------------------------ - // Layout and shape traits: - - typedef typename Impl::StaticAssertSame< ArrayLayout , typename ArrayLayout ::array_layout >::type array_layout ; - - typedef typename analysis::shape shape_type ; - - enum { rank = shape_type::rank }; - enum { rank_dynamic = shape_type::rank_dynamic }; - - //------------------------------------ - // Device and memory space traits: - - typedef typename Impl::StaticAssertSame< DeviceType , typename DeviceType ::device_type >::type device_type ; - typedef typename Impl::StaticAssertSame< MemoryTraits , typename MemoryTraits::memory_traits >::type memory_traits ; - - typedef typename device_type::memory_space memory_space ; - typedef typename device_type::size_type size_type ; - - enum { is_hostspace = Impl::is_same< memory_space , HostSpace >::value }; - enum { is_managed = memory_traits::Unmanaged == 0 }; - - //------------------------------------ - // Specialization: - typedef typename - Impl::ViewSpecialize< scalar_type , - value_type , - array_layout , - Impl::unsigned_ , - Impl::unsigned_ , - memory_space , - memory_traits - >::type specialize ; -}; - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Default view specialization has ScalarType == ValueType - * and LayoutLeft or LayoutRight. 
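 *
 * For instance (an illustrative assumption; Device stands for any device
 * type declared elsewhere in this library), both of the following resolve
 * to the LayoutDefault specialization, because scalar_type == value_type
 * and the layout is LayoutLeft or LayoutRight:
 * \code
 * Kokkos::View< double ** , Kokkos::LayoutLeft  , Device > a ;
 * Kokkos::View< double ** , Kokkos::LayoutRight , Device > b ;
 * \endcode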
- */ -struct LayoutDefault ; - -template< typename ScalarType , class Rank , class RankDynamic , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ScalarType , ScalarType , - LayoutLeft , Rank , RankDynamic , - MemorySpace , MemoryTraits > -{ typedef LayoutDefault type ; }; - -template< typename ScalarType , class Rank , class RankDynamic , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ScalarType , ScalarType , - LayoutRight , Rank , RankDynamic , - MemorySpace , MemoryTraits > -{ typedef LayoutDefault type ; }; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Types for compile-time detection of View usage errors */ -namespace ViewError { - -struct allocation_constructor_requires_managed {}; -struct user_pointer_constructor_requires_unmanaged {}; -struct device_shmem_constructor_requires_unmanaged {}; - -struct scalar_operator_called_from_non_scalar_view {}; - -} /* namespace ViewError */ - -//---------------------------------------------------------------------------- -/** \brief Enable view parentheses operator for - * match of layout and integral arguments. - * If correct rank define type from traits, - * otherwise define type as an error message. - */ -template< class ReturnType , class Traits , class Layout , unsigned Rank , - typename iType0 = int , typename iType1 = int , - typename iType2 = int , typename iType3 = int , - typename iType4 = int , typename iType5 = int , - typename iType6 = int , typename iType7 = int , - class Enable = void > -struct ViewEnableArrayOper ; - -template< class ReturnType , class Traits , class Layout , unsigned Rank , - typename iType0 , typename iType1 , - typename iType2 , typename iType3 , - typename iType4 , typename iType5 , - typename iType6 , typename iType7 > -struct ViewEnableArrayOper< - ReturnType , Traits , Layout , Rank , - iType0 , iType1 , iType2 , iType3 , - iType4 , iType5 , iType6 , iType7 , - typename enable_if< - iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 && - iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 && - is_same< typename Traits::array_layout , Layout >::value && - ( unsigned(Traits::rank) == Rank ) - >::type > -{ - typedef ReturnType type ; -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -struct AllocateWithoutInitializing {}; - -namespace { -const AllocateWithoutInitializing allocate_without_initializing = AllocateWithoutInitializing(); -} - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. 
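 *
 * Two concrete declarations, as a hedged sketch (Device is a placeholder
 * for any Kokkos device type; the labels and the extent n are illustrative):
 * \code
 * // Managed 1-D array of double with run-time extent n:
 * Kokkos::View< double *  , Device > x( "x" , n );
 * // 2-D array of int: run-time first dimension, compile-time second dimension of 3:
 * Kokkos::View< int *[3]  , Device > y( "y" , n );
 * \endcode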
When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType , Device > - * - View< DataType , Device , MemoryTraits > - * - View< DataType , Device , void , MemoryTraits > - * - View< DataType , Layout , Device > - * - View< DataType , Layout , Device , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Device (required) The execution model for parallel - * operations. Examples include Threads, OpenMP, Cuda, and Serial. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Device. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomRead indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT \c MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp \c MemoryTraits interpretation depends on \c Device - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Device types. For example, with the Cuda device, - * RandomRead tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of \c MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomRead View in_rr. - * // Note that RandomRead View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... 
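 *   // (As noted above, on the Cuda device reads through in_rr may be routed
 *   //  through the texture cache; devices without such hardware simply read
 *   //  normally, so the code remains portable.)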
- * } - * \endcode - */ -template< class DataType , - class Arg1Type , /* ArrayLayout or DeviceType */ - class Arg2Type = void , /* DeviceType or MemoryTraits */ - class Arg3Type = void , /* MemoryTraits */ - class Specialize = - typename ViewTraits::specialize > -class View ; - -template< class DataType , - class Arg1Type , - class Arg2Type , - class Arg3Type > -class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::LayoutDefault > - : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > -{ -public: - - typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ; - -private: - - // Assignment of compatible views requirement: - template< class , class , class , class , class > friend class View ; - - // Assignment of compatible subview requirement: - template< class , class , class > friend struct Impl::ViewAssignment ; - - typedef Impl::LayoutStride< typename traits::shape_type , - typename traits::array_layout > stride_type ; - - typename traits::scalar_type * m_ptr_on_device ; - typename traits::shape_type m_shape ; - stride_type m_stride ; - -public: - - typedef View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > const_type ; - - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type::host_mirror_device_type , - void > HostMirror ; - - //------------------------------------ - // Shape - - enum { Rank = traits::rank }; - - KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_shape ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_shape.N0 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_shape.N1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_shape.N2 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_shape.N3 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_shape.N4 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_shape.N5 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_shape.N6 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_shape.N7 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type size() const - { - return m_shape.N0 - * m_shape.N1 - * m_shape.N2 - * m_shape.N3 - * m_shape.N4 - * m_shape.N5 - * m_shape.N6 - * m_shape.N7 - ; - } - - template< typename iType > - KOKKOS_INLINE_FUNCTION - typename traits::size_type dimension( const iType & i ) const - { return Impl::dimension( m_shape , i ); } - - //------------------------------------ - -private: - - template< class ViewRHS > - KOKKOS_INLINE_FUNCTION - void assign_compatible_view( const ViewRHS & rhs , - typename Impl::enable_if< Impl::ViewAssignable< View , ViewRHS >::value >::type * = 0 ) - { - typedef typename traits::shape_type shape_type ; - typedef typename traits::memory_space memory_space ; - typedef typename traits::memory_traits memory_traits ; - - Impl::ViewTracking< traits >::decrement( m_ptr_on_device ); - - shape_type::assign( m_shape, - rhs.m_shape.N0 , rhs.m_shape.N1 , rhs.m_shape.N2 , rhs.m_shape.N3 , - rhs.m_shape.N4 , rhs.m_shape.N5 , rhs.m_shape.N6 , rhs.m_shape.N7 ); - - stride_type::assign( m_stride , rhs.m_stride.value ); - - m_ptr_on_device = rhs.m_ptr_on_device ; - - Impl::ViewTracking< traits >::increment( 
m_ptr_on_device ); - } - -public: - - //------------------------------------ - // Destructor, constructors, assignment operators: - - KOKKOS_INLINE_FUNCTION - ~View() { Impl::ViewTracking< traits >::decrement( m_ptr_on_device ); } - - KOKKOS_INLINE_FUNCTION - View() : m_ptr_on_device(0) - { - traits::shape_type::assign(m_shape,0,0,0,0,0,0,0,0); - stride_type::assign(m_stride,0); - } - - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) : m_ptr_on_device(0) { assign_compatible_view( rhs ); } - - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) { assign_compatible_view( rhs ); return *this ; } - - //------------------------------------ - // Construct or assign compatible view: - - template< class RT , class RL , class RD , class RM > - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) - : m_ptr_on_device(0) { assign_compatible_view( rhs ); } - - template< class RT , class RL , class RD , class RM > - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) - { assign_compatible_view( rhs ); return *this ; } - - //------------------------------------ - // Allocation of a managed view with possible alignment padding. - - typedef Impl::if_c< traits::is_managed , - std::string , - Impl::ViewError::allocation_constructor_requires_managed > - if_allocation_constructor ; - - explicit inline - View( const typename if_allocation_constructor::type & label , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : m_ptr_on_device(0) - { - typedef typename traits::device_type device_type ; - typedef typename traits::memory_space memory_space ; - typedef typename traits::shape_type shape_type ; - typedef typename traits::scalar_type scalar_type ; - - shape_type ::assign( m_shape, n0, n1, n2, n3, n4, n5, n6, n7 ); - stride_type::assign_with_padding( m_stride , m_shape ); - - m_ptr_on_device = (scalar_type *) - memory_space::allocate( if_allocation_constructor::select( label ) , - typeid(scalar_type) , - sizeof(scalar_type) , - Impl::capacity( m_shape , m_stride ) ); - - Impl::ViewInitialize< device_type > init( *this ); - } - - explicit inline - View( const AllocateWithoutInitializing & , - const typename if_allocation_constructor::type & label , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : m_ptr_on_device(0) - { - typedef typename traits::device_type device_type ; - typedef typename traits::memory_space memory_space ; - typedef typename traits::shape_type shape_type ; - typedef typename traits::scalar_type scalar_type ; - - shape_type ::assign( m_shape, n0, n1, n2, n3, n4, n5, n6, n7 ); - stride_type::assign_with_padding( m_stride , m_shape ); - - m_ptr_on_device = (scalar_type *) - memory_space::allocate( if_allocation_constructor::select( label ) , - typeid(scalar_type) , - sizeof(scalar_type) , - Impl::capacity( m_shape , m_stride ) ); - } - - //------------------------------------ - // Assign an unmanaged View from pointer, can be called in functors. - // No alignment padding is performed. - - typedef Impl::if_c< ! 
traits::is_managed , - typename traits::scalar_type * , - Impl::ViewError::user_pointer_constructor_requires_unmanaged > - if_user_pointer_constructor ; - - View( typename if_user_pointer_constructor::type ptr , - const size_t n0 = 0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) - : m_ptr_on_device(0) - { - typedef typename traits::shape_type shape_type ; - typedef typename traits::scalar_type scalar_type ; - - shape_type ::assign( m_shape, n0, n1, n2, n3, n4, n5, n6, n7 ); - stride_type::assign_no_padding( m_stride , m_shape ); - - m_ptr_on_device = if_user_pointer_constructor::select( ptr ); - } - - //------------------------------------ - // Assign unmanaged View to portion of Device shared memory - - typedef Impl::if_c< ! traits::is_managed , - typename traits::device_type , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_constructor ; - - explicit KOKKOS_INLINE_FUNCTION - View( typename if_device_shmem_constructor::type & dev , - const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - : m_ptr_on_device(0) - { - typedef typename traits::shape_type shape_type ; - typedef typename traits::scalar_type scalar_type ; - - enum { align = 8 }; - enum { mask = align - 1 }; - - shape_type::assign( m_shape, n0, n1, n2, n3, n4, n5, n6, n7 ); - stride_type::assign_no_padding( m_stride , m_shape ); - - typedef Impl::if_c< ! traits::is_managed , - scalar_type * , - Impl::ViewError::device_shmem_constructor_requires_unmanaged > - if_device_shmem_pointer ; - - // Select the first argument: - m_ptr_on_device = if_device_shmem_pointer::select( - (scalar_type *) dev.get_shmem( unsigned( sizeof(scalar_type) * Impl::capacity( m_shape , m_stride ) + unsigned(mask) ) & ~unsigned(mask) ) ); - } - - static inline - unsigned shmem_size( const unsigned n0 = 0 , - const unsigned n1 = 0 , - const unsigned n2 = 0 , - const unsigned n3 = 0 , - const unsigned n4 = 0 , - const unsigned n5 = 0 , - const unsigned n6 = 0 , - const unsigned n7 = 0 ) - { - enum { align = 8 }; - enum { mask = align - 1 }; - - typedef typename traits::shape_type shape_type ; - typedef typename traits::scalar_type scalar_type ; - - shape_type shape ; - stride_type stride ; - - traits::shape_type::assign( shape, n0, n1, n2, n3, n4, n5, n6, n7 ); - stride_type::assign_no_padding( stride , shape ); - - return unsigned( sizeof(scalar_type) * Impl::capacity( shape , stride ) + unsigned(mask) ) & ~unsigned(mask) ; - } - - //------------------------------------ - // Is not allocated - - KOKKOS_INLINE_FUNCTION - bool is_null() const { return 0 == m_ptr_on_device ; } - - //------------------------------------ - // Operators for scalar (rank zero) views. 
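  // (editor's sketch, not part of the original header) A rank-zero View acts as
  // a handle to a single value; the operators below let it be assigned and read
  // directly.  Assuming the OpenMP device type that appears later in this diff:
  //
  //   Kokkos::View< double , Kokkos::OpenMP > alpha("alpha"); // one element
  //   alpha() = 2.5;       // operator() yields a reference to the value
  //   double a = *alpha;   // operator* reads the same location
  //   alpha = 3.0;         // operator=(scalar) stores through the pointer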
- - typedef Impl::if_c< traits::rank == 0 , - typename traits::scalar_type , - Impl::ViewError::scalar_operator_called_from_non_scalar_view > - if_scalar_operator ; - - KOKKOS_INLINE_FUNCTION - const View & operator = ( const typename if_scalar_operator::type & rhs ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - *m_ptr_on_device = if_scalar_operator::select( rhs ); - return *this ; - } - - KOKKOS_INLINE_FUNCTION - operator typename if_scalar_operator::type & () const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - return if_scalar_operator::select( *m_ptr_on_device ); - } - - KOKKOS_INLINE_FUNCTION - typename if_scalar_operator::type & operator()() const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - return if_scalar_operator::select( *m_ptr_on_device ); - } - - KOKKOS_INLINE_FUNCTION - typename if_scalar_operator::type & operator*() const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - return if_scalar_operator::select( *m_ptr_on_device ); - } - - //------------------------------------ - // Array member access operators enabled if - // (1) a zero value of all argument types are compile-time comparable to zero - // (2) the rank matches the number of arguments - // (3) the memory space is valid for the access - //------------------------------------ - // LayoutLeft, rank 1: - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutLeft, 1, iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutLeft, 1, iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutLeft, 1, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - // LayoutLeft, rank 2: - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 2, iType0, iType1 >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * i1 ]; - } - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 2, iType0, iType1 >::type - at( const iType0 & i0 , const iType1 & i1 , const int , const int , - const int , const int , const int , const int ) const - { - 
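    // LayoutLeft is column-major: i0 is the stride-one index and m_stride.value is
    // the (possibly padded) leading dimension, so the flat offset is i0 + m_stride.value * i1.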
KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * i1 ]; - } - - // LayoutLeft, rank 3: - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 3, iType0, iType1, iType2 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * i2 ) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 3, iType0, iType1, iType2 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * i2 ) ]; - } - - // LayoutLeft, rank 4: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 4, iType0, iType1, iType2, iType3 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * i3 )) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 4, iType0, iType1, iType2, iType3 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * i3 )) ]; - } - - // LayoutLeft, rank 5: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 5, iType0, iType1, iType2, iType3 , iType4 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * i4 ))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 5, iType0, 
iType1, iType2, iType3 , iType4 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * i4 ))) ]; - } - - // LayoutLeft, rank 6: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 6, iType0, iType1, iType2, iType3 , iType4, iType5 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * i5 )))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 6, iType0, iType1, iType2, iType3 , iType4, iType5 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * i5 )))) ]; - } - - // LayoutLeft, rank 7: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 7, iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * i6 ))))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 7, iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + 
m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * i6 ))))) ]; - } - - // LayoutLeft, rank 8: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 8, iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * ( - i6 + m_shape.N6 * i7 )))))) ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutLeft, 8, iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 + m_stride.value * ( - i1 + m_shape.N1 * ( - i2 + m_shape.N2 * ( - i3 + m_shape.N3 * ( - i4 + m_shape.N4 * ( - i5 + m_shape.N5 * ( - i6 + m_shape.N6 * i7 )))))) ]; - } - - //------------------------------------ - // LayoutRight, rank 1: - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutRight, 1, iType0 >::type - operator[] ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutRight, 1, iType0 >::type - operator() ( const iType0 & i0 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - template< typename iType0 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , traits, LayoutRight, 1, iType0 >::type - at( const iType0 & i0 , const int , const int , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_shape, i0 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i0 ]; - } - - // LayoutRight, rank 2: - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 2, iType0, iType1 >::type - operator() ( const iType0 & i0 , const iType1 & i1 ) const - { - 
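    // LayoutRight is row-major: the last index (i1 at rank 2) is the stride-one index
    // and m_stride.value is the (possibly padded) row length, so the offset is i1 + i0 * m_stride.value.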
KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i1 + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 2, iType0, iType1 >::type - at( const iType0 & i0 , const iType1 & i1 , const int , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i1 + i0 * m_stride.value ]; - } - - // LayoutRight, rank 3: - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 3, iType0, iType1, iType2 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i2 + m_shape.N2 * ( i1 ) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 3, iType0, iType1, iType2 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_shape, i0,i1,i2 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i2 + m_shape.N2 * ( i1 ) + i0 * m_stride.value ]; - } - - // LayoutRight, rank 4: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 4, iType0, iType1, iType2, iType3 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 4, iType0, iType1, iType2, iType3 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const int , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_shape, i0,i1,i2,i3 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )) + i0 * m_stride.value ]; - } - - // LayoutRight, rank 5: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 5, iType0, iType1, iType2, iType3, iType4 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 ) 
const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 5, iType0, iType1, iType2, iType3, iType4 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const int , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_shape, i0,i1,i2,i3,i4 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))) + i0 * m_stride.value ]; - } - - // LayoutRight, rank 6: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 6, iType0, iType1, iType2, iType3, iType4, iType5 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const int , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_shape, i0,i1,i2,i3,i4,i5 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))) + i0 * m_stride.value ]; - } - - // LayoutRight, rank 7: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 > - 
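  // (editor's note) Every 'at' overload accepts a full set of eight indices and ignores
  // the unused trailing placeholders, presumably so rank-generic helpers such as the
  // remap used by deep_copy can address a view of any rank through one call form.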
KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 7, iType0, iType1, iType2, iType3, iType4, iType5, iType6 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_shape, i0,i1,i2,i3,i4,i5,i6 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 ))))) + i0 * m_stride.value ]; - } - - // LayoutRight, rank 8: - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type - operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i7 + m_shape.N7 * ( - i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))))) + i0 * m_stride.value ]; - } - - template< typename iType0 , typename iType1 , typename iType2 , typename iType3 , - typename iType4 , typename iType5 , typename iType6 , typename iType7 > - KOKKOS_INLINE_FUNCTION - typename Impl::ViewEnableArrayOper< typename traits::scalar_type & , - traits, LayoutRight, 8, iType0, iType1, iType2, iType3, iType4, iType5, iType6, iType7 >::type - at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 , - const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const - { - KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_shape, i0,i1,i2,i3,i4,i5,i6,i7 ); - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - - return m_ptr_on_device[ i7 + m_shape.N7 * ( - i6 + m_shape.N6 * ( - i5 + m_shape.N5 * ( - i4 + m_shape.N4 * ( - i3 + m_shape.N3 * ( - i2 + m_shape.N2 * ( - i1 )))))) + i0 * m_stride.value ]; - } - - //------------------------------------ - // Access to the underlying contiguous storage of this view specialization. - // These methods are specific to specialization of a view. - - KOKKOS_INLINE_FUNCTION - typename traits::scalar_type * ptr_on_device() const { return m_ptr_on_device ; } - - // Stride of physical storage, dimensioned to at least Rank - template< typename iType > - KOKKOS_INLINE_FUNCTION - void stride( iType * const s ) const - { Impl::stride( s , m_shape , m_stride ); } - - // Count of contiguously allocated data members including padding. 
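  // (editor's note) capacity() may therefore exceed size(): size() is the product of the
  // logical dimensions, while capacity() also counts alignment padding introduced by
  // assign_with_padding(); the raw deep_copy below transfers sizeof(scalar_type) * capacity() bytes.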
- KOKKOS_INLINE_FUNCTION - typename traits::size_type capacity() const - { return Impl::capacity( m_shape , m_stride ); } -}; - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class LT , class LL , class LD , class LM , class LS , - class RT , class RL , class RD , class RM , class RS > -KOKKOS_INLINE_FUNCTION -typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type -operator == ( const View & lhs , - const View & rhs ) -{ - // Same data, layout, dimensions - typedef ViewTraits lhs_traits ; - typedef ViewTraits rhs_traits ; - - return - Impl::is_same< typename lhs_traits::const_data_type , - typename rhs_traits::const_data_type >::value && - Impl::is_same< typename lhs_traits::array_layout , - typename rhs_traits::array_layout >::value && - Impl::is_same< typename lhs_traits::memory_space , - typename rhs_traits::memory_space >::value && - Impl::is_same< typename lhs_traits::specialize , - typename rhs_traits::specialize >::value && - lhs.ptr_on_device() == rhs.ptr_on_device() && - lhs.shape() == rhs.shape() ; -} - -template< class LT , class LL , class LD , class LM , class LS , - class RT , class RL , class RD , class RM , class RS > -KOKKOS_INLINE_FUNCTION -bool operator != ( const View & lhs , - const View & rhs ) -{ - return ! operator==( lhs , rhs ); -} - -//---------------------------------------------------------------------------- - - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -//---------------------------------------------------------------------------- -/** \brief Deep copy a value into a view. - */ -template< class DT , class DL , class DD , class DM , class DS > -inline -void deep_copy( const View & dst , - typename Impl::enable_if<( - Impl::is_same< typename ViewTraits::non_const_scalar_type , - typename ViewTraits::scalar_type >::value - ), typename ViewTraits::const_scalar_type >::type & value ) -{ - Impl::ViewFill< View >( dst , value ); -} - -template< class ST , class SL , class SD , class SM , class SS > -inline -typename Impl::enable_if<( ViewTraits::rank == 0 )>::type -deep_copy( ST & dst , const View & src ) -{ - typedef ViewTraits src_traits ; - typedef typename src_traits::memory_space src_memory_space ; - Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) ); -} - -//---------------------------------------------------------------------------- -/** \brief A deep copy between views of the same specialization, compatible type, - * same rank, same layout are handled by that specialization. 
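 *  This overload asserts that the shapes are equal and then copies
 *  sizeof(scalar_type) * capacity() raw bytes between the two memory spaces;
 *  no layout remapping happens on this path.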
- */ -template< class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SD , class SM , class SS > -inline -void deep_copy( const View & dst , - const View & src , - typename Impl::enable_if<( - Impl::is_same< typename ViewTraits::scalar_type , - typename ViewTraits::non_const_scalar_type >::value - && - Impl::is_same< typename ViewTraits::array_layout , - typename ViewTraits::array_layout >::value - && - ( unsigned(ViewTraits::rank) == unsigned(ViewTraits::rank) ) - )>::type * = 0 ) -{ - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - typedef typename dst_traits::memory_space dst_memory_space ; - typedef typename src_traits::memory_space src_memory_space ; - - if ( dst.ptr_on_device() != src.ptr_on_device() ) { - - Impl::assert_shapes_are_equal( dst.shape() , src.shape() ); - - const size_t nbytes = sizeof(typename dst_traits::scalar_type) * dst.capacity(); - - Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes ); - } -} - - -/** \brief Deep copy equal dimension arrays in the host space which - * have different layouts or specializations. - */ -template< class DT , class DL , class DD , class DM , class DS , - class ST , class SL , class SM , class SS > -inline -void deep_copy( const View< DT, DL, DD, DM, DS> & dst , - const View< ST, SL, DD, SM, SS> & src , - const typename Impl::enable_if<( - // Destination is not constant: - Impl::is_same< typename ViewTraits::value_type , - typename ViewTraits::non_const_value_type >::value - && - // Same rank - ( unsigned( ViewTraits::rank ) == - unsigned( ViewTraits::rank ) ) - && - // Different layout or different specialization: - ( ( ! Impl::is_same< typename DL::array_layout , - typename SL::array_layout >::value ) - || - ( ! Impl::is_same< DS , SS >::value ) - ) - )>::type * = 0 ) -{ - typedef View< DT, DL, DD, DM, DS> dst_type ; - typedef View< ST, SL, DD, SM, SS> src_type ; - - assert_shapes_equal_dimension( dst.shape() , src.shape() ); - - Impl::ViewRemap< dst_type , src_type >( dst , src ); -} - -//---------------------------------------------------------------------------- - -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View::is_managed - ), typename View::HostMirror >::type -inline -create_mirror( const View & src ) -{ - typedef View view_type ; - typedef typename view_type::HostMirror host_view_type ; - typedef typename view_type::memory_space memory_space ; - - // 'view' is managed therefore we can allocate a - // compatible host_view through the ordinary constructor. - - std::string label = memory_space::query_label( src.ptr_on_device() ); - label.append("_mirror"); - - return host_view_type( label , - src.dimension_0() , - src.dimension_1() , - src.dimension_2() , - src.dimension_3() , - src.dimension_4() , - src.dimension_5() , - src.dimension_6() , - src.dimension_7() ); -} - -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View::is_managed && - Impl::ViewAssignable< typename View::HostMirror , View >::value - ), typename View::HostMirror >::type -inline -create_mirror_view( const View & src ) -{ - return src ; -} - -template< class T , class L , class D , class M , class S > -typename Impl::enable_if<( - View::is_managed && - ! 
Impl::ViewAssignable< typename View::HostMirror , View >::value - ), typename View::HostMirror >::type -inline -create_mirror_view( const View & src ) -{ - return create_mirror( src ); -} - -//---------------------------------------------------------------------------- - -/** \brief Resize a view with copying old data to new data at the corresponding indices. */ -template< class T , class L , class D , class M , class S > -inline -void resize( View & v , - const typename Impl::enable_if< ViewTraits::is_managed , size_t >::type n0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) -{ - typedef View view_type ; - typedef typename view_type::memory_space memory_space ; - - const std::string label = memory_space::query_label( v.ptr_on_device() ); - - view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 ); - - Impl::ViewRemap< view_type , view_type >( v_resized , v ); - - v = v_resized ; -} - -/** \brief Reallocate a view without copying old data to new data */ -template< class T , class L , class D , class M , class S > -inline -void realloc( View & v , - const typename Impl::enable_if< ViewTraits::is_managed , size_t >::type n0 , - const size_t n1 = 0 , - const size_t n2 = 0 , - const size_t n3 = 0 , - const size_t n4 = 0 , - const size_t n5 = 0 , - const size_t n6 = 0 , - const size_t n7 = 0 ) -{ - typedef View view_type ; - typedef typename view_type::memory_space memory_space ; - - // Query the current label and reuse it. - const std::string label = memory_space::query_label( v.ptr_on_device() ); - - v = view_type(); // deallocate first, if the only view to memory. - v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 ); -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} }; - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst , src , arg0 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 , class ArgType2 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2, arg3 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - 
class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2, arg3, arg4 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2, arg3, arg4, arg5 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 ); - - return dst ; -} - -template< class DstViewType , - class T , class L , class D , class M , class S , - class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 , - class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 > -KOKKOS_INLINE_FUNCTION -DstViewType -subview( const View & src , - const ArgType0 & arg0 , - const ArgType1 & arg1 , - const ArgType2 & arg2 , - const ArgType3 & arg3 , - const ArgType4 & arg4 , - const ArgType5 & arg5 , - const ArgType6 & arg6 , - const ArgType7 & arg7 ) -{ - DstViewType dst ; - - Impl::ViewAssignment( dst, src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 ); - - return dst ; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif - diff --git a/kokkos/kokkos/core/src/Kokkos_hwloc.hpp b/kokkos/kokkos/core/src/Kokkos_hwloc.hpp deleted file mode 100644 index e7615ca..0000000 --- a/kokkos/kokkos/core/src/Kokkos_hwloc.hpp +++ /dev/null @@ -1,175 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_HWLOC_HPP -#define KOKKOS_HWLOC_HPP - -#include - -namespace Kokkos { - -/** \brief Minimal subset of logical 'hwloc' functionality available - * from http://www.open-mpi.org/projects/hwloc/. - * - * The calls are NOT thread safe in order to avoid mutexes, - * memory allocations, or other actions which could give the - * runtime system an opportunity to migrate the threads or - * touch allocated memory during the function calls. - * - * All calls to these functions should be performed by a thread - * when it has guaranteed exclusive access; e.g., for OpenMP - * within a 'critical' region. - */ -namespace hwloc { - -/** \brief Query if hwloc is available */ -bool available(); - -/** \brief Query number of available NUMA regions. - * This will be less than the hardware capacity - * if the MPI process is pinned to a NUMA region. - */ -unsigned get_available_numa_count(); - -/** \brief Query number of available cores per NUMA regions. - * This will be less than the hardware capacity - * if the MPI process is pinned to a set of cores. - */ -unsigned get_available_cores_per_numa(); - -/** \brief Query number of available "hard" threads per core; i.e., hyperthreads */ -unsigned get_available_threads_per_core(); - - -/** \brief Query the core topology of ( NUMA x Core/NUMA ). - * - * The topology is limited by the process binding, - * which may have been set by MPI. NUMA rank #0 - * contains the core on which the process / master thread - * is running. The master thread should only be bound - * to its original NUMA rank - because moving it to - * a different NUMA rank will displace it from all of - * the memory which it has already touched. - */ -std::pair get_core_topology(); - -/** \brief Number of concurrent threads per core. - * - * This typically reflects the number of hyperthreads - * the core can support. - */ -unsigned get_core_capacity(); - -} /* namespace hwloc */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -// Internal functions for binding persistent spawned threads. 
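// (editor's sketch, not part of the original header) Typical use of the queries above
// together with the binding helpers declared below.  The std::pair template arguments,
// stripped in this diff, are assumed to be <unsigned,unsigned> as ( NUMA , core/NUMA ):
//
//   if ( Kokkos::hwloc::available() ) {
//     const std::pair<unsigned,unsigned> core_topo = Kokkos::hwloc::get_core_topology();
//     const std::pair<unsigned,unsigned> core_use  = Kokkos::hwloc::use_core_topology( thread_count );
//     // each spawned thread then pins itself with bind_this_thread( coordinate ),
//     // choosing coordinates inside 'core_use'.
//   }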
- -namespace Kokkos { -namespace hwloc { - -/** \brief Determine best use of cores for a given thread count */ -std::pair use_core_topology( const unsigned thread_count ); - -/** \brief Query core-coordinate of the current thread - * with respect to the core_topology. - * - * As long as the thread is running within the - * process binding the following condition holds. - * - * core_coordinate.first < core_topology.first - * core_coordinate.second < core_topology.second - */ -std::pair get_this_thread_coordinate(); - -/** \brief Bind the current thread to a core. */ -bool bind_this_thread( const std::pair ); - -/** \brief Bind the current thread to one of the cores in the list. - * Set that entry to (~0,~0) and return the index. - * If binding fails return ~0. - */ -unsigned bind_this_thread( const unsigned coordinate_count , - std::pair coordinate[] ); - -/** \brief Unbind the current thread back to the original process binding */ -bool unbind_this_thread(); - -void thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - std::pair thread_coord[] ); - -void thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - const std::pair master_coord , - std::pair thread_coord[] ); - -} /* namespace hwloc */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -void host_thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - std::pair thread_coord[] ); - -void host_thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - const std::pair master_coord , - std::pair thread_coord[] ); - -} -} - -#endif /* #define KOKKOS_HWLOC_HPP */ - diff --git a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp deleted file mode 100644 index b26f9fa..0000000 --- a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ /dev/null @@ -1,412 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_OPENMP_PARALLEL_HPP -#define KOKKOS_OPENMP_PARALLEL_HPP - -#include - -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelFor< FunctorType , WorkSpec , ::Kokkos::OpenMP > -{ -public: - - inline - ParallelFor( const FunctorType & functor , const size_t work_count ) - { - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - const std::pair< size_t , size_t > range = exec.work_range( work_count ); - - for ( size_t iwork = range.first ; iwork < range.second ; ++iwork ) { - functor( iwork ); - } - } -/* END #pragma omp parallel */ - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelReduce< FunctorType , WorkSpec , Kokkos::OpenMP > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - inline - ParallelReduce( const FunctorType & functor , - const size_t work_count , - pointer_type result = 0 ) - { - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); - - OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) ); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - const std::pair range = exec.work_range( work_count ); - - typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); - - functor.init( update ); - - for ( size_t iw = range.first ; iw < range.second ; ++iw ) { - functor( iw , update ); - } - } -/* END #pragma omp parallel */ - - { - const int n = omp_get_max_threads(); - const pointer_type ptr = pointer_type( OpenMPexec::get_thread(0)->reduce_base() ); - typename Reduce::reference_type update = Reduce::reference( ptr ); - - for ( int i = 1 ; i < n ; ++i ) { - functor.join( update , Reduce::reference( OpenMPexec::get_thread(i)->reduce_base() ) ); - } - - Reduce::final( functor , ptr ); - - if ( result ) { - const int n = Reduce::value_count( functor ); - - for ( int i = 0 ; i < n ; ++i ) { result[i] = ptr[i] ; } - } - } - } - - void wait() {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- 
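// (editor's sketch, not part of the original header) The ParallelReduce specialization
// above drives a functor through the classic init / operator() / join interface that
// ReduceAdapter expects.  A minimal sum functor under that assumption, with a
// hypothetical name, might look like:

struct ExampleSumFunctor {
  typedef double value_type ;

  KOKKOS_INLINE_FUNCTION
  void init( value_type & update ) const { update = 0 ; }

  KOKKOS_INLINE_FUNCTION
  void operator()( const size_t iwork , value_type & update ) const
    { update += double( iwork ); }

  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & update ,
             const volatile value_type & input ) const
    { update += input ; }
};

// Usage sketch: Kokkos::parallel_reduce( work_count , ExampleSumFunctor() , result );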
-//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType , class WorkSpec > -class ParallelScan< FunctorType , WorkSpec , Kokkos::OpenMP > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - inline - ParallelScan( const FunctorType & functor , const size_t work_count ) - { - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan"); - OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan"); - - OpenMPexec::resize_reduce_scratch( 2 * Reduce::value_size( functor ) ); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - const std::pair range = exec.work_range( work_count ); - - typename Reduce::reference_type update = - Reduce::reference( pointer_type( exec.reduce_base() ) + Reduce::value_count( functor ) ); - - functor.init( update ); - - for ( size_t iw = range.first ; iw < range.second ; ++iw ) { - functor( iw , update , false ); - } - } -/* END #pragma omp parallel */ - - { - const unsigned thread_count = omp_get_max_threads(); - const unsigned value_count = Reduce::value_count( functor ); - const unsigned team_max = OpenMP::team_max(); - - pointer_type ptr_prev = 0 ; - - for ( unsigned rank = 0 ; rank < thread_count ; ++rank ) { - const unsigned league_rank = rank / team_max ; - const unsigned team_rank = rank % team_max ; - - pointer_type ptr = pointer_type( OpenMPexec::find_thread(league_rank,team_rank)->reduce_base() ); - - if ( rank ) { - for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; } - functor.join( Reduce::reference( ptr + value_count ) , Reduce::reference( ptr ) ); - } - else { - functor.init( Reduce::reference( ptr ) ); - } - - ptr_prev = ptr ; - } - } - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - const std::pair range = exec.work_range( work_count ); - - typename Reduce::reference_type update = - Reduce::reference( pointer_type( exec.reduce_base() ) ); - - for ( size_t iw = range.first ; iw < range.second ; ++iw ) { - functor( iw , update , false ); - } - } -/* END #pragma omp parallel */ - - } - - void wait() {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( KOKKOS_USE_PRAGMA_SIMD ) - -namespace Kokkos { -namespace Impl { - -template< class FunctorType > -class ParallelReduce< FunctorType , VectorParallel , ::Kokkos::OpenMP > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - inline - ParallelReduce( const FunctorType & functor , - const size_t work_count , - pointer_type result = 0 ) - { - typedef integral_constant< size_t , OpenMPexec::VECTOR_LENGTH > vector_length ; - typedef integral_constant< size_t , OpenMPexec::VECTOR_LENGTH - 1 > vector_mask ; - - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce"); - - OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) * vector_length::value ); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - const pointer_type ptr = pointer_type( exec.reduce_base() ); - - const std::pair range = exec.work_range( work_count ); - -#pragma simd 
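// (editor's note) one accumulator per vector lane: lane 'iv' owns the slot at
// ptr + iv * Reduce::value_count( functor ), initialized in this loop and joined
// back into lane 0 after the strip-mined work loop.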
-#pragma ivdep - for ( size_t iv = 0 ; iv < vector_length::value ; ++iv ) { - functor.init( Reduce::reference( ptr + iv * Reduce::value_count( functor ) ) ); - } - -#pragma simd vectorlength( vector_length::value ) -#pragma ivdep - for ( size_t iw = range.first ; iw < range.second ; ++iw ) { - functor( iw , Reduce::reference( ptr + ( iw & vector_mask::value ) * Reduce::value_count( functor ) ) ); - } - - for ( size_t iv = 1 ; iv < vector_length::value ; ++iv ) { - functor.join( Reduce::reference( ptr ) , - Reduce::reference( ptr + iv * Reduce::value_count( functor ) ) ); - } - } -/* END #pragma omp parallel */ - - { - const int n = omp_get_max_threads(); - const pointer_type ptr = pointer_type( OpenMPexec::get_thread(0)->reduce_base() ); - - for ( int i = 1 ; i < n ; ++i ) { - functor.join( Reduce::reference( ptr ) , - Reduce::reference( OpenMPexec::get_thread(i)->reduce_base() ) ); - } - - Reduce::final( functor , ptr ); - - if ( result ) { - const int n = Reduce::value_count( functor ); - - for ( int i = 0 ; i < n ; ++i ) { result[i] = ptr[i] ; } - } - } - } - - void wait() {} -}; - -} // namespace Impl -} // namespace Kokkos - -#endif /* #if defined( KOKKOS_USE_PRAGMA_SIMD ) */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class FunctorType > -class ParallelFor< FunctorType , ParallelWorkRequest , ::Kokkos::OpenMP > -{ -public: - - inline - ParallelFor( const FunctorType & functor , - const ParallelWorkRequest & work ) - { - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for"); - OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for"); - - OpenMPexec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - for ( exec.team_work_init( work.league_size ) ; exec.team_work_avail() ; exec.team_work_next() ) { - functor( OpenMP( exec ) ); - } - } -/* END #pragma omp parallel */ - } - - void wait() {} -}; - -template< class FunctorType > -class ParallelReduce< FunctorType , ParallelWorkRequest , ::Kokkos::OpenMP > -{ -public: - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - inline - ParallelReduce( const FunctorType & functor , - const ParallelWorkRequest & work , - pointer_type result = 0 ) - { - OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce"); - - OpenMPexec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); - OpenMPexec::resize_reduce_scratch( Reduce::value_size( functor ) ); - -#pragma omp parallel - { - OpenMPexec & exec = * OpenMPexec::get_thread( omp_get_thread_num() ); - - typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); - - functor.init( update ); - - for ( exec.team_work_init( work.league_size ) ; exec.team_work_avail() ; exec.team_work_next() ) { - functor( OpenMP( exec ) , update ); - } - } -/* END #pragma omp parallel */ - - { - const int n = omp_get_max_threads(); - const pointer_type ptr = pointer_type( OpenMPexec::get_thread(0)->reduce_base() ); - - for ( int i = 1 ; i < n ; ++i ) { - functor.join( Reduce::reference( ptr ) , - Reduce::reference( OpenMPexec::get_thread(i)->reduce_base() ) ); - } - - Reduce::final( functor , ptr ); - - if ( result ) { - const int n = Reduce::value_count( functor ); - - for ( int i = 0 ; i < n ; ++i ) { result[i] = 
ptr[i] ; } - } - } - } - - void wait() {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* KOKKOS_OPENMP_PARALLEL_HPP */ - diff --git a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp deleted file mode 100644 index cdc9f47..0000000 --- a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp +++ /dev/null @@ -1,395 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { -namespace { - -int kokkos_omp_in_parallel(); - -int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 ); - -int kokkos_omp_in_parallel() -{ - return omp_in_parallel() && ! 
kokkos_omp_in_critical_region ; -} - -} // namespace -} // namespace Impl -} // namespace Kokkos - - -namespace Kokkos { -namespace Impl { - -OpenMPexec * OpenMPexec::m_thread[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 }; - -OpenMPexec * OpenMPexec::find_thread( const int init_league_rank , - const int team_rank ) -{ - for ( unsigned i = 0 ; i < OpenMPexec::MAX_THREAD_COUNT && 0 != m_thread[i] ; ++i ) { - if ( init_league_rank == m_thread[i]->m_init_league_rank && - team_rank == m_thread[i]->m_team_rank ) { - return m_thread[i] ; - } - } - return (OpenMPexec *) 0 ; -} - -OpenMPexec::OpenMPexec( const unsigned league_rank , - const unsigned league_size , - const unsigned team_rank , - const unsigned team_size ) - : m_reduce(0) - , m_shared(0) - , m_shared_end(0) - , m_shared_iter(0) - , m_state_team( OpenMPexec::Active ) - , m_fan_team_size(0) - , m_team_rank( team_rank ) - , m_team_size( team_size ) - , m_init_league_rank( league_rank ) - , m_init_league_size( league_size ) - , m_work_league_rank( league_rank ) - , m_work_league_end( league_rank + 1 ) - , m_work_league_size( league_size ) -{ - for ( int i = 0 ; i < MAX_FAN_COUNT ; ++i ) { m_fan_team[i] = 0 ; } -} - -OpenMPexec::~OpenMPexec() {} - - -void OpenMPexec::verify_is_process( const char * const label ) -{ - if ( omp_in_parallel() ) { - std::string msg( label ); - msg.append( " ERROR: in parallel" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } -} - -void OpenMPexec::verify_initialized( const char * const label ) -{ - if ( 0 == m_thread[0] ) { - std::string msg( label ); - msg.append( " ERROR: not initialized" ); - Kokkos::Impl::throw_runtime_exception( msg ); - } -} - -void OpenMPexec::resize_reduce_scratch( size_t size ) -{ - static size_t s_size = 0 ; - - verify_initialized( "OpenMP::resize_reduce_scratch" ); - verify_is_process( "OpenMP::resize_reduce_scratch" ); - - if ( size ) { size += REDUCE_TEAM_BASE ; } - - const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; - - if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; - - if ( ( 0 == size && 0 != s_size ) || s_size < size ) { - -#pragma omp parallel - { - OpenMPexec & th = * m_thread[ omp_get_thread_num() ]; - -#pragma omp critical - { - kokkos_omp_in_critical_region = 1 ; - - if ( th.m_reduce ) { - HostSpace::decrement( th.m_reduce ); - th.m_reduce = 0 ; - } - - if ( size ) { - th.m_reduce = HostSpace::allocate( "openmp_reduce_scratch" , typeid(unsigned char) , 1 , size ); - } - kokkos_omp_in_critical_region = 0 ; - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - } - - s_size = size ; -} - -void OpenMPexec::resize_shared_scratch( size_t size ) -{ - static size_t s_size = 0 ; - - verify_initialized( "OpenMP::resize_shared_scratch" ); - verify_is_process( "OpenMP::resize_shared_scratch" ); - - const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; - - if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; - - if ( ( 0 == size && 0 != s_size ) || s_size < size ) { - -#pragma omp parallel - { - OpenMPexec & th = * m_thread[ omp_get_thread_num() ]; - - if ( 0 == th.m_team_rank ) { -#pragma omp critical - { - kokkos_omp_in_critical_region = 1 ; - - if ( th.m_shared ) { - HostSpace::decrement( th.m_shared ); - th.m_shared = 0 ; - } - - if ( size ) { - th.m_shared = HostSpace::allocate( "openmp_shared_scratch" , typeid(unsigned char) , 1 , size ); - th.m_shared_end = size ; - } - - kokkos_omp_in_critical_region = 0 ; - } -/* END #pragma omp critical */ - // Push to threads in the same team - - for ( int i = 0 ; i < omp_get_num_threads() ; 
++i ) { - if ( th.m_init_league_rank == m_thread[i]->m_init_league_rank ) { - m_thread[i]->m_shared = th.m_shared ; - m_thread[i]->m_shared_end = th.m_shared_end ; - } - } - } - } -/* END #pragma omp parallel */ - } - - s_size = size ; -} - -void * OpenMPexec::get_shmem( const int size ) -{ - // m_shared_iter is in bytes, convert to integer offsets - const int offset = m_shared_iter >> power_of_two::value ; - - m_shared_iter += size ; - - if ( m_shared_end < m_shared_iter ) { - Kokkos::Impl::throw_runtime_exception( std::string("OpenMPexec::get_shmem FAILED : exceeded shared memory size" ) ); - } - - return ((int*)m_shared) + offset ; -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -unsigned OpenMP::league_max() -{ - Impl::OpenMPexec::verify_initialized("Kokkos::OpenMP::league_max" ); - Impl::OpenMPexec::verify_is_process("Kokkos::OpenMP::league_max" ); - - return unsigned( std::numeric_limits::max() ); -} - -unsigned OpenMP::team_max() -{ - Impl::OpenMPexec::verify_initialized("Kokkos::OpenMP::team_max" ); - Impl::OpenMPexec::verify_is_process("Kokkos::OpenMP::team_max" ); - - return Impl::OpenMPexec::m_thread[0]->m_team_size ; -} - -//---------------------------------------------------------------------------- - -int OpenMP::is_initialized() -{ return 0 != Impl::OpenMPexec::m_thread[0]; } - -void OpenMP::initialize( const unsigned team_count , - const unsigned threads_per_team , - const unsigned numa_count , - const unsigned cores_per_numa ) -{ - Impl::OpenMPexec::verify_is_process("Kokkos::OpenMP::initialize" ); - - if ( Impl::OpenMPexec::m_thread[0] ) { - Kokkos::Impl::throw_runtime_exception("Kokkos:OpenMP::initialize ERROR : already initialized" ); - } - - const unsigned thread_count = team_count * threads_per_team ; - - omp_set_num_threads( thread_count ); - - if ( thread_count == 0 ) return ; - - //---------------------------------------- - // Spawn threads: - - // Verify OMP interaction: - { - if ( int(thread_count) != omp_get_max_threads() ) { - Kokkos::Impl::throw_runtime_exception("Kokkos:OpenMP::initialize ERROR : failed omp_get_max_threads()" ); - } - -#pragma omp parallel - { - if ( int(thread_count) != omp_get_num_threads() ) { - Kokkos::Impl::throw_runtime_exception("Kokkos:OpenMP::initialize ERROR : failed omp_get_num_threads()" ); - } - } - } - - //---------------------------------------- - - const bool use_hwloc = ( 1 < thread_count ) && Kokkos::hwloc::available(); - - const std::pair - hwloc_core_topo( Kokkos::hwloc::get_available_numa_count() , - Kokkos::hwloc::get_available_cores_per_numa() ); - - std::pair team_topology( team_count , threads_per_team ); - std::pair use_core_topology( numa_count , cores_per_numa ); - std::pair master_coord = Kokkos::hwloc::get_this_thread_coordinate(); - - std::pair threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ]; - - if ( use_hwloc ) { - - if ( 0 == use_core_topology.first && 0 == use_core_topology.second ) { - use_core_topology = Kokkos::hwloc::use_core_topology( thread_count ); - } - - Kokkos::hwloc::thread_mapping( team_topology , use_core_topology , hwloc_core_topo , master_coord , threads_coord ); - } - - // Bind threads and allocate thread data: - -#pragma omp parallel - { -#pragma omp critical - { - // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region. 
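      // (Editor's illustrative note, not in the original file.)
      // Example of the decomposition computed below, assuming team_count = 2
      // and threads_per_team = 4 (thread_count = 8): the OpenMP thread whose
      // reversed rank works out to thread_rank = 5 gets
      // league_rank = 5 / 4 = 1 and team_rank = 5 % 4 = 1, so each run of
      // 4 consecutive reversed ranks forms one team.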
- // Call to 'new' may not be thread safe as well. - - // Reverse the rank for threads so that the scan operation reduces to the highest rank thread. - - const unsigned omp_rank = omp_get_thread_num(); - const unsigned thread_r = use_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ; - const unsigned thread_rank = thread_count - ( thread_r + 1 ); - const unsigned league_rank = thread_rank / threads_per_team ; - const unsigned team_rank = thread_rank % threads_per_team ; - - Impl::OpenMPexec::m_thread[ omp_rank ] = new Impl::OpenMPexec( league_rank , team_count , team_rank , threads_per_team ); - } -/* END #pragma omp critical */ - } -/* END #pragma omp parallel */ - - // Set threads' fan_team relationships: - -#pragma omp parallel - { - Impl::OpenMPexec & th = * Impl::OpenMPexec::m_thread[ omp_get_thread_num() ]; - - // Intra-team fan-in with root as the highest rank thread: - const int team_r = th.m_team_size - ( th.m_team_rank + 1 ); - - for ( int n = 1 ; ( team_r + n < th.m_team_size ) && ( 0 == ( n & team_r ) ) ; n <<= 1 ) { - th.m_fan_team[ th.m_fan_team_size++ ] = - Impl::OpenMPexec::find_thread( th.m_init_league_rank , th.m_team_size - ( team_r + n + 1 ) ); - } - // Intra-team scan: - { - int n ; - for ( n = 1 ; 0 == ( team_r & n ) && ( team_r + n < th.m_team_size ) ; n <<= 1 ); - if ( ( team_r & n ) && ( team_r + n < th.m_team_size ) ) { - th.m_fan_team[ th.m_fan_team_size ] = - Impl::OpenMPexec::find_thread( th.m_init_league_rank , th.m_team_size - ( team_r + n + 1 ) ); - } - else { - th.m_fan_team[ th.m_fan_team_size ] = 0 ; - } - } - } -/* END #pragma omp parallel */ - - Impl::OpenMPexec::resize_reduce_scratch( 4096 - Impl::OpenMPexec::REDUCE_TEAM_BASE ); - Impl::OpenMPexec::resize_shared_scratch( 4096 ); -} - -//---------------------------------------------------------------------------- - -void OpenMP::finalize() -{ - Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" ); - Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" ); - - Impl::OpenMPexec::resize_reduce_scratch(0); - Impl::OpenMPexec::resize_shared_scratch(0); - - for ( int i = 0 ; i < Impl::OpenMPexec::MAX_THREAD_COUNT ; ++i ) { - if ( Impl::OpenMPexec::m_thread[i] ) { delete Impl::OpenMPexec::m_thread[i] ; } - Impl::OpenMPexec::m_thread[i] = 0 ; - } - - omp_set_num_threads(0); - - hwloc::unbind_this_thread(); -} - -} // namespace Kokkos - diff --git a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp deleted file mode 100644 index 7d61de5..0000000 --- a/kokkos/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp +++ /dev/null @@ -1,310 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_OPENMPEXEC_HPP -#define KOKKOS_OPENMPEXEC_HPP - -#include -#include - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMP thread execution */ - -class OpenMPexec { -public: - - // Fan array has log_2(NT) reduction threads plus 2 scan threads - // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) }; - enum { VECTOR_LENGTH = 8 }; - enum { REDUCE_TEAM_BASE = 512 }; - - /** \brief Thread states for team synchronization */ - enum { Active , Rendezvous , ReductionAvailable , ScanAvailable }; - -private: - - friend class Kokkos::OpenMP ; - - void * m_reduce ; ///< Reduction memory - void * m_shared ; ///< Shared memory - int m_shared_end ; - int m_shared_iter ; - int volatile m_state_team ; - int m_fan_team_size ; - int m_team_rank ; - int m_team_size ; - int m_init_league_rank ; - int m_init_league_size ; - - int m_work_league_rank ; - int m_work_league_end ; - int m_work_league_size ; - - OpenMPexec * m_fan_team[ MAX_FAN_COUNT ]; - - static OpenMPexec * m_thread[ MAX_THREAD_COUNT ]; - - OpenMPexec(); - OpenMPexec( const OpenMPexec & ); - OpenMPexec & operator = ( const OpenMPexec & ); - -public: - - void * reduce_team() const { return m_reduce ; } - void * reduce_base() const { return ((unsigned char *)m_reduce) + REDUCE_TEAM_BASE ; } - - ~OpenMPexec(); - - OpenMPexec( const unsigned league_rank , - const unsigned league_size , - const unsigned team_rank , - const unsigned team_size ); - - static void finalize(); - - static void initialize( const unsigned team_count , - const unsigned threads_per_team , - const unsigned numa_count , - const unsigned cores_per_numa ); - - static void verify_is_process( const char * const ); - static void verify_initialized( const char * const ); - - static void resize_reduce_scratch( size_t ); - static void resize_shared_scratch( size_t ); - - inline static - OpenMPexec * get_thread( const unsigned entry ) { return m_thread[ entry ] ; } - - static - OpenMPexec * find_thread( const int init_league_rank , - const int team_rank ); - - //---------------------------------------------------------------------- - /** \brief Compute a range of work for this thread's rank */ - - inline - std::pair< size_t , size_t > - work_range( const size_t work_count ) const - { - typedef 
integral_constant< size_t , VECTOR_LENGTH - 1 > work_mask ; - - const size_t thread_size = m_team_size * m_work_league_size ; - - // work per thread rounded up and aligned to vector length: - - const size_t work_per_thread = - ( ( ( work_count + thread_size - 1 ) / thread_size ) + work_mask::value ) & ~(work_mask::value); - - const size_t work_begin = std::min( work_count , work_per_thread * ( m_team_rank + m_team_size * m_work_league_rank ) ); - const size_t work_end = std::min( work_count , work_per_thread + work_begin ); - - return std::pair< size_t , size_t >( work_begin , work_end ); - } - - //---------------------------------------------------------------------- - - void * get_shmem( const int ); - - void team_barrier() - { - const bool not_root = m_team_rank + 1 < m_team_size ; - - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - spinwait( m_fan_team[i]->m_state_team , OpenMPexec::Active ); - } - if ( not_root ) { - m_state_team = Rendezvous ; - spinwait( m_state_team , OpenMPexec::Rendezvous ); - } - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - m_fan_team[i]->m_state_team = OpenMPexec::Active ; - } - } - - // Called within a parallel region - template< class ArgType > - inline - ArgType team_scan( const ArgType & value , ArgType * const global_accum = 0 ) - { - // Sequence of m_state_team states: - // 0) Active : entry and exit state - // 1) ReductionAvailable : reduction value available, waiting for scan value - // 2) ScanAvailable : reduction value available, scan value available - // 3) Rendezvous : broadcasting global iinter-team accumulation value - - // Make sure there is enough scratch space: - typedef typename if_c< 2 * sizeof(ArgType) < REDUCE_TEAM_BASE , ArgType , void >::type type ; - - const bool not_root = m_team_rank + 1 < m_team_size ; - - type * const work_value = (type*) reduce_team(); - - // OpenMPexec::Active == m_state_team - - work_value[0] = value ; - - // Fan-in reduction, wait for source thread to complete it's fan-in reduction. - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - OpenMPexec & th = *m_fan_team[i]; - - // Wait for source thread to exit Active state. - Impl::spinwait( th.m_state_team , OpenMPexec::Active ); - // Source thread is 'ReductionAvailable' or 'ScanAvailable' - work_value[0] += ((volatile type*)th.reduce_team())[0]; - } - - work_value[1] = work_value[0] ; - - if ( not_root ) { - - m_state_team = OpenMPexec::ReductionAvailable ; // Reduction value is available. - - // Wait for contributing threads' scan value to be available. - if ( m_fan_team[ m_fan_team_size ] ) { - OpenMPexec & th = *m_fan_team[ m_fan_team_size ] ; - - // Wait: Active -> ReductionAvailable - Impl::spinwait( th.m_state_team , OpenMPexec::Active ); - // Wait: ReductionAvailable -> ScanAvailable: - Impl::spinwait( th.m_state_team , OpenMPexec::ReductionAvailable ); - - work_value[1] += ((volatile type*)th.reduce_team())[1] ; - } - - m_state_team = OpenMPexec::ScanAvailable ; // Scan value is available. - } - else { - // Root thread add team's total to global inter-team accumulation - work_value[0] = global_accum ? 
atomic_fetch_add( global_accum , work_value[0] ) : 0 ; - } - - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - OpenMPexec & th = *m_fan_team[i]; - // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait( th.m_state_team , OpenMPexec::ReductionAvailable ); - // Wait: ScanAvailable -> Rendezvous - Impl::spinwait( th.m_state_team , OpenMPexec::ScanAvailable ); - } - - // All fan-in threads are in the ScanAvailable state - if ( not_root ) { - m_state_team = OpenMPexec::Rendezvous ; - Impl::spinwait( m_state_team , OpenMPexec::Rendezvous ); - } - - // Broadcast global inter-team accumulation value - volatile type & global_val = work_value[0] ; - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - OpenMPexec & th = *m_fan_team[i]; - ((volatile type*)th.reduce_team())[0] = global_val ; - th.m_state_team = OpenMPexec::Active ; - } - // Exclusive scan, subtract contributed value - return global_val + work_value[1] - value ; - } - - - inline - void team_work_init( int work_league_size ) - { - const int work_per_team = ( work_league_size + m_init_league_size - 1 ) / m_init_league_size ; - m_work_league_rank = std::min( work_league_size , work_per_team * m_init_league_rank ); - m_work_league_end = std::min( work_league_size , work_per_team + m_work_league_rank ); - m_work_league_size = work_league_size ; - } - - inline - bool team_work_avail() - { - m_shared_iter = 0 ; - const bool avail = m_work_league_rank < m_work_league_end ; - if ( ! avail ) { - m_work_league_rank = m_init_league_rank ; - m_work_league_end = m_init_league_rank + 1 ; - m_work_league_size = m_init_league_size ; - } - return avail ; - } - - inline - void team_work_next() - { if ( ++m_work_league_rank < m_work_league_end ) team_barrier(); } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -inline OpenMP::OpenMP( Impl::OpenMPexec & e ) : m_exec(e) {} - -inline int OpenMP::league_rank() const { return m_exec.m_work_league_rank ; } -inline int OpenMP::league_size() const { return m_exec.m_work_league_size ; } -inline int OpenMP::team_rank() const { return m_exec.m_team_rank ; } -inline int OpenMP::team_size() const { return m_exec.m_team_size ; } - -inline void OpenMP::team_barrier() { m_exec.team_barrier() ; } - -inline void * OpenMP::get_shmem( const int size ) { return m_exec.get_shmem(size) ; } - -template< typename Type > -inline Type OpenMP::team_scan( const Type & value ) -{ return m_exec.team_scan( value ); } - -template< typename TypeLocal , typename TypeGlobal > -inline TypeGlobal OpenMP::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) -{ return m_exec.template team_scan< TypeGlobal >( value , global_accum ); } - -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */ - diff --git a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp deleted file mode 100644 index c035ade..0000000 --- a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ /dev/null @@ -1,946 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { -namespace { - -ThreadsExec s_threads_process ; -ThreadsExec * s_threads_exec[ ThreadsExec::MAX_THREAD_COUNT ]; -std::pair s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ]; -std::string s_exception_msg ; - -unsigned s_threads_count = 0 ; -unsigned s_threads_reduce_size = 0 ; -unsigned s_threads_shared_size = 0 ; - -void (* volatile s_current_function)( ThreadsExec & , const void * ); -const void * volatile s_current_function_arg = 0 ; - -struct Sentinel { - Sentinel() - { - HostSpace::register_in_parallel( ThreadsExec::in_parallel ); - } - - ~Sentinel() - { - if ( s_threads_count || - s_threads_reduce_size || - s_threads_shared_size || - s_current_function || - s_current_function_arg || - s_threads_exec[0] ) { - std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ; - } - } -}; - -} // namespace -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -void ThreadsExec::driver(void) -{ - // If hardware locality library unavailable then pass in the rank. - - size_t thread_rank = (size_t) s_current_function_arg ; - - if ( s_threads_count <= thread_rank ) { - thread_rank = Kokkos::hwloc::bind_this_thread( s_threads_count , s_threads_coord ); - } - - if ( s_threads_count <= thread_rank || 0 != ((ThreadsExec * volatile *)s_threads_exec)[ thread_rank ] ) { - - // An error occured. 
Inform process that thread is terminating - s_threads_process.m_state = ThreadsExec::Terminating ; - - return ; - } - - { - ThreadsExec this_thread ; - - this_thread.m_state = ThreadsExec::Active ; - - // Try to protect against cache coherency failure by casting to volatile. - ((ThreadsExec * volatile *)s_threads_exec)[ thread_rank ] = & this_thread ; - // Really need a memory fence here. - - // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_state = ThreadsExec::Active ; - - while ( ThreadsExec::Active == this_thread.m_state ) { - -#if 0 - try { - // Call work function - (*s_current_function)( this_thread , s_current_function_arg ); - } - catch( const std::exception & x ) { - std::ostringstream msg ; - msg << "Kokkos::Threads[" << thread_rank << "] Uncaught exeception : " << x.what() << std::endl ; - s_exception_msg.append( msg.str() ); - } - catch( ... ) { - std::ostringstream msg ; - msg << "Kokkos::Threads[" << thread_rank << "] Uncaught exeception" << std::endl ; - s_exception_msg.append( msg.str() ); - } -#else - (*s_current_function)( this_thread , s_current_function_arg ); -#endif - - // Deactivate thread and wait for reactivation - this_thread.m_state = ThreadsExec::Inactive ; - wait_yield( this_thread.m_state , ThreadsExec::Inactive ); - } - - s_threads_process.m_state = ThreadsExec::Terminating ; - - ((ThreadsExec * volatile * )s_threads_exec)[ thread_rank ] = 0 ; - } -} - -void execute_function_noop( ThreadsExec & , const void * ) {} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -ThreadsExec::~ThreadsExec() -{ - m_reduce = 0 ; - m_shared = 0 ; - m_shared_end = 0 ; - m_shared_iter = 0 ; - m_state = ThreadsExec::Terminating ; - m_state_team = ThreadsExec::Inactive ; - m_fan_size = 0 ; - m_fan_team_size = 0 ; - - m_team_rank = 0 ; - m_team_size = 0 ; - m_init_league_rank = 0 ; - m_init_league_size = 0 ; - m_init_thread_rank = 0 ; - m_init_thread_size = 0 ; - - m_work_league_rank = 0 ; - m_work_league_end = 0 ; - m_work_league_size = 0 ; - - for ( unsigned i = 0 ; i < MAX_FAN_COUNT ; ++i ) { m_fan[i] = 0 ; } - for ( unsigned i = 0 ; i < MAX_FAN_COUNT ; ++i ) { m_fan_team[i] = 0 ; } -} - -ThreadsExec::ThreadsExec() - : m_reduce(0) - , m_shared(0) - , m_shared_end(0) - , m_shared_iter(0) - , m_state( ThreadsExec::Terminating ) - , m_state_team( ThreadsExec::Inactive ) - - , m_fan_size(0) - , m_fan_team_size(0) - - , m_team_rank(0) - , m_team_size(0) - , m_init_league_rank(0) - , m_init_league_size(0) - , m_init_thread_rank(0) - , m_init_thread_size(0) - - , m_work_league_rank(0) - , m_work_league_end(0) - , m_work_league_size(0) -{ - for ( unsigned i = 0 ; i < MAX_FAN_COUNT ; ++i ) { m_fan[i] = 0 ; } - for ( unsigned i = 0 ; i < MAX_FAN_COUNT ; ++i ) { m_fan_team[i] = 0 ; } - - if ( & s_threads_process == this ) { - m_state = ThreadsExec::Inactive ; - m_team_rank = 0 ; - m_team_size = 1 ; - m_init_league_rank = 0 ; - m_init_league_size = 1 ; - m_init_thread_rank = 0 ; - m_init_thread_size = 1 ; - - m_work_league_rank = 0 ; - m_work_league_end = 1 ; - m_work_league_size = 1 ; - } -} - -int ThreadsExec::get_thread_count() -{ - return s_threads_count ; -} - -ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank ) -{ - ThreadsExec * const th = - unsigned(init_thread_rank) < s_threads_count - ? 
s_threads_exec[ s_threads_count - ( init_thread_rank + 1 ) ] : 0 ; - - if ( 0 == th || th->m_init_thread_rank != init_thread_rank ) { - std::ostringstream msg ; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " - << "thread " << init_thread_rank << " of " << s_threads_count ; - if ( 0 == th ) { - msg << " does not exist" ; - } - else { - msg << " has wrong thread_rank " << th->m_init_thread_rank ; - } - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - return th ; -} - -// Set threads' team and initial league sizes. -// Set threads' global and team fan-in and scan relationsups. -// If the process thread is used then it is 's_threads_exec[0]' -// which we map to the maximum rank so that the scan's reduction -// places data on the proper thread. -void ThreadsExec::set_threads_relationships( const std::pair team_topo ) -{ - const unsigned league_size = team_topo.first ; - const unsigned team_size = team_topo.second ; - const unsigned thread_count = league_size * team_size ; - - for ( unsigned r = 0 ; r < thread_count ; ++r ) { - if ( s_threads_exec[r] == 0 ) { - Kokkos::Impl::throw_runtime_exception( std::string("ThreadsExec::set_threads_relationships FAILED : NULL entry" ) ); - } - } - - for ( unsigned league_r = 0 , th_r = 0 ; league_r < league_size ; ++league_r ) { - for ( unsigned team_r = 0 ; team_r < team_size ; ++team_r , ++th_r ) { - - ThreadsExec & th = * s_threads_exec[th_r] ; - - th.m_team_rank = team_size - ( team_r + 1 ); - th.m_team_size = team_size ; - th.m_init_league_rank = league_size - ( league_r + 1 ); - th.m_init_league_size = league_size ; - th.m_init_thread_rank = th.m_team_rank + team_size * th.m_init_league_rank ; - th.m_init_thread_size = team_size * league_size ; - - th.m_work_league_rank = league_r ; - th.m_work_league_end = league_r + 1 ; - th.m_work_league_size = team_topo.first ; - - th.m_fan_size = 0 ; - th.m_fan_team_size = 0 ; - - //------------------------------------ - // Intra-team reduction: - const unsigned team_begin = league_r * team_size ; - for ( int n = 1 ; ( team_r + n < team_size ) && ( 0 == ( n & team_r ) ) ; n <<= 1 , ++th.m_fan_team_size ) { - th.m_fan_team[ th.m_fan_team_size ] = s_threads_exec[ team_begin + team_r + n ]; - } - // Intra-team scan input: - { - unsigned n ; - for ( n = 1 ; 0 == ( team_r & n ) && ( team_r + n < team_size ) ; n <<= 1 ); - if ( ( team_r & n ) && ( team_r + n < team_size ) ) { - th.m_fan_team[ th.m_fan_team_size ] = s_threads_exec[ team_begin + team_r + n ]; - } - else { - th.m_fan_team[ th.m_fan_team_size ] = 0 ; - } - } - //------------------------------------ - // All-thread reduction: - for ( unsigned n = 1 ; ( th_r + n < thread_count ) && ( 0 == ( n & th_r ) ) ; n <<= 1 , ++th.m_fan_size ) { - th.m_fan[ th.m_fan_size ] = s_threads_exec[ th_r + n ]; - } - // All-thread Scan input: - { - unsigned n ; - for ( n = 1 ; 0 == ( th_r & n ) && ( th_r + n < thread_count ) ; n <<= 1 ); - if ( ( th_r & n ) && ( th_r + n < thread_count ) ) { - th.m_fan[ th.m_fan_size ] = s_threads_exec[ th_r + n ]; - } - else { - th.m_fan[ th.m_fan_size ] = 0 ; - } - } - th.m_fan[ th.m_fan_size + 1 ] = th_r + 1 < thread_count ? 
s_threads_exec[ th_r + 1 ] : 0 ; - //------------------------------------ - }} -} - -void ThreadsExec::execute_get_binding( ThreadsExec & exec , const void * ) -{ - const size_t init_thread_rank = exec.m_team_rank + exec.m_team_size * exec.m_init_league_rank ; - s_threads_coord[ init_thread_rank ] = Kokkos::hwloc::get_this_thread_coordinate(); -} - -void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * ) -{ - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_fan_size ; - - for ( int i = 0 ; i < n ; ++i ) { - Impl::spinwait( exec.m_fan[i]->m_state , ThreadsExec::Active ); - } - - exec.m_state = ThreadsExec::Inactive ; -} - -void ThreadsExec::execute_reduce_resize( ThreadsExec & exec , const void * ) -{ - if ( exec.m_reduce ) { - HostSpace::decrement( exec.m_reduce ); - exec.m_reduce = 0 ; - } - - if ( s_threads_reduce_size ) { - - exec.m_reduce = - HostSpace::allocate( "reduce_scratch_space" , typeid(unsigned char) , 1 , s_threads_reduce_size ); - - // Guaranteed multiple of 'unsigned' - - unsigned * ptr = (unsigned *)( exec.m_reduce ); - unsigned * const end = ptr + s_threads_reduce_size / sizeof(unsigned); - - // touch on this thread - while ( ptr < end ) *ptr++ = 0 ; - } -} - -void ThreadsExec::execute_shared_resize( ThreadsExec & exec , const void * ) -{ - const bool not_root = exec.m_team_rank + 1 < exec.m_team_size ; - - if ( not_root ) { - exec.m_shared = 0 ; - } - else { - - if ( exec.m_shared ) { - HostSpace::decrement( exec.m_shared ); - exec.m_shared = 0 ; - } - - if ( s_threads_shared_size ) { - - exec.m_shared = - HostSpace::allocate( "shared_scratch_space" , typeid(unsigned char) , 1 , s_threads_shared_size ); - - // Guaranteed multiple of 'unsigned' - - unsigned * ptr = (unsigned *)( exec.m_shared ); - unsigned * const end = ptr + s_threads_shared_size / sizeof(unsigned); - - // touch on this thread - while ( ptr < end ) *ptr++ = 0 ; - } - } - - exec.m_shared_end = s_threads_shared_size ; -} - -void * ThreadsExec::get_shmem( const int size ) -{ - // m_shared_iter is in bytes, convert to integer offsets - const int offset = m_shared_iter >> power_of_two::value ; - - m_shared_iter += size ; - - if ( m_shared_end < m_shared_iter ) { - Kokkos::Impl::throw_runtime_exception( std::string("ThreadsExec::get_shmem FAILED : exceeded shared memory size" ) ); - } - - return ((int*)m_shared) + offset ; -} - -} -} - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -void ThreadsExec::verify_is_process( const std::string & name , const bool initialized ) -{ - if ( ! is_process() ) { - std::string msg( name ); - msg.append( " FAILED : Called by a worker thread, can only be called by the master process." ); - Kokkos::Impl::throw_runtime_exception( msg ); - } - - if ( initialized && 0 == s_threads_count ) { - std::string msg( name ); - msg.append( " FAILED : Threads not initialized." ); - Kokkos::Impl::throw_runtime_exception( msg ); - } -} - -int ThreadsExec::in_parallel() -{ - // A thread function is in execution and - // the function argument is not the special threads process argument and - // the master process is a worker or is not the master process. - return s_current_function && - ( & s_threads_process != s_current_function_arg ) && - ( s_threads_process.m_team_size || ! 
is_process() ); -} - -// Wait for root thread to become inactive -void ThreadsExec::fence() -{ - if ( s_threads_count ) { - // Wait for the root thread to complete: - Impl::spinwait( s_threads_exec[0]->m_state , ThreadsExec::Active ); - - if ( s_exception_msg.size() ) { - Kokkos::Impl::throw_runtime_exception( s_exception_msg ); - } - } - - s_current_function = 0 ; - s_current_function_arg = 0 ; -} - -/** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg , int work_league_size ) -{ - verify_is_process("ThreadsExec::start" , false ); - - if ( s_current_function || s_current_function_arg ) { - Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) ); - } - - s_exception_msg.clear(); - - s_current_function = func ; - s_current_function_arg = arg ; - - if ( work_league_size ) { - const int work_per_team = ( work_league_size + s_threads_process.m_init_league_size - 1 ) - / s_threads_process.m_init_league_size ; - - for ( int i = s_threads_count ; 0 < i-- ; ) { - ThreadsExec & th = * s_threads_exec[i] ; - - th.m_work_league_rank = std::min( th.m_init_league_rank * work_per_team , work_league_size ); - th.m_work_league_end = std::min( th.m_work_league_rank + work_per_team , work_league_size ); - th.m_work_league_size = work_league_size ; - } - } - - // Activate threads: - for ( int i = s_threads_count ; 0 < i-- ; ) { - s_threads_exec[i]->m_state = ThreadsExec::Active ; - } - - if ( s_threads_process.m_team_size ) { - // Master process is the root thread: - (*func)( s_threads_process , arg ); - s_threads_process.m_state = ThreadsExec::Inactive ; - } -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::sleep() -{ - verify_is_process("ThreadsExec::sleep", true ); - - if ( & execute_sleep == s_current_function ) return false ; - - fence(); - - ThreadsExec::global_lock(); - - s_exception_msg.clear(); - - s_current_function = & execute_sleep ; - - // Activate threads: - for ( unsigned i = s_threads_count ; 0 < i ; ) { - s_threads_exec[--i]->m_state = ThreadsExec::Active ; - } - - return true ; -} - -bool ThreadsExec::wake() -{ - verify_is_process("ThreadsExec::wake", true ); - - if ( & execute_sleep != s_current_function ) return false ; - - ThreadsExec::global_unlock(); - - if ( s_threads_process.m_team_size ) { - execute_sleep( s_threads_process , 0 ); - s_threads_process.m_state = ThreadsExec::Inactive ; - } - - fence(); - - return true ; -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) ) -{ - s_exception_msg.clear(); - - s_current_function = func ; - s_current_function_arg = & s_threads_process ; - - const unsigned begin = s_threads_process.m_team_size ? 
1 : 0 ; - - for ( unsigned i = s_threads_count ; begin < i ; ) { - ThreadsExec & th = * s_threads_exec[ --i ]; - - th.m_state = ThreadsExec::Active ; - - wait_yield( th.m_state , ThreadsExec::Active ); - } - - if ( s_threads_process.m_team_size ) { - s_threads_process.m_state = ThreadsExec::Active ; - (*func)( s_threads_process , 0 ); - s_threads_process.m_state = ThreadsExec::Inactive ; - } - - s_current_function_arg = 0 ; - s_current_function = 0 ; -} - -//---------------------------------------------------------------------------- - -void * ThreadsExec::root_reduce_scratch() -{ - return s_threads_process.reduce_base(); -} - -void ThreadsExec::resize_reduce_scratch( size_t size ) -{ - fence(); - - if ( size ) { size += REDUCE_TEAM_BASE ; } - - const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; - - if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; - - if ( ( s_threads_reduce_size < size ) || - ( 0 == size && s_threads_reduce_size ) ) { - - verify_is_process( "ThreadsExec::resize_reduce_scratch" , true ); - - s_threads_reduce_size = size ; - - execute_serial( & execute_reduce_resize ); - - s_threads_process.m_reduce = s_threads_exec[0]->m_reduce ; - } -} - -void ThreadsExec::resize_shared_scratch( size_t size ) -{ - fence(); - - const size_t rem = size % Kokkos::Impl::MEMORY_ALIGNMENT ; - - if ( rem ) size += Kokkos::Impl::MEMORY_ALIGNMENT - rem ; - - if ( s_threads_shared_size < size || ( 0 == size && s_threads_shared_size ) ) { - - verify_is_process( "ThreadsExec::resize_shared_scratch" , true ); - - s_threads_shared_size = size ; - - execute_serial( & execute_shared_resize ); - - for ( unsigned i = 0 ; i < s_threads_count ; ) { - ThreadsExec & team_th = * s_threads_exec[i] ; - - for ( int j = 0 ; j < team_th.m_team_size ; ++j , ++i ) { - s_threads_exec[i]->m_shared = team_th.m_shared ; - } - } - - s_threads_process.m_shared = s_threads_exec[0]->m_shared ; - } -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::print_configuration( std::ostream & s , const bool detail ) -{ - verify_is_process("ThreadsExec::print_configuration",false); - - fence(); - - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - -#if defined( KOKKOS_HAVE_HWLOC ) - s << "macro KOKKOS_HAVE_HWLOC : defined" << std::endl ; -#endif -#if defined( KOKKOS_HAVE_PTHREAD ) - s << "macro KOKKOS_HAVE_PTHREAD : defined" << std::endl ; -#endif - - s << "Kokkos::Threads hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ; - - if ( s_threads_exec[0] ) { - s << " team_league[" << s_threads_exec[0]->m_init_league_size << "x" << s_threads_exec[0]->m_team_size << "]" ; - if ( 0 == s_threads_process.m_team_size ) { s << " Asynchronous" ; } - s << " ReduceScratch[" << s_threads_reduce_size << "]" - << " SharedScratch[" << s_threads_shared_size << "]" ; - s << std::endl ; - - if ( detail ) { - - execute_serial( & execute_get_binding ); - - for ( unsigned i = 0 ; i < s_threads_count ; ++i ) { - ThreadsExec * const th = s_threads_exec[i] ; - s << " Thread hwloc(" - << s_threads_coord[i].first << "," - << s_threads_coord[i].second << ")" ; - - s_threads_coord[i].first = ~0u ; - s_threads_coord[i].second = ~0u ; - - if ( th ) { - s << " rank(" << th->m_init_league_rank << "." 
<< th->m_team_rank << ")" ; - if ( th->m_fan_size ) { - s << " Fan ranks" ; - for ( int j = 0 ; j < th->m_fan_size ; ++j ) { - s << " (" << th->m_fan[j]->m_init_league_rank << "." << th->m_fan[j]->m_team_rank << ")" ; - } - } - } - s << std::endl ; - } - } - } - else { - s << " not initialized" << std::endl ; - } -} - -//---------------------------------------------------------------------------- - -int ThreadsExec::league_max() -{ return std::numeric_limits::max(); } - -int ThreadsExec::team_max() -{ return s_threads_exec[0] ? s_threads_exec[0]->m_team_size : 1 ; } - -//---------------------------------------------------------------------------- - -int ThreadsExec::is_initialized() -{ return 0 != s_threads_exec[0] ; } - -void ThreadsExec::initialize( - const std::pair team_topology , - std::pair use_core_topology ) -{ - static const Sentinel sentinel ; - - verify_is_process("ThreadsExec::initialize",false); - - std::ostringstream msg ; - - msg << "Kokkos::Threads::initialize(" - << " team_topology(" << team_topology.first << "," << team_topology.second << ")" - << ", use_core_topology(" << use_core_topology.first << "," << use_core_topology.second << ")" - << " )" ; - - if ( s_threads_count ) { - msg << " FAILED : Already initialized" ; - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - const unsigned thread_count = team_topology.first * team_topology.second ; - - if ( 0 == thread_count ) { - msg << " FAILED : zero thread count" ; - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - //------------------------------------ - // Query hardware topology and capacity, if available. - - const bool hwloc_avail = Kokkos::hwloc::available(); - - const std::pair - hwloc_core_topo( Kokkos::hwloc::get_available_numa_count() , - Kokkos::hwloc::get_available_cores_per_numa() ); - - std::pair master_coord = Kokkos::hwloc::get_this_thread_coordinate(); - bool asynchronous = false ; - - if ( hwloc_avail && 1 < thread_count ) { - - if ( 0 == use_core_topology.first && 0 == use_core_topology.second ) { - use_core_topology = Kokkos::hwloc::use_core_topology( thread_count ); - } - - if ( use_core_topology.first < hwloc_core_topo.first ) { - // Can omit a (NUMA) group of cores and execute work asynchronously - // on the other groups. - - Kokkos::hwloc::thread_mapping( team_topology , use_core_topology , hwloc_core_topo , s_threads_coord ); - - // Don't use master thread's first core coordinate (NUMA region). - // Originally mapped: - // begin = hwloc_core_topo.first - use_core_topology.first ; - // end = hwloc_core_topo.first ; - // So can decrement. - - for ( unsigned i = 0 ; i < thread_count ; ++i ) { - if ( s_threads_coord[i].first <= master_coord.first ) { - --( s_threads_coord[i].first ); - } - } - - asynchronous = true ; - } - else if ( use_core_topology.second < hwloc_core_topo.second ) { - // Can omit a core from each group and execute work asynchronously - - Kokkos::hwloc::thread_mapping( team_topology , use_core_topology , hwloc_core_topo , s_threads_coord ); - - // Threads' coordinates are in the range - // 0 <= numa_begin = hwloc_core_topo.first - use_core_topology.first - // 1 <= numa_end = hwloc_core_topo.first - // 1 <= core_begin = hwloc_core_topo.second - use_core_topology.second - // 1 <= core_end = hwloc_core_topo.second - // - // range: ( [numa_begin,numa_end) , [core_begin,core_end) ) - // - // Force master thread onto the highest rank unused core of its current numa region. 
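      // (Editor's illustrative note, not in the original file.)
      // Example: with hwloc_core_topo.second = 4 cores per NUMA region and
      // use_core_topology.second = 3 cores used per region, worker threads
      // occupy cores [1,4), leaving core 0 free; the master thread is placed
      // on the highest-rank unused core, ( 4 - 3 ) - 1 = 0, by the assignment below.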
- // - master_coord.second = ( hwloc_core_topo.second - use_core_topology.second ) - 1 ; - - asynchronous = true ; - } - else { - // Spawn threads with root thread on the master process' core - - Kokkos::hwloc::thread_mapping( team_topology , use_core_topology , hwloc_core_topo , master_coord , s_threads_coord ); - - s_threads_coord[0] = std::pair( ~0u , ~0u ); - } - } - - //------------------------------------ - // Spawn threads - - { - const unsigned thread_spawn_begin = asynchronous ? 0 : 1 ; - unsigned thread_spawn_failed = 0 ; - - s_threads_count = thread_count ; - s_current_function = & execute_function_noop ; // Initialization work function - - // If not fully utilizing the capacity then spawn threads for asynchronous execution. - - for ( unsigned i = thread_spawn_begin ; i < thread_count ; ++i ) { - - s_threads_process.m_state = ThreadsExec::Inactive ; - - // If hwloc available then spawned thread will choose its own rank, - // otherwise specify the rank. - s_current_function_arg = (void*)( hwloc_avail ? ~0u : i ); - - // Spawn thread executing the 'driver()' function. - // Wait until spawned thread has attempted to initialize. - // If spawning and initialization is successfull then - // an entry in 's_threads_exec' will be assigned. - if ( ThreadsExec::spawn() ) { - wait_yield( s_threads_process.m_state , ThreadsExec::Inactive ); - } - } - - // Wait for all spawned threads to deactivate before zeroing the function. - - for ( unsigned i = thread_spawn_begin ; i < thread_count ; ++i ) { - // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[i] ; - if ( th ) { - wait_yield( th->m_state , ThreadsExec::Active ); - } - else { - ++thread_spawn_failed ; - } - } - - s_current_function = 0 ; - s_current_function_arg = 0 ; - - if ( thread_spawn_failed ) { - - s_threads_count = 0 ; - - msg << " FAILED " << thread_spawn_failed << " attempts to spawn threads" ; - - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - if ( 1 < thread_count ) { Kokkos::hwloc::bind_this_thread( master_coord ); } - - // Clear master thread data. - // The master thread will be unused or initialized - // as part of the thread pool. - - s_threads_process.m_team_rank = 0 ; - s_threads_process.m_team_size = 0 ; - s_threads_process.m_init_league_rank = 0 ; - s_threads_process.m_init_league_size = 0 ; - s_threads_process.m_init_thread_rank = 0 ; - s_threads_process.m_init_thread_size = 0 ; - s_threads_process.m_work_league_rank = 0 ; - s_threads_process.m_work_league_end = 0 ; - s_threads_process.m_work_league_size = 0 ; - s_threads_process.m_state = ThreadsExec::Inactive ; - - if ( thread_spawn_begin ) { - s_threads_exec[0] = & s_threads_process ; // Include the master thread in pool. - } - } - - //------------------------------------ - // Initialize team topology and fan-in/out relationships: - - s_threads_process.m_init_league_size = team_topology.first ; - - ThreadsExec::set_threads_relationships( team_topology ); - - // Initial allocations: - ThreadsExec::resize_reduce_scratch( 4096 - REDUCE_TEAM_BASE ); - ThreadsExec::resize_shared_scratch( 4096 ); -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::finalize() -{ - verify_is_process("ThreadsExec::finalize",false); - - fence(); - - resize_reduce_scratch(0); - resize_shared_scratch(0); - - const unsigned begin = s_threads_process.m_team_size ? 
1 : 0 ; - - for ( unsigned i = s_threads_count ; begin < i-- ; ) { - - if ( s_threads_exec[i] ) { - - s_threads_exec[i]->m_state = ThreadsExec::Terminating ; - - wait_yield( s_threads_process.m_state , ThreadsExec::Inactive ); - - s_threads_process.m_state = ThreadsExec::Inactive ; - } - } - - if ( s_threads_process.m_team_size ) { - ( & s_threads_process )->~ThreadsExec(); - s_threads_exec[0] = 0 ; - } - - Kokkos::hwloc::unbind_this_thread(); - - s_threads_count = 0 ; - - // Reset master thread to run solo. - s_threads_process.m_team_rank = 0 ; - s_threads_process.m_team_size = 1 ; - s_threads_process.m_init_league_rank = 0 ; - s_threads_process.m_init_league_size = 1 ; - s_threads_process.m_init_thread_rank = 0 ; - s_threads_process.m_init_thread_size = 1 ; - - s_threads_process.m_work_league_rank = 0 ; - s_threads_process.m_work_league_end = 1 ; - s_threads_process.m_work_league_size = 1 ; - s_threads_process.m_state = ThreadsExec::Inactive ; -} - -//---------------------------------------------------------------------------- - -} /* namespace Impl */ -} /* namespace Kokkos */ - - diff --git a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp deleted file mode 100644 index ec7cd02..0000000 --- a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ /dev/null @@ -1,557 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP - -#include - -#include -#include - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- - -template< class > struct ThreadsExecAdapter ; - -//---------------------------------------------------------------------------- - -class ThreadsExec { -public: - - // Fan array has log_2(NT) reduction threads plus 2 scan threads - // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { Terminating ///< Termination in progress - , Inactive ///< Exists, waiting for work - , Active ///< Exists, performing work - , Rendezvous ///< Exists, waiting in a barrier or reduce - - , ScanAvailable - , ReductionAvailable - }; - -private: - - friend class Kokkos::Threads ; - - // Fan-in operations' root is the highest ranking thread - // to place the 'scan' reduction intermediate values on - // the threads that need them. - // For a simple reduction the thread location is arbitrary. - - /** \brief Reduction memory reserved for team reductions */ - enum { REDUCE_TEAM_BASE = 512 }; - - void * m_reduce ; ///< Reduction memory - void * m_shared ; ///< Team-shared memory - int m_shared_end ; ///< End of team-shared memory - int m_shared_iter ; ///< Current offset for team-shared memory - int volatile m_state ; ///< State for global synchronizations - int volatile m_state_team ; ///< State for team synchronizations - int m_fan_size ; - int m_fan_team_size ; - - int m_team_rank ; - int m_team_size ; - int m_init_league_rank ; - int m_init_league_size ; - int m_init_thread_rank ; - int m_init_thread_size ; - - int m_work_league_rank ; - int m_work_league_end ; - int m_work_league_size ; - - ThreadsExec * m_fan[ MAX_FAN_COUNT ] ; - ThreadsExec * m_fan_team[ MAX_FAN_COUNT ] ; - - static void global_lock(); - static void global_unlock(); - static bool spawn(); - - static void execute_sleep( ThreadsExec & , const void * ); - static void execute_reduce_resize( ThreadsExec & , const void * ); - static void execute_shared_resize( ThreadsExec & , const void * ); - static void execute_get_binding( ThreadsExec & , const void * ); - - ThreadsExec( const ThreadsExec & ); - ThreadsExec & operator = ( const ThreadsExec & ); - - static void execute_serial( void (*)( ThreadsExec & , const void * ) ); - - inline void * reduce_team() const { return m_reduce ; } - -public: - - static int get_thread_count(); - static ThreadsExec * get_thread( const int init_thread_rank ); - - inline void * reduce_base() const { return ((unsigned char *) m_reduce) + REDUCE_TEAM_BASE ; } - - static void driver(void); - - ~ThreadsExec(); - ThreadsExec(); - - static void set_threads_relationships( const std::pair team_topo ); - - static void resize_reduce_scratch( size_t ); - static void resize_shared_scratch( size_t ); - - static void * root_reduce_scratch(); - - static bool is_process(); - - static void verify_is_process( const std::string & , const bool initialized ); - - static int is_initialized(); - - static void initialize( const std::pair team_topo , - std::pair core_topo ); - - static void finalize(); - - static void print_configuration( std::ostream & , const 
bool detail = false ); - - //------------------------------------ - - static void wait_yield( volatile int & , const int ); - - //------------------------------------ - // All-thread functions: - - inline - std::pair< size_t , size_t > - work_range( const size_t work_count ) const - { - typedef integral_constant< size_t , VECTOR_LENGTH - 1 > work_mask ; - - // work per thread rounded up and aligned to vector length: - - const size_t work_per_thread = - ( ( ( work_count + m_init_thread_size - 1 ) / m_init_thread_size ) + work_mask::value ) & ~(work_mask::value); - - const size_t work_begin = std::min( work_count , work_per_thread * m_init_thread_rank ); - const size_t work_end = std::min( work_count , work_per_thread + work_begin ); - - return std::pair< size_t , size_t >( work_begin , work_end ); - } - - template< class Functor > - inline - void fan_in_reduce( const Functor & f ) const - { - typedef ReduceAdapter< Functor > Reduce ; - - for ( int i = 0 ; i < m_fan_size ; ++i ) { - - ThreadsExec & fan = *m_fan[i] ; - - Impl::spinwait( fan.m_state , ThreadsExec::Active ); - - f.join( Reduce::reference( reduce_base() ) , - Reduce::reference( fan.reduce_base() ) ); - } - } - - inline - void fan_in() const - { - for ( int i = 0 ; i < m_fan_size ; ++i ) { - Impl::spinwait( m_fan[i]->m_state , ThreadsExec::Active ); - } - } - - template< class FunctorType > - inline - void scan_large( const FunctorType & f ) - { - // Sequence of states: - // 0) Active : entry and exit state - // 1) ReductionAvailable : reduction value available - // 1) Rendezvous : all reduction values available and copied - // 2) ScanAvailable : scan value available - - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::scalar_type scalar_type ; - - const bool not_root = m_init_thread_rank + 1 < m_init_thread_size ; - const unsigned count = Reduce::value_count( f ); - - scalar_type * const work_value = (scalar_type *) reduce_base(); - - //-------------------------------- - // Fan-in reduction with highest ranking thread as the root - for ( int i = 0 ; i < m_fan_size ; ++i ) { - ThreadsExec & fan = *m_fan[i]; - - // Wait: Active -> ReductionAvailable - Impl::spinwait( fan.m_state , ThreadsExec::Active ); - f.join( Reduce::reference( work_value ) , Reduce::reference( fan.reduce_base() ) ); - } - - // Copy reduction value to scan value before releasing from this phase. - for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; } - - if ( not_root ) { - m_state = ThreadsExec::ReductionAvailable ; - // Wait: ReductionAvailable -> Rendezvous - Impl::spinwait( m_state , ThreadsExec::ReductionAvailable ); - } - - for ( int i = 0 ; i < m_fan_size ; ++i ) { - m_fan[i]->m_state = ThreadsExec::Rendezvous ; - } - - // All non-root threads are now in the Rendezvous state - //-------------------------------- - - if ( not_root ) { - - // Wait for contributing threads' scan value to be available. 
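The work_range() arithmetic above rounds the per-thread chunk up to a multiple of VECTOR_LENGTH and clamps both ends to the total work count, so the highest-ranked threads may receive a short or empty range. A minimal standalone sketch of that computation, with the thread rank, thread count, and vector length passed in explicitly (in the real class they are ThreadsExec members):

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <utility>

// Sketch of ThreadsExec::work_range(): per-thread work rounded up and
// aligned to the vector length, then clamped to the total work count.
std::pair<std::size_t, std::size_t>
sketch_work_range( std::size_t work_count ,
                   std::size_t thread_rank ,
                   std::size_t thread_size ,
                   std::size_t vector_length = 8 /* assumed power of two */ )
{
  const std::size_t mask = vector_length - 1 ;

  const std::size_t work_per_thread =
    ( ( ( work_count + thread_size - 1 ) / thread_size ) + mask ) & ~mask ;

  const std::size_t begin = std::min( work_count , work_per_thread * thread_rank );
  const std::size_t end   = std::min( work_count , begin + work_per_thread );

  return std::pair<std::size_t,std::size_t>( begin , end );
}

int main()
{
  // 100 work items over 4 threads with vector length 8:
  // threads 0..2 get 32 items each, thread 3 gets the remaining 4.
  for ( std::size_t rank = 0 ; rank < 4 ; ++rank ) {
    const std::pair<std::size_t,std::size_t> r = sketch_work_range( 100 , rank , 4 );
    std::printf( "thread %zu : [%zu,%zu)\n" , rank , r.first , r.second );
  }
  return 0 ;
}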
- if ( m_fan[ m_fan_size ] ) { - ThreadsExec & th = *m_fan[ m_fan_size ] ; - - // Wait: Rendezvous -> ScanAvailable - Impl::spinwait( th.m_state , ThreadsExec::Rendezvous ); - - f.join( Reduce::reference( work_value + count ) , - Reduce::reference( ((scalar_type *)th.reduce_base()) + count ) ); - } - - m_state = ThreadsExec::ScanAvailable ; - } - - //-------------------------------- - - if ( m_fan[ m_fan_size + 1 ] ) { - ThreadsExec & th = *m_fan[ m_fan_size + 1 ] ; // Not the root thread - - // Wait: Rendezvous -> ScanAvailable - Impl::spinwait( th.m_state , ThreadsExec::Rendezvous ); - - const scalar_type * const src_value = ((scalar_type *)th.reduce_base()) + count ; - - for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; } - - th.m_state = ThreadsExec::Active ; // Release the source thread - } - else { - f.init( Reduce::reference( work_value ) ); - } - - // Wait for scan value to be claimed before exiting. - Impl::spinwait( m_state , ThreadsExec::ScanAvailable ); - } - - template< class FunctorType > - inline - void scan_small( const FunctorType & f ) - { - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::scalar_type scalar_type ; - - const bool not_root = m_init_thread_rank + 1 < m_init_thread_size ; - const unsigned count = Reduce::value_count( f ); - - scalar_type * const work_value = (scalar_type *) reduce_base(); - - //-------------------------------- - // Fan-in reduction with highest ranking thread as the root - for ( int i = 0 ; i < m_fan_size ; ++i ) { - // Wait: Active -> Rendezvous - Impl::spinwait( m_fan[i]->m_state , ThreadsExec::Active ); - } - - for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; } - - if ( not_root ) { - m_state = ThreadsExec::Rendezvous ; - // Wait: Rendezvous -> Active - Impl::spinwait( m_state , ThreadsExec::Rendezvous ); - } - else { - // Root thread does the thread-scan before releasing threads - - scalar_type * ptr_prev = 0 ; - - for ( int rank = 0 ; rank < m_init_thread_size ; ++rank ) { - scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_base(); - if ( rank ) { - for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; } - f.join( Reduce::reference( ptr + count ), Reduce::reference( ptr ) ); - } - else { - f.init( Reduce::reference( ptr ) ); - } - ptr_prev = ptr ; - } - } - - for ( int i = 0 ; i < m_fan_size ; ++i ) { - m_fan[i]->m_state = ThreadsExec::Active ; - } - } - - //------------------------------------ - // Team-only functions: - - void * get_shmem( const int size ); - - void team_barrier() - { - const bool not_root = m_team_rank + 1 < m_team_size ; - - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - Impl::spinwait( m_fan_team[i]->m_state , ThreadsExec::Active ); - } - if ( not_root ) { - m_state = Rendezvous ; - Impl::spinwait( m_state , ThreadsExec::Rendezvous ); - } - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - m_fan_team[i]->m_state = ThreadsExec::Active ; - } - } - - template< class ArgType > - inline - ArgType team_scan( const ArgType & value , ArgType * const global_accum = 0 ) - { - // Sequence of m_state_team states: - // 0) Inactive : entry and exit state - // 1) ReductionAvailable : reduction value available, waiting for scan value - // 2) ScanAvailable : reduction value available, scan value available - // 3) Rendezvous : broadcasting global iinter-team accumulation value - - // Make sure there is enough scratch space: - typedef typename if_c< 2 * sizeof(ArgType) < REDUCE_TEAM_BASE , ArgType , void >::type 
type ; - - const bool not_root = m_team_rank + 1 < m_team_size ; - type * const work_value = (type*) reduce_team(); - - // ThreadsExec::Inactive == m_state_team - - work_value[0] = value ; - - // Fan-in reduction, wait for source thread to complete it's fan-in reduction. - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - ThreadsExec & th = *m_fan_team[i]; - - // Wait for source thread to exit Inactive state. - Impl::spinwait( th.m_state_team , ThreadsExec::Inactive ); - // Source thread is 'ReductionAvailable' or 'ScanAvailable' - work_value[0] += ((volatile type*)th.reduce_team())[0]; - } - - work_value[1] = work_value[0] ; - - if ( not_root ) { - - m_state_team = ThreadsExec::ReductionAvailable ; // Reduction value is available. - - // Wait for contributing threads' scan value to be available. - if ( m_fan_team[ m_fan_team_size ] ) { - ThreadsExec & th = *m_fan_team[ m_fan_team_size ] ; - - // Wait: Inactive -> ReductionAvailable - Impl::spinwait( th.m_state_team , ThreadsExec::Inactive ); - // Wait: ReductionAvailable -> ScanAvailable: - Impl::spinwait( th.m_state_team , ThreadsExec::ReductionAvailable ); - - work_value[1] += ((volatile type*)th.reduce_team())[1] ; - } - - m_state_team = ThreadsExec::ScanAvailable ; // Scan value is available. - } - else { - // Root thread add team's total to global inter-team accumulation - work_value[0] = global_accum ? atomic_fetch_add( global_accum , work_value[0] ) : 0 ; - } - - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - ThreadsExec & th = *m_fan_team[i]; - // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait( th.m_state_team , ThreadsExec::ReductionAvailable ); - // Wait: ScanAvailable -> Rendezvous - Impl::spinwait( th.m_state_team , ThreadsExec::ScanAvailable ); - } - - // All fan-in threads are in the ScanAvailable state - if ( not_root ) { - m_state_team = ThreadsExec::Rendezvous ; - Impl::spinwait( m_state_team , ThreadsExec::Rendezvous ); - } - - // Broadcast global inter-team accumulation value - volatile type & global_val = work_value[0] ; - for ( int i = 0 ; i < m_fan_team_size ; ++i ) { - ThreadsExec & th = *m_fan_team[i]; - ((volatile type*)th.reduce_team())[0] = global_val ; - th.m_state_team = ThreadsExec::Inactive ; - } - // Exclusive scan, subtract contributed value - return global_val + work_value[1] - value ; - } - - /* When a functor using the 'device' interface requests - * more teams than are initialized the parallel operation - * must loop over a range of league ranks with a team_barrier - * between each iteration. - */ - bool team_work_avail() - { - m_shared_iter = 0 ; - return m_work_league_rank < m_work_league_end ; - } - - void team_work_next() - { if ( ++m_work_league_rank < m_work_league_end ) team_barrier(); } - - //------------------------------------ - /** \brief Wait for previous asynchronous functor to - * complete and release the Threads device. - * Acquire the Threads device and start this functor. 
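team_scan() above returns an exclusive prefix: every thread receives the sum of the contributions of lower-ranked team members, offset by whatever the root thread fetched from the optional global inter-team accumulator. A tiny serial sketch of that contract, with the loop standing in for the fan-in/fan-out protocol and all values illustrative:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  const std::vector<int> contribution = { 3 , 1 , 4 , 1 , 5 }; // one value per team rank
  const int global_accum = 100 ;                               // prior inter-team total

  int running = 0 ;
  for ( std::size_t rank = 0 ; rank < contribution.size() ; ++rank ) {
    const int exclusive_prefix = global_accum + running ;      // what team_scan returns
    std::printf( "rank %zu : exclusive prefix = %d\n" , rank , exclusive_prefix );
    running += contribution[rank] ;
  }

  // In the real implementation the root thread would also add the team
  // total ('running') to the global accumulator via atomic_fetch_add.
  return 0 ;
}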
- */ - static void start( void (*)( ThreadsExec & , const void * ) , const void * , int = 0 ); - - static int league_max(); - static int team_max(); - - static int in_parallel(); - static void fence(); - static bool sleep(); - static bool wake(); -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -inline int Threads::in_parallel() -{ return Impl::ThreadsExec::in_parallel(); } - -inline int Threads::is_initialized() -{ return Impl::ThreadsExec::is_initialized(); } - -inline void Threads::initialize( - unsigned team_count , - unsigned threads_per_team , - unsigned use_numa_count , - unsigned use_cores_per_numa ) -{ - Impl::ThreadsExec::initialize( - std::pair( team_count , threads_per_team ), - std::pair( use_numa_count , use_cores_per_numa ) ); -} - -inline void Threads::finalize() -{ - Impl::ThreadsExec::finalize(); -} - -inline void Threads::print_configuration( std::ostream & s , const bool detail ) -{ - Impl::ThreadsExec::print_configuration( s , detail ); -} - -inline unsigned Threads::league_max() -{ return Impl::ThreadsExec::league_max() ; } - -inline unsigned Threads::team_max() -{ return Impl::ThreadsExec::team_max() ; } - -inline bool Threads::sleep() -{ return Impl::ThreadsExec::sleep() ; } - -inline bool Threads::wake() -{ return Impl::ThreadsExec::wake() ; } - -inline void Threads::fence() -{ Impl::ThreadsExec::fence() ; } - -inline int Threads::league_rank() const -{ return m_exec.m_work_league_rank ; } - -inline int Threads::league_size() const -{ return m_exec.m_work_league_size ; } - -inline int Threads::team_rank() const -{ return m_exec.m_team_rank ; } - -inline int Threads::team_size() const -{ return m_exec.m_team_size ; } - -inline void Threads::team_barrier() -{ return m_exec.team_barrier(); } - -inline Threads::Threads( Impl::ThreadsExec & t ) : m_exec( t ) {} - -template< typename Type > -inline Type Threads::team_scan( const Type & value ) -{ return m_exec.team_scan( value ); } - -template< typename TypeLocal , typename TypeGlobal > -inline TypeGlobal Threads::team_scan( const TypeLocal & value , TypeGlobal * const global_accum ) -{ return m_exec.template team_scan< TypeGlobal >( value , global_accum ); } - -inline -void * Threads::get_shmem( const int size ) { return m_exec.get_shmem( size ); } - -} /* namespace Kokkos */ - -#endif /* #define KOKKOS_THREADSEXEC_HPP */ - diff --git a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp deleted file mode 100644 index 1e7cb0f..0000000 --- a/kokkos/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp +++ /dev/null @@ -1,247 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#if defined( KOKKOS_HAVE_PTHREAD ) - -/* Standard 'C' Linux libraries */ - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- - -namespace { - -pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ; - -// Pthreads compatible driver: - -void * internal_pthread_driver( void * ) -{ - ThreadsExec::driver(); - - return NULL ; -} - -} // namespace - -//---------------------------------------------------------------------------- -// Spawn a thread - -bool ThreadsExec::spawn() -{ - bool result = false ; - - pthread_attr_t attr ; - - if ( 0 == pthread_attr_init( & attr ) || - 0 == pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) || - 0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) { - - pthread_t pt ; - - result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 ); - } - - pthread_attr_destroy( & attr ); - - return result ; -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() -{ - static const pthread_t master_pid = pthread_self(); - - return pthread_equal( master_pid , pthread_self() ); -} - -void ThreadsExec::global_lock() -{ - pthread_mutex_lock( & host_internal_pthread_mutex ); -} - -void ThreadsExec::global_unlock() -{ - pthread_mutex_unlock( & host_internal_pthread_mutex ); -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::wait_yield( volatile int & flag , const int value ) -{ - while ( value == flag ) { sched_yield(); } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#elif defined( KOKKOS_HAVE_WINTHREAD ) - -/* Windows libraries */ -#include -#include - 
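The pthread branch above spawns each worker as a detached, system-scope thread that immediately enters ThreadsExec::driver(). A self-contained sketch of that spawn pattern, with a stand-in driver function and the attribute calls chained with &&:

#include <pthread.h>
#include <unistd.h>
#include <cstdio>

// Stand-in for ThreadsExec::driver(); the real driver spins on its
// state word waiting for work.
static void * sketch_driver( void * )
{
  std::printf( "worker thread running\n" );
  return NULL ;
}

static bool sketch_spawn()
{
  bool result = false ;

  pthread_attr_t attr ;

  // System scope + detached state, the same attributes requested above.
  if ( 0 == pthread_attr_init( & attr ) &&
       0 == pthread_attr_setscope( & attr , PTHREAD_SCOPE_SYSTEM ) &&
       0 == pthread_attr_setdetachstate( & attr , PTHREAD_CREATE_DETACHED ) ) {

    pthread_t pt ;

    result = 0 == pthread_create( & pt , & attr , sketch_driver , NULL );
  }

  pthread_attr_destroy( & attr );

  return result ;
}

int main()
{
  if ( ! sketch_spawn() ) { std::printf( "spawn failed\n" ); }
  sleep( 1 ); // crude wait so the detached worker can print before main exits
  return 0 ;
}

Compile and link with -lpthread.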
-//---------------------------------------------------------------------------- -// Driver for each created pthread - -namespace Kokkos { -namespace Impl { -namespace { - -unsigned WINAPI internal_winthread_driver( void * arg ) -{ - ThreadsExec::driver(); - - return 0 ; -} - -class ThreadLockWindows { -private: - CRITICAL_SECTION m_handle ; - - ~ThreadLockWindows() - { DeleteCriticalSection( & m_handle ); } - - ThreadLockWindows(); - { InitializeCriticalSection( & m_handle ); } - - ThreadLockWindows( const ThreadLockWindows & ); - ThreadLockWindows & operator = ( const ThreadLockWindows & ); - -public: - - static ThreadLockWindows & singleton(); - - void lock() - { EnterCriticalSection( & m_handle ); } - - void unlock() - { LeaveCriticalSection( & m_handle ); } -}; - -ThreadLockWindows & ThreadLockWindows::singleton() -{ static ThreadLockWindows self ; return self ; } - -} // namespace <> -} // namespace Kokkos -} // namespace Impl - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// Spawn this thread - -bool ThreadsExec::spawn() -{ - unsigned Win32ThreadID = 0 ; - - HANDLE handle = - _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID ); - - return ! handle ; -} - -bool ThreadsExec::is_process() { return true ; } - -void ThreadsExec::global_lock() -{ ThreadLockWindows::singleton().lock(); } - -void ThreadsExec::global_unlock() -{ ThreadLockWindows::singleton().unlock(); } - -void ThreadsExec::wait_yield( volatile int & flag , const int value ) {} -{ - while ( value == flag ) { Sleep(0); } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#else /* NO Threads */ - -namespace Kokkos { -namespace Impl { - -bool ThreadsExec::spawn() -{ - std::string msg("Kokkos::Threads ERROR : Attempting to spawn threads without configuring with a threading library. Try configuring with KOKKOS_HAVE_PTHREAD"); - throw std::runtime_error( msg ); - - return false ; -} - -bool ThreadsExec::is_process() { return true ; } -void ThreadsExec::global_lock() {} -void ThreadsExec::global_unlock() {} -void ThreadsExec::wait_yield( volatile int & , const int ) {} - -} // namespace Impl -} // namespace Kokkos - -#endif /* End thread model */ - diff --git a/kokkos/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/kokkos/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp deleted file mode 100644 index 184237f..0000000 --- a/kokkos/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ /dev/null @@ -1,422 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_THREADS_PARALLEL_HPP -#define KOKKOS_THREADS_PARALLEL_HPP - -#include - -#include -#include - -#include - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- - -template< class FunctorType , class WorkSpec > -class ParallelFor< FunctorType , WorkSpec , Kokkos::Threads > -{ -public: - - const FunctorType m_func ; - const size_t m_work ; - - static void execute( ThreadsExec & exec , const void * arg ) - { - const ParallelFor & self = * ((const ParallelFor *) arg ); - - const std::pair work = exec.work_range( self.m_work ); - - for ( size_t iwork = work.first ; iwork < work.second ; ++iwork ) { - self.m_func( iwork ); - } - - exec.fan_in(); - } - - ParallelFor( const FunctorType & functor , const size_t work ) - : m_func( functor ), m_work( work ) - { - ThreadsExec::start( & ParallelFor::execute , this ); - ThreadsExec::fence(); - } - - inline void wait() {} - - inline ~ParallelFor() { wait(); } -}; - -template< class FunctorType > -class ParallelFor< FunctorType , ParallelWorkRequest , Kokkos::Threads > -{ -public: - - const FunctorType m_func ; - - static void execute( ThreadsExec & exec , const void * arg ) - { - const ParallelFor & self = * ((const ParallelFor *) arg ); - - for ( ; exec.team_work_avail() ; exec.team_work_next() ) { - self.m_func( Threads( exec ) ); - } - - exec.fan_in(); - } - - ParallelFor( const FunctorType & functor , const ParallelWorkRequest & work ) - : m_func( functor ) - { - ThreadsExec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); - ThreadsExec::start( & ParallelFor::execute , this , work.league_size ); - ThreadsExec::fence(); - } - - inline void wait() {} - - inline ~ParallelFor() { wait(); } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template< class FunctorType , class WorkSpec > -class ParallelReduce< FunctorType , WorkSpec , Kokkos::Threads > -{ -public: - - typedef ReduceAdapter< FunctorType > Reduce ; - 
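The range-based ParallelFor specialization above simply calls m_func( iwork ) for every index in the thread's work range, so any functor with a const call operator taking the work index fits. An illustrative sketch under that assumption; AxpyFunctor and its members are hypothetical names, and the serial loop stands in for the per-thread traversal:

#include <cstddef>
#include <cstdio>
#include <vector>

struct AxpyFunctor {
  double alpha ;
  const std::vector<double> & x ;
  std::vector<double>       & y ;

  // Call signature expected by the range-based ParallelFor::execute() above.
  void operator()( std::size_t i ) const { y[i] += alpha * x[i]; }
};

int main()
{
  std::vector<double> x( 8 , 1.0 ), y( 8 , 2.0 );
  const AxpyFunctor f = { 0.5 , x , y };

  // ParallelFor would hand each thread a [begin,end) slice of this range.
  for ( std::size_t i = 0 ; i < x.size() ; ++i ) { f( i ); }

  std::printf( "y[0] = %g\n" , y[0] );  // expect 2.5
  return 0 ;
}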
typedef typename Reduce::pointer_type pointer_type ; - - const FunctorType m_func ; - const size_t m_work ; - - static void execute( ThreadsExec & exec , const void * arg ) - { - const ParallelReduce & self = * ((const ParallelReduce *) arg ); - - typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); - - self.m_func.init( update ); // Initialize thread-local value - - const std::pair work = exec.work_range( self.m_work ); - - for ( size_t iwork = work.first ; iwork < work.second ; ++iwork ) { - self.m_func( iwork , update ); - } - - exec.fan_in_reduce( self.m_func ); - } - - ParallelReduce( const FunctorType & functor , - const size_t work , - const pointer_type result_ptr = 0 ) - : m_func( functor ), m_work( work ) - { - ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); - - ThreadsExec::start( & ParallelReduce::execute , this ); - - const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); - - ThreadsExec::fence(); - - Reduce::final( m_func , data ); - - if ( result_ptr ) { - const unsigned n = Reduce::value_count( m_func ); - for ( unsigned i = 0 ; i < n ; ++i ) { result_ptr[i] = data[i]; } - } - } - - inline void wait() {} - - inline ~ParallelReduce() { wait(); } -}; - -template< class FunctorType > -class ParallelReduce< FunctorType , ParallelWorkRequest , Kokkos::Threads > -{ -public: - - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - const FunctorType m_func ; - - static void execute( ThreadsExec & exec , const void * arg ) - { - const ParallelReduce & self = * ((const ParallelReduce *) arg ); - - typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); - - self.m_func.init( update ); // Initialize thread-local value - - for ( ; exec.team_work_avail() ; exec.team_work_next() ) { - self.m_func( Threads( exec ) , update ); - } - - exec.fan_in_reduce( self.m_func ); - } - - ParallelReduce( const FunctorType & functor , - const ParallelWorkRequest & work , - const pointer_type result_ptr = 0 ) - : m_func( functor ) - { - ThreadsExec::resize_shared_scratch( FunctorShmemSize< FunctorType >::value( functor ) ); - ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); - - ThreadsExec::start( & ParallelReduce::execute , this , work.league_size ); - - const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); - - ThreadsExec::fence(); - - Reduce::final( m_func , data ); - - if ( result_ptr ) { - const unsigned n = Reduce::value_count( m_func ); - for ( unsigned i = 0 ; i < n ; ++i ) { result_ptr[i] = data[i]; } - } - } - - inline void wait() {} - - inline ~ParallelReduce() { wait(); } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -struct ThreadsExecUseScanSmall { - size_t nwork ; - operator size_t () const { return nwork ; } - ThreadsExecUseScanSmall( size_t n ) : nwork( n ) {} -}; - -template< class FunctorType , class WorkSpec > -class ParallelScan< FunctorType , WorkSpec , Kokkos::Threads > -{ -public: - - typedef ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - const FunctorType m_func ; - const size_t m_work ; - - static void execute( ThreadsExec & exec , const void * arg ) - { - const ParallelScan & self = * ((const ParallelScan *) arg ); - - const std::pair work = exec.work_range( self.m_work ); - - typename Reduce::reference_type update = 
Reduce::reference( exec.reduce_base() ); - - self.m_func.init( update ); - - for ( size_t iwork = work.first ; iwork < work.second ; ++iwork ) { - self.m_func( iwork , update , false ); - } - - // Compile time selection of scan algorithm to support unit testing - // of both large and small thread count algorithms. - if ( ! is_same< WorkSpec , ThreadsExecUseScanSmall >::value ) { - exec.scan_large( self.m_func ); - } - else { - exec.scan_small( self.m_func ); - } - - for ( size_t iwork = work.first ; iwork < work.second ; ++iwork ) { - self.m_func( iwork , update , true ); - } - - exec.fan_in(); - } - - ParallelScan( const FunctorType & functor , const size_t nwork ) - : m_func( functor ) - , m_work( nwork ) - { - ThreadsExec::resize_reduce_scratch( 2 * Reduce::value_size( m_func ) ); - ThreadsExec::start( & ParallelScan::execute , this ); - ThreadsExec::fence(); - } - - inline void wait() {} - - inline ~ParallelScan() { wait(); } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template<> -class MultiFunctorParallelReduce< Threads > { -private: - - struct MemberBase { - virtual void init( Impl::ThreadsExec & ) const = 0 ; - virtual void exec( Impl::ThreadsExec & ) const = 0 ; - virtual void fan_in_reduce( Impl::ThreadsExec & ) const = 0 ; - virtual void output( void * ) const = 0 ; - virtual ~MemberBase() {} - }; - - template< class FunctorType > - struct Member : public MemberBase { - typedef Impl::ReduceAdapter< FunctorType > Reduce ; - typedef typename Reduce::pointer_type pointer_type ; - - const FunctorType m_func ; - const size_t m_work ; - - ~Member() {} - - Member( const FunctorType & func , const size_t work ) - : m_func( func ), m_work( work ) - { - Impl::ThreadsExec::resize_reduce_scratch( Reduce::value_size( m_func ) ); - } - - void init( Impl::ThreadsExec & exec ) const - { m_func.init( Reduce::reference( exec.reduce_base() ) ); } - - void exec( Impl::ThreadsExec & exec ) const - { - typename Reduce::reference_type update = Reduce::reference( exec.reduce_base() ); - - const std::pair work = exec.work_range( m_work ); - - for ( size_t iwork = work.first ; iwork < work.second ; ++iwork ) { - m_func( iwork , update ); - } - } - - void fan_in_reduce( Impl::ThreadsExec & exec ) const - { exec.fan_in_reduce( m_func ); } - - void output( void * ptr ) const - { - const pointer_type result = (pointer_type) ptr ; - const pointer_type data = (pointer_type) Impl::ThreadsExec::root_reduce_scratch(); - - Impl::ThreadsExec::fence(); - - Reduce::final( m_func , data ); - - if ( result ) { - const unsigned n = Reduce::value_count( m_func ); - for ( unsigned i = 0 ; i < n ; ++i ) { result[i] = data[i]; } - } - } - }; - - std::vector< MemberBase * > m_members ; - - static void execute_members( Impl::ThreadsExec & exec , const void * arg ) - { - const MultiFunctorParallelReduce & self = * ((const MultiFunctorParallelReduce *) arg ); - - // First functor initializes: - - self.m_members.front()->init( exec ); // Initialize thread-local value - - for ( unsigned i = 0 ; i < self.m_members.size() ; ++i ) { - self.m_members[i]->exec( exec ); - } - - // Last functor fan-in reduce: - - self.m_members.back()->fan_in_reduce( exec ); - } - -public: - - inline - void execute( void * host_ptr ) const - { - if ( ! 
m_members.empty() ) { - Impl::ThreadsExec::start( & MultiFunctorParallelReduce::execute_members , this ); - m_members.back()->output( host_ptr ); - } - } - - inline - void wait() const {} - - template< class FunctorType > - void push_back( const size_t work_count , const FunctorType & f ) - { - MemberBase * const m = new Member< FunctorType >( f , work_count ); - m_members.push_back( m ); - } - - ~MultiFunctorParallelReduce() - { - while ( ! m_members.empty() ) { - delete m_members.back(); - m_members.pop_back(); - } - } -}; - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */ - diff --git a/kokkos/kokkos/core/src/build.cuda.mac b/kokkos/kokkos/core/src/build.cuda.mac deleted file mode 100755 index 8c94550..0000000 --- a/kokkos/kokkos/core/src/build.cuda.mac +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/bash - -touch KokkosCore_config.h - -#flags="-I../ -I./ -I../../../TPL -c -O3 -arch=sm_30 -Xcompiler -fPIC -DKOKKOS_HAVE_CUDA -DKOKKOS_HAVE_PTHREAD --compiler-bindir=/Users/mhoemme/pkg/gcc-4.7.2/bin" -flags="-I../ -I./ -I../../../TPL -c -O3 -arch=sm_30 -Xcompiler -fPIC -DKOKKOS_HAVE_CUDA -DKOKKOS_HAVE_PTHREAD" -CC=nvcc -cd Cuda -rm *.o -$CC $flags Kokkos_Cuda_Impl.cu -$CC $flags Kokkos_CudaSpace.cu -cd .. -cd impl -rm *.o -$CC $flags Kokkos_hwloc.cpp -$CC $flags Kokkos_MemoryTracking.cpp -$CC $flags Kokkos_Shape.cpp -$CC $flags Kokkos_Error.cpp -$CC $flags Kokkos_HostSpace.cpp -$CC $flags Kokkos_Serial.cpp -cd .. -cd Threads -rm *.o -$CC $flags Kokkos_ThreadsExec.cpp -$CC $flags Kokkos_ThreadsExec_base.cpp -cd .. -$CC -arch=sm_35 -lib -o libkokkoscore-cuda.a Cuda/*.o impl/*.o Threads/*.o - diff --git a/kokkos/kokkos/core/src/build_common.sh b/kokkos/kokkos/core/src/build_common.sh deleted file mode 100755 index 8051609..0000000 --- a/kokkos/kokkos/core/src/build_common.sh +++ /dev/null @@ -1,271 +0,0 @@ -#!/bin/bash - -#----------------------------------------------------------------------------- -# Shared portion of build script for the base Kokkos functionality -# Simple build script with options -#----------------------------------------------------------------------------- -if [ ! -d "${KOKKOS}" \ - -o ! -d "${KOKKOS}/src" \ - -o ! -d "${KOKKOS}/src/impl" \ - -o ! -d "${KOKKOS}/src/Cuda" \ - -o ! -d "${KOKKOS}/src/OpenMP" \ - -o ! 
-d "${KOKKOS}/src/Threads" \ - ] ; -then -echo "Must set KOKKOS to the kokkos/core directory" -exit -1 -fi - -#----------------------------------------------------------------------------- - -INC_PATH="-I${KOKKOS}/src" -INC_PATH="${INC_PATH} -I${KOKKOS}/../TPL" - -#----------------------------------------------------------------------------- - -while [ -n "${1}" ] ; do - -ARG="${1}" -shift 1 - -case ${ARG} in -#----------- OPTIONS ----------- -OPT | opt | O3 | -O3 ) OPTFLAGS="${OPTFLAGS} -O3" ;; -#------------------------------- -DBG | dbg | g | -g ) KOKKOS_EXPRESSION_CHECK=1 ;; -#------------------------------- -HWLOC | hwloc ) KOKKOS_HAVE_HWLOC=${1} ; shift 1 ;; -#------------------------------- -MPI | mpi ) - KOKKOS_HAVE_MPI=${1} ; shift 1 - CXX="${KOKKOS_HAVE_MPI}/bin/mpicxx" - LINK="${KOKKOS_HAVE_MPI}/bin/mpicxx" - INC_PATH="${INC_PATH} -I${KOKKOS_HAVE_MPI}/include" - ;; -#------------------------------- -OMP | omp | OpenMP ) - KOKKOS_HAVE_OPENMP=1 - ;; -#------------------------------- -CUDA | Cuda | cuda ) - # CUDA_ARCH options: 20 30 35 - CUDA_ARCH=${1} ; shift 1 - # - # -x cu : process all files through the Cuda compiler as Cuda code. - # -lib -o : produce library - # - NVCC="nvcc -DKOKKOS_HAVE_CUDA_ARCH=${CUDA_ARCH}0 -gencode arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}" - NVCC="${NVCC} -maxrregcount=64" - NVCC="${NVCC} -Xcompiler -Wall,-ansi" - NVCC="${NVCC} -lib -o libCuda.a -x cu" - - NVCC_SOURCES="${NVCC_SOURCES} ${KOKKOS}/src/Cuda/*.cu" - LIB="${LIB} libCuda.a -L/usr/local/cuda/lib64 -lcudart -lcusparse" - ;;#------------------------------- -CUDA_OSX | Cuda_OSX | cuda_osx ) - # CUDA_ARCH options: 20 30 35 - CUDA_ARCH=${1} ; shift 1 - # - # -x cu : process all files through the Cuda compiler as Cuda code. - # -lib -o : produce library - # - NVCC="nvcc -DKOKKOS_HAVE_CUDA_ARCH=${CUDA_ARCH}0 -gencode arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH}" - NVCC="${NVCC} -maxrregcount=64" - NVCC="${NVCC} -Xcompiler -Wall,-ansi -Xcompiler -m64" - NVCC="${NVCC} -lib -o libCuda.a -x cu" - - NVCC_SOURCES="${NVCC_SOURCES} ${KOKKOS}/src/Cuda/*.cu" - LIB="${LIB} libCuda.a -Xlinker -rpath -Xlinker /Developer/NVIDIA/CUDA-5.5/lib -L /Developer/NVIDIA/CUDA-5.5/lib -lcudart -lcusparse" - ;; -#------------------------------- -GNU | gnu | g++ ) - # Turn on lots of warnings and ansi compliance. - # The Trilinos build system requires '-pedantic' - # - CXX="g++ -Wall -Wextra -ansi -pedantic" - LINK="g++" - CXX="${CXX} -rdynamic -DENABLE_TRACEBACK" - LIB="${LIB} -ldl" - ;; -#------------------------------- -GNU_OSX | gnu_osx | g++_osx ) - # Turn on lots of warnings and ansi compliance. 
- # The Trilinos build system requires '-pedantic' - # - CXX="g++ -Wall -Wextra -ansi -pedantic -m64" - LINK="g++" - CXX="${CXX} -DENABLE_TRACEBACK" - LIB="${LIB} -ldl" - ;; -#------------------------------- -INTEL | intel | icc | icpc ) - # -xW = use SSE and SSE2 instructions - CXX="icpc -Wall" - LINK="icpc" - LIB="${LIB} -lstdc++" - ;; -#------------------------------- -MPIINTEL | mpiintel | mpiicc | mpiicpc ) - # -xW = use SSE and SSE2 instructions - CXX="mpiicpc -Wall" - LINK="mpiicpc" - LIB="${LIB} -lstdc++" - KOKKOS_HAVE_MPI=1 -;; -#------------------------------- -MIC | mic ) - CXX="icpc -mmic -ansi-alias -Wall" - LINK="icpc -mmic" - CXX="${CXX} -mGLOB_default_function_attrs=knc_stream_store_controls=2" - # CXX="${CXX} -vec-report6" - # CXX="${CXX} -guide-vec" - LIB="${LIB} -lstdc++" - COMPILE_MIC="on" - ;; -#------------------------------- -MPIMIC | mpimic ) - CXX="mpiicpc -mmic -ansi-alias -Wall" - LINK="mpiicpc -mmic" - KOKKOS_HAVE_MPI=1 - CXX="${CXX} -mGLOB_default_function_attrs=knc_stream_store_controls=2" - # CXX="${CXX} -vec-report6" - # CXX="${CXX} -guide-vec" - LIB="${LIB} -lstdc++" - COMPILE_MIC="on" - ;; -#------------------------------- -curie ) - CXX="CC" - LINK="CC" - INC_PATH="${INC_PATH} -I/opt/cray/mpt/default/gni/mpich2-cray/74" - KOKKOS_HAVE_MPI=1 - ;; -#------------------------------- -MKL | mkl ) - HAVE_MKL=${1} ; shift 1 ; - CXX_FLAGS="${CXX_FLAGS} -DKOKKOS_USE_MKL -I${HAVE_MKL}/include/" - ARCH="intel64" - if [ -n "${COMPILE_MIC}" ] ; - then - ARCH="mic" - fi - LIB="${LIB} -L${HAVE_MKL}/lib/${ARCH}/ -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core" - NVCC_FLAGS="${NVCC_FLAGS} -DKOKKOS_USE_MKL" -;; -#------------------------------- -CUSPARSE | cusparse ) - CXX_FLAGS="${CXX_FLAGS} -DKOKKOS_USE_CUSPARSE" - NVCC_FLAGS="${NVCC_FLAGS} -DKOKKOS_USE_CUSPARSE" - LIB="${LIB} -lcusparse" -;; -#------------------------------- -AVX | avx ) - CXX_FLAGS="${CXX_FLAGS} -mavx" -;; -#------------------------------- -*) echo 'unknown option: ' ${ARG} ; exit -1 ;; -esac -done - -#----------------------------------------------------------------------------- - -if [ -z "${CXX}" ] ; -then - echo "No C++ compiler selected" - exit -1 -fi - -if [ -n "${KOKKOS_HAVE_OPENMP}" ] -then -CXX="${CXX} -fopenmp" -CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/OpenMP/*.cpp" -fi - -#----------------------------------------------------------------------------- -# Option for PTHREAD or WINTHREAD eventually - -KOKKOS_HAVE_PTHREAD=1 - -if [ -n "${KOKKOS_HAVE_PTHREAD}" ] ; -then - LIB="${LIB} -lpthread" -fi - -#----------------------------------------------------------------------------- -# Attach options to compile lines - -CXX="${CXX} ${OPTFLAGS}" - -if [ -n "${NVCC}" ] ; -then - NVCC="${NVCC} ${OPTFLAGS}" -fi - -#----------------------------------------------------------------------------- - -CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/impl/*.cpp" -CXX_SOURCES="${CXX_SOURCES} ${KOKKOS}/src/Threads/*.cpp" - -#----------------------------------------------------------------------------- -# - -if [ -n "${KOKKOS_HAVE_HWLOC}" ] ; -then - - if [ ! -d ${KOKKOS_HAVE_HWLOC} ] ; - then - echo "${KOKKOS_HAVE_HWLOC} does not exist" - exit 1 - fi - - echo "LD_LIBRARY_PATH must include ${KOKKOS_HAVE_HWLOC}/lib" - - LIB="${LIB} -L${KOKKOS_HAVE_HWLOC}/lib -lhwloc" - INC_PATH="${INC_PATH} -I${KOKKOS_HAVE_HWLOC}/include" -fi - -#----------------------------------------------------------------------------- - -INC_PATH="${INC_PATH} -I." 
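The remainder of the script (the hunk that follows) writes each selected option into KokkosCore_config.h as a KOKKOS_HAVE_* macro. A small sketch of how a translation unit typically guards on those macros once that header has been generated; KOKKOS_HAVE_WINTHREAD is included only for symmetry with Kokkos_ThreadsExec_base.cpp above, since this script always takes the pthread path:

#include "KokkosCore_config.h"
#include <cstdio>

int main()
{
#if defined( KOKKOS_HAVE_PTHREAD )
  std::printf( "host threading backend: pthreads\n" );
#elif defined( KOKKOS_HAVE_WINTHREAD )
  std::printf( "host threading backend: Windows threads\n" );
#else
  std::printf( "no host threading backend configured\n" );
#endif

#if defined( KOKKOS_HAVE_HWLOC )
  std::printf( "hwloc available for thread placement\n" );
#endif

#if defined( KOKKOS_HAVE_CUDA )
  std::printf( "CUDA backend enabled\n" );
#endif
  return 0 ;
}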
- -CONFIG="KokkosCore_config.h" - -rm -f ${CONFIG} - -echo "#ifndef KOKKOS_CORE_CONFIG_H" >> ${CONFIG} -echo "#define KOKKOS_CORE_CONFIG_H" >> ${CONFIG} - -if [ -n "${KOKKOS_HAVE_MPI}" ] ; -then - echo "#define KOKKOS_HAVE_MPI" >> ${CONFIG} -fi - -if [ -n "${NVCC}" ] ; -then - echo "#define KOKKOS_HAVE_CUDA" >> ${CONFIG} -fi - -if [ -n "${KOKKOS_HAVE_PTHREAD}" ] ; -then - echo "#define KOKKOS_HAVE_PTHREAD" >> ${CONFIG} -fi - -if [ -n "${KOKKOS_HAVE_HWLOC}" ] ; -then - echo "#define KOKKOS_HAVE_HWLOC" >> ${CONFIG} -fi - -if [ -n "${KOKKOS_HAVE_OPENMP}" ] ; -then - echo "#define KOKKOS_HAVE_OPENMP" >> ${CONFIG} -fi - -if [ -n "${KOKKOS_EXPRESSION_CHECK}" ] ; -then - echo "#define KOKKOS_EXPRESSION_CHECK" >> ${CONFIG} -fi - -echo "#endif" >> ${CONFIG} - -#----------------------------------------------------------------------------- - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp b/kokkos/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp deleted file mode 100644 index 51f4fbd..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp +++ /dev/null @@ -1,267 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_ANALYZESHAPE_HPP -#define KOKKOS_ANALYZESHAPE_HPP - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- - -/** \brief Analyze the array shape defined by a Kokkos::View data type. - * - * It is presumed that the data type can be mapped down to a multidimensional - * array of an intrinsic scalar numerical type (double, float, int, ... ). - * The 'value_type' of an array may be an embedded aggregate type such - * as a fixed length array 'Array'. In this case the 'scalar_type' - * is 'T' and the 'value_type' is 'Array' to enable data layout - * according to shape and scalar_type AND data access by value_type. - * - * The embedded aggregate type must have an AnalyzeShape specialization - * to map it down to a shape and intrinsic scalar numerical type. - */ - -template< class T > -struct AnalyzeShape : public Shape< sizeof(T) , 0 > -{ - typedef Shape< sizeof(T), 0 > shape ; - - typedef T scalar_type ; - typedef T array_type ; - typedef T value_type ; - typedef T type ; - typedef const T const_scalar_type ; - typedef const T const_array_type ; - typedef const T const_value_type ; - typedef const T const_type ; - typedef T non_const_scalar_type ; - typedef T non_const_array_type ; - typedef T non_const_value_type ; - typedef T non_const_type ; -}; - -template<> -struct AnalyzeShape : public Shape< 0 , 0 > -{ - typedef Shape< 0 , 0 > shape ; - - typedef void scalar_type ; - typedef void array_type ; - typedef void value_type ; - typedef void type ; - typedef const void const_scalar_type ; - typedef const void const_array_type ; - typedef const void const_value_type ; - typedef const void const_type ; - typedef void non_const_scalar_type ; - typedef void non_const_array_type ; - typedef void non_const_value_type ; - typedef void non_const_type ; -}; - -template< class T > -struct AnalyzeShape< const T > : public AnalyzeShape::shape -{ -private: - typedef AnalyzeShape nested ; -public: - - typedef typename nested::shape shape ; - - typedef typename nested::const_scalar_type scalar_type ; - typedef typename nested::const_array_type array_type ; - typedef typename nested::const_value_type value_type ; - typedef typename nested::const_type type ; - - typedef typename nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type const_array_type ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type const_type ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type non_const_array_type ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type non_const_type ; -}; - -template< class T > -struct AnalyzeShape< T * > - : public ShapeInsert< typename AnalyzeShape::shape , 0 >::type -{ -private: - typedef AnalyzeShape nested ; -public: - - typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; - - typedef typename nested::scalar_type scalar_type ; - typedef typename nested::array_type * array_type ; - typedef typename nested::value_type value_type ; - typedef typename nested::type * type ; - - typedef typename 
nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type * const_array_type ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type * const_type ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type * non_const_array_type ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type * non_const_type ; -}; - -template< class T > -struct AnalyzeShape< T[] > - : public ShapeInsert< typename AnalyzeShape::shape , 0 >::type -{ -private: - typedef AnalyzeShape nested ; -public: - - typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; - - typedef typename nested::scalar_type scalar_type ; - typedef typename nested::array_type array_type [] ; - typedef typename nested::value_type value_type ; - typedef typename nested::type type [] ; - - typedef typename nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type const_array_type [] ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type const_type [] ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type non_const_array_type [] ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type non_const_type [] ; -}; - -template< class T > -struct AnalyzeShape< const T[] > - : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type -{ -private: - typedef AnalyzeShape< const T > nested ; -public: - - typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ; - - typedef typename nested::scalar_type scalar_type ; - typedef typename nested::array_type array_type [] ; - typedef typename nested::value_type value_type ; - typedef typename nested::type type [] ; - - typedef typename nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type const_array_type [] ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type const_type [] ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type non_const_array_type [] ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type non_const_type [] ; -}; - -template< class T , unsigned N > -struct AnalyzeShape< T[N] > - : public ShapeInsert< typename AnalyzeShape::shape , N >::type -{ -private: - typedef AnalyzeShape nested ; -public: - - typedef typename ShapeInsert< typename nested::shape , N >::type shape ; - - typedef typename nested::scalar_type scalar_type ; - typedef typename nested::array_type array_type [N] ; - typedef typename nested::value_type value_type ; - typedef typename nested::type type [N] ; - - typedef typename nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type const_array_type [N] ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type const_type [N] ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type non_const_array_type [N] ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type non_const_type [N] ; -}; - -template< class T , unsigned N > 
-struct AnalyzeShape< const T[N] > - : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type -{ -private: - typedef AnalyzeShape< const T > nested ; -public: - - typedef typename ShapeInsert< typename nested::shape , N >::type shape ; - - typedef typename nested::scalar_type scalar_type ; - typedef typename nested::array_type array_type [N] ; - typedef typename nested::value_type value_type ; - typedef typename nested::type type [N] ; - - typedef typename nested::const_scalar_type const_scalar_type ; - typedef typename nested::const_array_type const_array_type [N] ; - typedef typename nested::const_value_type const_value_type ; - typedef typename nested::const_type const_type [N] ; - - typedef typename nested::non_const_scalar_type non_const_scalar_type ; - typedef typename nested::non_const_array_type non_const_array_type [N] ; - typedef typename nested::non_const_value_type non_const_value_type ; - typedef typename nested::non_const_type non_const_type [N] ; -}; - -} // namespace Impl -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp deleted file mode 100644 index 3fbc728..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ /dev/null @@ -1,160 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP ) -#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- -// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type). -// Must cast-away 'volatile' for the CAS call. - -#if defined( KOKKOS_ATOMICS_USE_CUDA ) - -KOKKOS_INLINE_FUNCTION -int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) -{ return atomicCAS((int*)dest,compare,val); } - -KOKKOS_INLINE_FUNCTION -unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val) -{ return atomicCAS((unsigned int*)dest,compare,val); } - -KOKKOS_INLINE_FUNCTION -unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest , - const unsigned long long int compare , - const unsigned long long int val ) -{ return atomicCAS((unsigned long long int*)dest,compare,val); } - -template < typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_compare_exchange( volatile T * const dest , const T compare , const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - typedef typename union_type::second_type int_type ; - - return union_type( atomicCAS( (int_type *) union_type::cast( dest ) , - union_type::cast( compare ) , - union_type::cast( val ) ) - ).first ; -} - -//---------------------------------------------------------------------------- -// GCC native CAS supports int, long, unsigned int, unsigned long. -// Intel native CAS support int and long with the same interface as GCC. - -#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) - -KOKKOS_INLINE_FUNCTION -int atomic_compare_exchange( volatile int * const dest, const int compare, const int val) -{ return __sync_val_compare_and_swap(dest,compare,val); } - -KOKKOS_INLINE_FUNCTION -long atomic_compare_exchange( volatile long * const dest, const long compare, const long val ) -{ return __sync_val_compare_and_swap(dest,compare,val); } - -#if defined( KOKKOS_ATOMICS_USE_GCC ) - -// GCC supports unsigned - -KOKKOS_INLINE_FUNCTION -unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val ) -{ return __sync_val_compare_and_swap(dest,compare,val); } - -KOKKOS_INLINE_FUNCTION -unsigned long atomic_compare_exchange( volatile unsigned long * const dest , - const unsigned long compare , - const unsigned long val ) -{ return __sync_val_compare_and_swap(dest,compare,val); } - -#endif - -template < typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_compare_exchange( volatile T * const dest, const T compare, const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - - return union_type( - __sync_val_compare_and_swap( union_type::cast( dest ) , - union_type::cast( compare ) , - union_type::cast( val ) ) - ).first ; -} - -//---------------------------------------------------------------------------- - -#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) - -template< typename T > -KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange( volatile T * const dest, const T compare, const T val ) -{ - T retval; -#pragma omp critical - { - retval = dest[0]; - if ( retval == compare ) - dest[0] = val; - } - return retval; -} - -#endif - - -template -KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val) 
-{ - return compare == atomic_compare_exchange(dest, compare, val); -} - -//---------------------------------------------------------------------------- - -} // namespace Kokkos - -#endif - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp deleted file mode 100644 index 8d4965e..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ /dev/null @@ -1,128 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) -#define KOKKOS_ATOMIC_EXCHANGE_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined( KOKKOS_ATOMICS_USE_CUDA ) - -KOKKOS_INLINE_FUNCTION -int atomic_exchange( volatile int * const dest , const int val ) -{ return atomicExch( (int*) dest , val ); } - -KOKKOS_INLINE_FUNCTION -unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val ) -{ return atomicExch( (unsigned int*) dest , val ); } - -KOKKOS_INLINE_FUNCTION -unsigned long long atomic_exchange( volatile unsigned long long * const dest , const unsigned long long val ) -{ return atomicExch( (unsigned long long*) dest , val ); } - -template < typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_exchange( volatile T * const dest , const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - typedef typename union_type::second_type type ; - - return union_type( atomicExch( (type *) union_type::cast( dest ) , - union_type::cast( val ) ) - ).first ; -} - -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) - -template< typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_exchange( volatile T * const dest , const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - - union_type assumed , old ; - - old.first = *dest ; - do { - assumed.second = old.second ; - old.second = __sync_val_compare_and_swap( union_type::cast( dest ), - assumed.second , - union_type::cast( val ) ); - } while ( assumed.second != old.second ); - - return old.first ; -} - -//---------------------------------------------------------------------------- - -#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) - -template < typename T > -KOKKOS_INLINE_FUNCTION -T atomic_exchange( volatile T * const dest , const T val ) -{ - T retval; -#pragma omp critical - { - retval = dest[0]; - dest[0] = val; - } - return retval; -} - -#endif - -//---------------------------------------------------------------------------- - -} // namespace Kokkos - -#endif - -//---------------------------------------------------------------------------- - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp deleted file mode 100644 index 7411c28..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ /dev/null @@ -1,162 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) -#define KOKKOS_ATOMIC_FETCH_ADD_HPP - -namespace Kokkos { - -//---------------------------------------------------------------------------- - -#if defined( KOKKOS_ATOMICS_USE_CUDA ) - -// Support for int, unsigned int, unsigned long long int, and float - -KOKKOS_INLINE_FUNCTION -int atomic_fetch_add( volatile int * const dest , const int val ) -{ return atomicAdd((int*)dest,val); } - -KOKKOS_INLINE_FUNCTION -unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) -{ return atomicAdd((unsigned int*)dest,val); } - -KOKKOS_INLINE_FUNCTION -unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest , - const unsigned long long int val ) -{ return atomicAdd((unsigned long long int*)dest,val); } - -KOKKOS_INLINE_FUNCTION -float atomic_fetch_add( volatile float * const dest , const float val ) -{ return atomicAdd((float*)dest,val); } - -template < typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_fetch_add( volatile T * const dest , const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - typedef typename union_type::second_type type ; - - union_type assumed , old , newval ; - - old.first = *dest ; - do { - assumed.second = old.second ; - newval.first = assumed.first + val ; - old.second = atomicCAS( (type *) union_type::cast( dest ), - assumed.second , - newval.second ); - } while ( assumed.second != old.second ); - - return old.first ; -} - -//---------------------------------------------------------------------------- - -#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL) - -KOKKOS_INLINE_FUNCTION -int atomic_fetch_add( volatile int * const dest , const int val ) -{ return __sync_fetch_and_add(dest,val); } - -KOKKOS_INLINE_FUNCTION -long int atomic_fetch_add( volatile long int * const dest , const long int val ) -{ return __sync_fetch_and_add(dest,val); } - -#if defined( KOKKOS_ATOMICS_USE_GCC ) - -KOKKOS_INLINE_FUNCTION -unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val ) -{ return __sync_fetch_and_add(dest,val); } - -KOKKOS_INLINE_FUNCTION -unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val ) -{ return __sync_fetch_and_add(dest,val); } - -#endif - -template < 
typename T > -KOKKOS_INLINE_FUNCTION -typename Kokkos::Impl::UnionPair::first_type -atomic_fetch_add( volatile T * const dest , const T val ) -{ - typedef Kokkos::Impl::UnionPair union_type ; - - union_type assumed , old , newval ; - - old.first = *dest ; - do { - assumed.second = old.second ; - newval.first = assumed.first + val ; - old.second = __sync_val_compare_and_swap( union_type::cast( dest ), - assumed.second , - newval.second ); - } while ( assumed.second != old.second ); - - return old.first ; -} - -//---------------------------------------------------------------------------- - -#elif defined( KOKKOS_ATOMICS_USE_OMP31 ) - -template< typename T > -T atomic_fetch_add( volatile T * const dest , const T val ) -{ - T retval; -#pragma omp critical - { - retval = dest[0]; - dest[0] += val; - } - return retval; -} - -#endif - -//---------------------------------------------------------------------------- - -} - -#endif - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp b/kokkos/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp deleted file mode 100644 index 825e6cf..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_CrsArray_factory.hpp +++ /dev/null @@ -1,211 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP -#define KOKKOS_IMPL_CRSARRAY_FACTORY_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror( const CrsArray & view ) -{ - // Force copy: - typedef Impl::ViewAssignment< Impl::LayoutDefault > alloc ; - typedef CrsArray< DataType , Arg1Type , Arg2Type , SizeType > crsarray_type ; - - typename crsarray_type::HostMirror tmp ; - typename crsarray_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map ); - - tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const' - tmp.entries = create_mirror( view.entries ); - - // Deep copy: - deep_copy( tmp_row_map , view.row_map ); - deep_copy( tmp.entries , view.entries ); - - return tmp ; -} - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const CrsArray & view , - typename Impl::enable_if< ViewTraits::is_hostspace >::type * = 0 ) -{ - return view ; -} - -template< class DataType , class Arg1Type , class Arg2Type , typename SizeType > -inline -typename CrsArray< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror -create_mirror_view( const CrsArray & view , - typename Impl::enable_if< ! ViewTraits::is_hostspace >::type * = 0 ) -{ - return create_mirror( view ); -} - - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class CrsArrayType , class InputSizeType > -inline -typename CrsArrayType::crsarray_type -create_crsarray( const std::string & label , - const std::vector< InputSizeType > & input ) -{ - typedef CrsArrayType output_type ; - typedef std::vector< InputSizeType > input_type ; - - typedef typename output_type::entries_type entries_type ; - - typedef View< typename output_type::size_type [] , - typename output_type::array_layout , - typename output_type::device_type > work_type ; - - output_type output ; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work( "tmp" , length + 1 ); - - typename work_type::HostMirror row_work_host = - create_mirror_view( row_work ); - - size_t sum = 0 ; - row_work_host[0] = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - row_work_host[i+1] = sum += input[i]; - } - - deep_copy( row_work , row_work_host ); - - output.entries = entries_type( label , sum ); - output.row_map = row_work ; - } - - return output ; -} - -//---------------------------------------------------------------------------- - -template< class CrsArrayType , class InputSizeType > -inline -typename CrsArrayType::crsarray_type -create_crsarray( const std::string & label , - const std::vector< std::vector< InputSizeType > > & input ) -{ - typedef CrsArrayType output_type ; - typedef std::vector< std::vector< InputSizeType > > input_type ; - typedef typename output_type::entries_type entries_type ; - typedef typename output_type::size_type size_type ; - - typedef typename - 
Impl::assert_shape_is_rank_one< typename entries_type::shape_type >::type - ok_rank ; - - typedef View< typename output_type::size_type [] , - typename output_type::array_layout , - typename output_type::device_type > work_type ; - - output_type output ; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work( "tmp" , length + 1 ); - - typename work_type::HostMirror row_work_host = - create_mirror_view( row_work ); - - size_t sum = 0 ; - row_work_host[0] = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - row_work_host[i+1] = sum += input[i].size(); - } - - deep_copy( row_work , row_work_host ); - - output.entries = entries_type( label , sum ); - output.row_map = row_work ; - } - - // Fill in the entries: - { - typename entries_type::HostMirror host_entries = - create_mirror_view( output.entries ); - - size_t sum = 0 ; - for ( size_t i = 0 ; i < length ; ++i ) { - for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) { - host_entries( sum ) = input[i][j] ; - } - } - - deep_copy( output.entries , host_entries ); - } - - return output ; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Error.cpp b/kokkos/kokkos/core/src/impl/Kokkos_Error.cpp deleted file mode 100644 index cf762ae..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Error.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos -// Manycore Performance-Portable Multidimensional Arrays -// -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -void throw_runtime_exception( const std::string & msg ) -{ - std::ostringstream o ; - o << msg ; - traceback_callstack( o ); - throw std::runtime_error( o.str() ); -} - - -std::string human_memory_size(size_t arg_bytes) -{ - double bytes = arg_bytes; - const double K = 1024; - const double M = K*1024; - const double G = M*1024; - - std::ostringstream out; - if (bytes < K) { - out << std::setprecision(4) << bytes << " B"; - } else if (bytes < M) { - bytes /= K; - out << std::setprecision(4) << bytes << " K"; - } else if (bytes < G) { - bytes /= M; - out << std::setprecision(4) << bytes << " M"; - } else { - bytes /= G; - out << std::setprecision(4) << bytes << " G"; - } - return out.str(); -} - -} -} - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK ) - -/* This is only known to work with GNU C++ - * Must be compiled with '-rdynamic' - * Must be linked with '-ldl' - */ - -/* Print call stack into an error stream, - * so one knows in which function the error occured. - * - * Code copied from: - * http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html - * - * License on this site: - * This blog is licensed under a - * Creative Commons Attribution-Share Alike 3.0 Unported License. - * - * http://creativecommons.org/licenses/by-sa/3.0/ - * - * Modified to output to std::ostream. - */ -#include -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void traceback_callstack( std::ostream & msg ) -{ - using namespace abi; - - enum { MAX_DEPTH = 32 }; - - void *trace[MAX_DEPTH]; - Dl_info dlinfo; - - int status; - - int trace_size = backtrace(trace, MAX_DEPTH); - - msg << std::endl << "Call stack {" << std::endl ; - - for (int i=1; i -#include - -namespace Kokkos { -namespace Impl { - -void throw_runtime_exception( const std::string & ); - -void traceback_callstack( std::ostream & ); - -std::string human_memory_size(size_t arg_bytes); - -} -} - -#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/kokkos/kokkos/core/src/impl/Kokkos_HostSpace.cpp deleted file mode 100644 index 487271c..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ /dev/null @@ -1,275 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace { - -class HostMemoryTrackingEntry : public Impl::MemoryTrackingEntry -{ -public: - - void * const ptr_alloc ; - - HostMemoryTrackingEntry( const std::string & arg_label , - const std::type_info & arg_info , - void * const arg_ptr , - const unsigned arg_size ) - : Impl::MemoryTrackingEntry( arg_label , arg_info , arg_ptr , arg_size ) - , ptr_alloc( arg_ptr ) - {} - - ~HostMemoryTrackingEntry(); -}; - -HostMemoryTrackingEntry::~HostMemoryTrackingEntry() -{ -#if defined( __INTEL_COMPILER ) - _mm_free( ptr_alloc ); -#else - free( ptr_alloc ); -#endif -} - -Impl::MemoryTracking & host_space_singleton() -{ - static Impl::MemoryTracking self("Kokkos::HostSpace"); - return self ; -} - -} // namespace -} // namespade Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void * host_allocate_not_thread_safe( - const std::string & label , - const std::type_info & scalar_type , - const size_t scalar_size , - const size_t scalar_count ) -{ - void * ptr = 0 ; - - if ( 0 < scalar_size && 0 < scalar_count ) { - void * ptr_alloc = 0 ; - size_t count_alloc = scalar_count ; - -#if defined( __INTEL_COMPILER ) - - ptr = ptr_alloc = _mm_malloc( scalar_size * count_alloc , MEMORY_ALIGNMENT ); - -#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \ - ( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 ) - - posix_memalign( & ptr_alloc , MEMORY_ALIGNMENT , scalar_size * count_alloc ); - ptr = ptr_alloc ; - -#else - - // Over-allocate to guarantee enough aligned space. 
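The fallback code that continues just below implements this comment: it pads the malloc request by one alignment's worth of space and then rounds the returned pointer up to the next MEMORY_ALIGNMENT boundary. As a reading aid, here is a small standalone C++ sketch of that same rounding arithmetic using plain std::malloc; the helper names (aligned_alloc_fallback, raw_out) and the 64-byte alignment in the driver are illustrative assumptions, not part of the deleted HostSpace code.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Over-allocate, then round the raw pointer up to the next multiple of 'alignment'.
// The caller must free the raw pointer returned through 'raw_out', not the aligned one.
static void * aligned_alloc_fallback( std::size_t alignment , std::size_t bytes , void ** raw_out )
{
  void * const raw = std::malloc( bytes + alignment );   // extra room guarantees an aligned address exists
  if ( 0 == raw ) { *raw_out = 0 ; return 0 ; }

  const std::uintptr_t p = reinterpret_cast<std::uintptr_t>( raw );

  // Advance to the next alignment boundary; like the code below, this steps
  // forward even when 'raw' happens to be aligned already, which the padding allows.
  const std::uintptr_t aligned = p + ( alignment - p % alignment );

  *raw_out = raw ;
  return reinterpret_cast<void*>( aligned );
}

int main()
{
  void * raw = 0 ;
  void * const ptr = aligned_alloc_fallback( 64 , 1000 , & raw );
  std::printf( "aligned to 64 bytes? %d\n" ,
               int( 0 == reinterpret_cast<std::uintptr_t>( ptr ) % 64 ) );
  std::free( raw );
  return 0 ;
}

The deleted code expresses the same padding in units of scalar_size (count_alloc += ( MEMORY_ALIGNMENT + scalar_size - 1 ) / scalar_size) rather than in raw bytes, but the pointer-rounding step is the same.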
- - count_alloc += ( MEMORY_ALIGNMENT + scalar_size - 1 ) / scalar_size ; - - ptr_alloc = malloc( scalar_size * count_alloc ); - - ptr = static_cast(ptr_alloc) + - ( MEMORY_ALIGNMENT - reinterpret_cast(ptr_alloc) % MEMORY_ALIGNMENT ); - -#endif - - if ( ptr_alloc && ptr_alloc <= ptr && - 0 == ( reinterpret_cast(ptr) % MEMORY_ALIGNMENT ) ) { - host_space_singleton().insert( - new HostMemoryTrackingEntry( label , scalar_type , ptr_alloc , scalar_size * count_alloc ) ); - } - else { - std::ostringstream msg ; - msg << "Kokkos::Impl::host_allocate_not_thread_safe( " - << label - << " , " << scalar_type.name() - << " , " << scalar_size - << " , " << scalar_count - << " ) FAILED aligned memory allocation" ; - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - } - - return ptr ; -} - -void host_decrement_not_thread_safe( const void * ptr ) -{ - host_space_singleton().decrement( ptr ); -} - -DeepCopy::DeepCopy( void * dst , const void * src , size_t n ) -{ - memcpy( dst , src , n ); -} - -} -} - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace { - -static const int QUERY_DEVICE_IN_PARALLEL_MAX = 16 ; - -typedef int (* QueryDeviceInParallelPtr )(); - -QueryDeviceInParallelPtr s_in_parallel_query[ QUERY_DEVICE_IN_PARALLEL_MAX ] ; -int s_in_parallel_query_count = 0 ; - -} // namespace - -void HostSpace::register_in_parallel( int (*device_in_parallel)() ) -{ - if ( 0 == device_in_parallel ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) ); - } - - int i = -1 ; - - if ( ! (device_in_parallel)() ) { - for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i ); - } - - if ( i < s_in_parallel_query_count ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) ); - - } - - if ( QUERY_DEVICE_IN_PARALLEL_MAX <= i ) { - Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) ); - - } - - for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i ); - - if ( i == s_in_parallel_query_count ) { - s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ; - } -} - -int HostSpace::in_parallel() -{ - const int n = s_in_parallel_query_count ; - - int i = 0 ; - - while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; } - - return i < n ; -} - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { - -void * HostSpace::allocate( - const std::string & label , - const std::type_info & scalar_type , - const size_t scalar_size , - const size_t scalar_count ) -{ - if ( HostSpace::in_parallel() ) { - Kokkos::Impl::throw_runtime_exception( "Kokkos::HostSpace::allocate ERROR : called in parallel" ); - } - - void * const ptr = - Impl::host_allocate_not_thread_safe( label , scalar_type , scalar_size , scalar_count ); - - return ptr ; -} - -void HostSpace::increment( const void * ptr ) -{ - if ( ! HostSpace::in_parallel() ) { - host_space_singleton().increment( ptr ); - } -} - -void HostSpace::decrement( const void * ptr ) -{ - if ( ! 
HostSpace::in_parallel() ) { - Impl::host_decrement_not_thread_safe( ptr ); - } -} - -void HostSpace::print_memory_view( std::ostream & o ) -{ - host_space_singleton().print( o , std::string(" ") ); -} - -std::string HostSpace::query_label( const void * p ) -{ - const Impl::MemoryTrackingEntry * const info = - host_space_singleton().query( p ); - - return 0 != info ? info->label : std::string("ERROR NOT DEFINED"); -} - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp b/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp deleted file mode 100644 index 28aa65c..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { -namespace { - -//---------------------------------------------------------------------------- -// Fast search for result[-1] <= val < result[0]. -// Requires result[max] == upper_bound. -// Start with a binary search until the search range is -// less than LINEAR_LIMIT, then switch to linear search. 
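The comment above summarizes the lookup used throughout this file: m_tracking_end is kept sorted and terminated by a sentinel equal to the largest ptrdiff_t, so a binary search can narrow the window and a short linear scan can finish it without a bounds check. Below is a compilable paraphrase of the routine defined immediately after this sketch, together with a tiny driver; the name upper_bound_sketch and the sample data are assumptions made for illustration and do not appear in the deleted file.

#include <cstddef>
#include <cstdio>
#include <limits>

// Return the index i with begin[i-1] <= val < begin[i].
// Precondition: begin[length-1] is a sentinel larger than any queried 'val',
// which is what lets the final linear scan run without a length check.
static int upper_bound_sketch( const std::ptrdiff_t * const begin , unsigned length ,
                               const std::ptrdiff_t val )
{
  enum { LINEAR_LIMIT = 32 };

  const std::ptrdiff_t * first = begin ;

  // Binary search while the window is still large ...
  while ( LINEAR_LIMIT < length ) {
    const unsigned half = length >> 1 ;
    const std::ptrdiff_t * const middle = first + half ;
    if ( val < *middle ) { length = half ; }
    else { first = middle + 1 ; length -= half + 1 ; }
  }

  // ... then scan linearly; the sentinel guarantees termination.
  for ( ; ! ( val < *first ) ; ++first ) {}

  return int( first - begin );
}

int main()
{
  const std::ptrdiff_t ends[] = { 10 , 20 , 30 , std::numeric_limits<std::ptrdiff_t>::max() };
  std::printf( "%d %d %d\n" ,                      // prints "0 1 2"
               upper_bound_sketch( ends , 4 , 5 ) ,
               upper_bound_sketch( ends , 4 , 10 ) ,
               upper_bound_sketch( ends , 4 , 25 ) );
  return 0 ;
}

MemoryTracking uses the returned index to locate the tracked allocation whose end address is the first one greater than a queried pointer, and then verifies that the entry's begin address is not past that pointer.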
- -int upper_bound( const ptrdiff_t * const begin , unsigned length , - const ptrdiff_t val ) -{ - enum { LINEAR_LIMIT = 32 }; - - // precondition: begin[length-1] == std::numeric_limits::max() - - const ptrdiff_t * first = begin ; - - while ( LINEAR_LIMIT < length ) { - unsigned half = length >> 1 ; - const ptrdiff_t * middle = first + half ; - - if ( val < *middle ) { - length = half ; - } - else { - first = ++middle ; - length -= ++half ; - } - } - - for ( ; ! ( val < *first ) ; ++first ) {} - - return first - begin ; -} - -} // namespace - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -MemoryTracking::MemoryTracking( const std::string & space ) - : m_space( space ), m_tracking(), m_tracking_end() -{ - ptrdiff_t max = std::numeric_limits::max(); - void * const ptr = reinterpret_cast( max ); - - m_tracking.reserve(64); - m_tracking_end.reserve(64); - - // Sentinal value of end - - m_tracking.push_back( new MemoryTrackingEntry( "sentinal" , typeid(void) , ptr , 0 ) ); - m_tracking_end.push_back( max ); -} - -MemoryTracking::~MemoryTracking() -{ - const ptrdiff_t max = std::numeric_limits::max(); - - try { - if ( 1 < m_tracking.size() ) { - std::cerr << m_space << " destroyed with memory leaks:" << std::endl ; - print( std::cerr , std::string(" ") ); - } - else if ( 1 != m_tracking_end.size() || m_tracking_end.back() != max ) { - std::cerr << m_space << " corrupted data structure" << std::endl ; - } - } catch( ... ) {} -} - -void MemoryTracking::insert( MemoryTrackingEntry * entry ) -{ - const ptrdiff_t max = std::numeric_limits::max(); - - const bool ok_range = entry && - 0 < entry->begin && - entry->begin < entry->end && - entry->end < max ; - - int i = -1 ; - - if ( ok_range ) { - - i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , entry->begin ); - - // Guaranteed: - // a) entry->begin < m_tracking_end[i] - // b) i == 0 || m_tracking_end[i-1] <= entry->begin - - if ( entry->end <= m_tracking[i]->begin ) { - - // Non-overlapping range: - // m_tracking[i-1].end <= entry->begin < entry->end <= m_tracking[i].begin - - entry->m_count = 1 ; - - m_tracking.insert( m_tracking.begin() + i , entry ); - m_tracking_end.insert( m_tracking_end.begin() + i , entry->end ); - } - } - - if ( ! ok_range || -1 == i ) { - std::ostringstream msg ; - msg << "MemoryTracking(" << m_space << ")::insert( " ; - entry->print( msg ); - msg << " ) ERROR: " ; - - if ( ! 
ok_range ) { - msg << "Invalid memory range" ; - } - else { - msg << "Overlapping memory range with " ; - m_tracking[i]->print( msg ); - } - msg << " )" ; - throw_runtime_exception( msg.str() ); - } -} - -void MemoryTracking::increment( const void * ptr ) -{ - if ( ptr ) { - const ptrdiff_t p = reinterpret_cast( ptr ); - const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); - - if ( m_tracking[i]->begin <= p ) { - ++( m_tracking[i]->m_count ); - } - else { - std::ostringstream msg ; - msg << "MemoryTracking(" << m_space - << ")::increment( " << p << " ) ERROR: Not being tracked" ; - throw_runtime_exception( msg.str() ); - } - } -} - -void MemoryTracking::decrement( const void * ptr ) -{ - if ( ptr ) { - const ptrdiff_t p = reinterpret_cast( ptr ); - const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); - - if ( m_tracking[i]->begin <= p ) { - if ( 0 == --( m_tracking[i]->m_count ) ) { - - delete m_tracking[i] ; - - m_tracking.erase( m_tracking.begin() + i ); - m_tracking_end.erase( m_tracking_end.begin() + i ); - } - } - else { - std::ostringstream msg ; - msg << "MemoryTracking(" << m_space - << ")::decrement( " << p << " ) ERROR: Not being tracked" ; - throw_runtime_exception( msg.str() ); - } - } -} - -MemoryTrackingEntry * -MemoryTracking::query( const void * ptr ) const -{ - MemoryTrackingEntry * result = 0 ; - - if ( ptr ) { - const ptrdiff_t p = reinterpret_cast( ptr ); - - const int i = upper_bound( & m_tracking_end[0] , m_tracking_end.size() , p ); - - if ( m_tracking[i]->begin <= p ) result = m_tracking[i] ; - } - - return result ; -} - -void MemoryTracking::print( std::ostream & s , const std::string & lead ) const -{ - // Don't print the sentinal value: - const size_t n = m_tracking.size() - 1 ; - - for ( size_t i = 0 ; i < n ; ++i ) { - s << lead ; - m_tracking[i]->print( s ); - s << std::endl ; - } -} - -MemoryTrackingEntry::~MemoryTrackingEntry() -{} - -void MemoryTrackingEntry::print( std::ostream & s ) const -{ - s << "{ " - << "label(" << label << ") " - << "typeid(" << type.name() << ") " - << "range[ " << ((void*)begin) << " : " << ((void*)end) << " ) " - << "count(" << m_count << ") }" ; -} - -//---------------------------------------------------------------------------- - -} /* namespace Impl */ -} /* namespace Kokkos */ - - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp b/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp deleted file mode 100644 index 7e1bbfb..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_MemoryTracking.hpp +++ /dev/null @@ -1,147 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_MEMORY_TRACKING_HPP -#define KOKKOS_MEMORY_TRACKING_HPP - -#include -#include -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -class MemoryTracking ; - -class MemoryTrackingEntry { -public: - const std::string label ; - const std::type_info & type ; - const ptrdiff_t begin ; - const ptrdiff_t end ; -private: - unsigned m_count ; -protected: - - MemoryTrackingEntry( const std::string & arg_label , - const std::type_info & arg_type , - const void * const arg_begin , - const unsigned arg_bytes ) - : label( arg_label ) - , type( arg_type ) - , begin( reinterpret_cast( arg_begin ) ) - , end( reinterpret_cast( - reinterpret_cast( arg_begin ) + arg_bytes ) ) - , m_count( 0 ) - {} - -public: - - unsigned count() const { return m_count ; } - - virtual void print( std::ostream & ) const ; - - virtual ~MemoryTrackingEntry(); - -private: - - MemoryTrackingEntry(); - MemoryTrackingEntry( const MemoryTrackingEntry & rhs ); - MemoryTrackingEntry & operator = ( const MemoryTrackingEntry & rhs ); - - friend class MemoryTracking ; -}; - - -class MemoryTracking { -public: - - /** \brief Track a memory range defined by the entry. - * This entry must be allocated via 'new'. - */ - void insert( MemoryTrackingEntry * entry ); - - /** \brief Decrement the tracked memory range. - * If the count is zero then the entry is deleted - * via the 'delete' operator. - */ - void decrement( const void * ptr ); - - /** \brief Increment the tracking count. */ - void increment( const void * ptr ); - - /** \brief Query a tracked memory range. */ - MemoryTrackingEntry * query( const void * ptr ) const ; - - /** \brief Call the 'print' method on all entries. */ - void print( std::ostream & , const std::string & lead ) const ; - - size_t size() const { return m_tracking.size(); } - - template< typename iType > - MemoryTracking & operator[]( const iType & i ) const - { return *m_tracking[i]; } - - explicit MemoryTracking( const std::string & space ); - - /** \brief Print memory leak warning for all entries. 
*/ - ~MemoryTracking(); - -private: - MemoryTracking(); - MemoryTracking( const MemoryTracking & ); - MemoryTracking & operator = ( const MemoryTracking & ); - - std::string m_space ; - std::vector m_tracking ; - std::vector m_tracking_end ; -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/kokkos/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp deleted file mode 100644 index 68254df..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp +++ /dev/null @@ -1,82 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP -#define KOKKOS_PHYSICAL_LAYOUT_HPP - - -#include -namespace Kokkos { -namespace Impl { - - - -struct PhysicalLayout { - enum LayoutType {Left,Right,Scalar,Error}; - LayoutType layout_type; - int rank; - long long int stride[8]; //distance between two neighboring elements in a given dimension - - template< class T , class L , class D , class M > - PhysicalLayout( const View & view ) - : layout_type( is_same< typename View::array_layout , LayoutLeft >::value ? Left : ( - is_same< typename View::array_layout , LayoutRight >::value ? Right : Error )) - , rank( view.Rank ) - { - for(int i=0;i<8;i++) stride[i] = 0; - view.stride( stride ); - } - template< class T , class L , class D , class M > - PhysicalLayout( const View & view ) - : layout_type( is_same< typename View::array_layout , LayoutLeft >::value ? Left : ( - is_same< typename View::array_layout , LayoutRight >::value ? 
Right : Error )) - , rank( view.Rank ) - { - for(int i=0;i<8;i++) stride[i] = 0; - view.stride( stride ); - } -}; - -} -} -#endif diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Serial.cpp b/kokkos/kokkos/core/src/impl/Kokkos_Serial.cpp deleted file mode 100644 index ba302f9..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Serial.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace { - -struct Sentinel { - - void * m_reduce ; - unsigned m_reduce_size ; - - Sentinel() : m_reduce(0), m_reduce_size(0) {} - - ~Sentinel() { if ( m_reduce ) { free( m_reduce ); } } -}; - -} - -void * Serial::resize_reduce_scratch( unsigned size ) -{ - static Sentinel s ; - - const unsigned rem = size % Impl::MEMORY_ALIGNMENT ; - - if ( rem ) size += Impl::MEMORY_ALIGNMENT - rem ; - - if ( ( 0 == size ) || ( s.m_reduce_size < size ) ) { - - if ( s.m_reduce ) { free( s.m_reduce ); } - - s.m_reduce_size = size ; - - s.m_reduce = malloc( size ); - } - - return s.m_reduce ; -} - -} // namespace Kokkos - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Shape.cpp b/kokkos/kokkos/core/src/impl/Kokkos_Shape.cpp deleted file mode 100644 index e3bf5d3..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Shape.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - - -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -void assert_counts_are_equal_throw( - const unsigned x_count , - const unsigned y_count ) -{ - std::ostringstream msg ; - - msg << "Kokkos::Impl::assert_counts_are_equal_throw( " - << x_count << " != " << y_count << " )" ; - - throw_runtime_exception( msg.str() ); -} - -void assert_shapes_are_equal_throw( - const unsigned x_scalar_size , - const unsigned x_rank , - const unsigned x_N0 , const unsigned x_N1 , - const unsigned x_N2 , const unsigned x_N3 , - const unsigned x_N4 , const unsigned x_N5 , - const unsigned x_N6 , const unsigned x_N7 , - - const unsigned y_scalar_size , - const unsigned y_rank , - const unsigned y_N0 , const unsigned y_N1 , - const unsigned y_N2 , const unsigned y_N3 , - const unsigned y_N4 , const unsigned y_N5 , - const unsigned y_N6 , const unsigned y_N7 ) -{ - std::ostringstream msg ; - - msg << "Kokkos::Impl::assert_shape_are_equal_throw( {" - << " scalar_size(" << x_scalar_size - << ") rank(" << x_rank - << ") dimension(" ; - if ( 0 < x_rank ) { msg << " " << x_N0 ; } - if ( 1 < x_rank ) { msg << " " << x_N1 ; } - if ( 2 < x_rank ) { msg << " " << x_N2 ; } - if ( 3 < x_rank ) { msg << " " << x_N3 ; } - if ( 4 < x_rank ) { msg << " " << x_N4 ; } - if ( 5 < x_rank ) { msg << " " << x_N5 ; } - if ( 6 < x_rank ) { msg << " " << x_N6 ; } - if ( 7 < x_rank ) { msg << " " << x_N7 ; } - msg << " ) } != { " - << " scalar_size(" << y_scalar_size - << ") rank(" << y_rank - << ") dimension(" ; - if ( 0 < y_rank ) { msg << " " << y_N0 ; } - if ( 1 < y_rank ) { msg << " " << y_N1 ; } - if ( 2 < y_rank ) { msg << " " << y_N2 ; } - if ( 3 < y_rank ) { msg << " " << y_N3 ; } - if ( 4 < y_rank ) { msg << " " << y_N4 ; } - if ( 5 < y_rank ) { msg << " " << y_N5 ; } - if ( 6 < y_rank ) { msg << " " << y_N6 ; } - if ( 7 < y_rank ) { msg << " " << y_N7 ; } - msg << " ) } )" ; - - throw_runtime_exception( msg.str() ); -} - -void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply( - const size_t rank , - const size_t n0 , const size_t n1 , - const size_t n2 , const size_t n3 , - const size_t n4 , const size_t n5 , - const size_t n6 , const size_t n7 , - - const size_t arg_rank , - const size_t i0 , const size_t i1 , - const size_t i2 , const size_t i3 , - const size_t i4 , const size_t i5 , - const size_t i6 , const size_t i7 ) -{ - std::ostringstream msg ; - msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ; - if ( 0 < rank ) { msg << " " << n0 ; } - if ( 1 < rank ) { msg << " " << n1 ; } - if ( 2 < rank ) { msg << " " << n2 ; } - if ( 3 < rank ) { msg << " " << n3 ; } - if ( 4 < rank ) { msg << " " << n4 ; } - if ( 5 < rank ) { msg << " " << n5 ; } - if ( 6 < rank ) { msg << " " << n6 ; } - if ( 7 < rank ) { msg << " " << n7 ; } - msg << " } index = {" ; - if ( 0 < arg_rank ) { msg << " " << i0 ; } - if ( 1 < arg_rank ) { msg << " " << i1 ; } - if ( 2 < arg_rank ) { msg << " " << i2 ; } - if ( 3 < arg_rank ) { msg << " " << i3 ; } - if ( 4 < arg_rank ) { msg << " " << i4 ; } - if ( 5 < arg_rank ) { msg << " " << i5 ; } - if ( 6 < arg_rank ) { msg << " " << i6 ; } - if ( 7 < arg_rank ) { msg << " " << i7 ; } - msg << " } )" ; - - throw_runtime_exception( msg.str() ); -} - -void 
assert_shape_effective_rank1_at_leastN_throw( - const size_t x_rank , const size_t x_N0 , - const size_t x_N1 , const size_t x_N2 , - const size_t x_N3 , const size_t x_N4 , - const size_t x_N5 , const size_t x_N6 , - const size_t x_N7 , - const size_t N0 ) -{ - std::ostringstream msg ; - - msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ; - if ( 0 < x_rank ) { msg << " " << x_N0 ; } - if ( 1 < x_rank ) { msg << " " << x_N1 ; } - if ( 2 < x_rank ) { msg << " " << x_N2 ; } - if ( 3 < x_rank ) { msg << " " << x_N3 ; } - if ( 4 < x_rank ) { msg << " " << x_N4 ; } - if ( 5 < x_rank ) { msg << " " << x_N5 ; } - if ( 6 < x_rank ) { msg << " " << x_N6 ; } - if ( 7 < x_rank ) { msg << " " << x_N7 ; } - msg << " } N = " << N0 << " )" ; - - throw_runtime_exception( msg.str() ); -} - - - -} -} - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Shape.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Shape.hpp deleted file mode 100644 index f77e4e7..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Shape.hpp +++ /dev/null @@ -1,894 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_SHAPE_HPP -#define KOKKOS_SHAPE_HPP - -#include -#include -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief The shape of a Kokkos with dynamic and static dimensions. 
- * Dynamic dimensions are member values and static dimensions are - * 'static const' values. - * - * The upper bound on the array rank is eight. - */ -template< unsigned ScalarSize , - unsigned Rank , - unsigned s0 = 1 , - unsigned s1 = 1 , - unsigned s2 = 1 , - unsigned s3 = 1 , - unsigned s4 = 1 , - unsigned s5 = 1 , - unsigned s6 = 1 , - unsigned s7 = 1 > -struct Shape ; - -template< class ShapeType , class Layout > -struct ShapeMap ; - -//---------------------------------------------------------------------------- -/** \brief Shape equality if the value type, layout, and dimensions - * are equal. - */ -template< unsigned xSize , unsigned xRank , - unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , - unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , - - unsigned ySize , unsigned yRank , - unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , - unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > -KOKKOS_INLINE_FUNCTION -bool operator == ( const Shape & x , - const Shape & y ) -{ - enum { same_size = xSize == ySize }; - enum { same_rank = xRank == yRank }; - - return same_size && same_rank && - unsigned( x.N0 ) == unsigned( y.N0 ) && - unsigned( x.N1 ) == unsigned( y.N1 ) && - unsigned( x.N2 ) == unsigned( y.N2 ) && - unsigned( x.N3 ) == unsigned( y.N3 ) && - unsigned( x.N4 ) == unsigned( y.N4 ) && - unsigned( x.N5 ) == unsigned( y.N5 ) && - unsigned( x.N6 ) == unsigned( y.N6 ) && - unsigned( x.N7 ) == unsigned( y.N7 ) ; -} - -template< unsigned xSize , unsigned xRank , - unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , - unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , - - unsigned ySize ,unsigned yRank , - unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , - unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > -KOKKOS_INLINE_FUNCTION -bool operator != ( const Shape & x , - const Shape & y ) -{ return ! 
operator == ( x , y ); } - -//---------------------------------------------------------------------------- - -void assert_counts_are_equal_throw( - const unsigned x_count , - const unsigned y_count ); - -inline -void assert_counts_are_equal( - const unsigned x_count , - const unsigned y_count ) -{ - if ( x_count != y_count ) { - assert_counts_are_equal_throw( x_count , y_count ); - } -} - -void assert_shapes_are_equal_throw( - const unsigned x_scalar_size , - const unsigned x_rank , - const unsigned x_N0 , const unsigned x_N1 , - const unsigned x_N2 , const unsigned x_N3 , - const unsigned x_N4 , const unsigned x_N5 , - const unsigned x_N6 , const unsigned x_N7 , - - const unsigned y_scalar_size , - const unsigned y_rank , - const unsigned y_N0 , const unsigned y_N1 , - const unsigned y_N2 , const unsigned y_N3 , - const unsigned y_N4 , const unsigned y_N5 , - const unsigned y_N6 , const unsigned y_N7 ); - -template< unsigned xSize , unsigned xRank , - unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , - unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , - - unsigned ySize , unsigned yRank , - unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , - unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > -inline -void assert_shapes_are_equal( - const Shape & x , - const Shape & y ) -{ - typedef Shape x_type ; - typedef Shape y_type ; - - if ( x != y ) { - assert_shapes_are_equal_throw( - x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, - y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); - } -} - -template< unsigned xSize , unsigned xRank , - unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 , - unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 , - - unsigned ySize , unsigned yRank , - unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 , - unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 > -void assert_shapes_equal_dimension( - const Shape & x , - const Shape & y ) -{ - typedef Shape x_type ; - typedef Shape y_type ; - - // Omit comparison of scalar_size. - if ( unsigned( x.rank ) != unsigned( y.rank ) || - unsigned( x.N0 ) != unsigned( y.N0 ) || - unsigned( x.N1 ) != unsigned( y.N1 ) || - unsigned( x.N2 ) != unsigned( y.N2 ) || - unsigned( x.N3 ) != unsigned( y.N3 ) || - unsigned( x.N4 ) != unsigned( y.N4 ) || - unsigned( x.N5 ) != unsigned( y.N5 ) || - unsigned( x.N6 ) != unsigned( y.N6 ) || - unsigned( x.N7 ) != unsigned( y.N7 ) ) { - assert_shapes_are_equal_throw( - x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7, - y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 ); - } -} - -//---------------------------------------------------------------------------- - -template< class ShapeType > struct assert_shape_is_rank_zero ; -template< class ShapeType > struct assert_shape_is_rank_one ; - -template< unsigned Size > -struct assert_shape_is_rank_zero< Shape > - : public true_type {}; - -template< unsigned Size , unsigned s0 > -struct assert_shape_is_rank_one< Shape > - : public true_type {}; - -//---------------------------------------------------------------------------- - -/** \brief Array bounds assertion templated on the execution space - * to allow device-specific abort code. 
- */ -template< class ExecutionSpace > -struct AssertShapeBoundsAbort ; - -template<> -struct AssertShapeBoundsAbort< Kokkos::HostSpace > -{ - static void apply( const size_t rank , - const size_t n0 , const size_t n1 , - const size_t n2 , const size_t n3 , - const size_t n4 , const size_t n5 , - const size_t n6 , const size_t n7 , - const size_t arg_rank , - const size_t i0 , const size_t i1 , - const size_t i2 , const size_t i3 , - const size_t i4 , const size_t i5 , - const size_t i6 , const size_t i7 ); -}; - -template< class ExecutionDevice > -struct AssertShapeBoundsAbort -{ - KOKKOS_INLINE_FUNCTION - static void apply( const size_t rank , - const size_t n0 , const size_t n1 , - const size_t n2 , const size_t n3 , - const size_t n4 , const size_t n5 , - const size_t n6 , const size_t n7 , - const size_t arg_rank , - const size_t i0 , const size_t i1 , - const size_t i2 , const size_t i3 , - const size_t i4 , const size_t i5 , - const size_t i6 , const size_t i7 ) - { - AssertShapeBoundsAbort< Kokkos::HostSpace > - ::apply( rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 , - arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); - } -}; - -template< class ShapeType > -KOKKOS_INLINE_FUNCTION -void assert_shape_bounds( const ShapeType & shape , - const size_t arg_rank , - const size_t i0 , - const size_t i1 = 0 , - const size_t i2 = 0 , - const size_t i3 = 0 , - const size_t i4 = 0 , - const size_t i5 = 0 , - const size_t i6 = 0 , - const size_t i7 = 0 ) -{ - // Must supply at least as many indices as ranks. - // Every index must be within bounds. - const bool ok = ShapeType::rank <= arg_rank && - i0 < shape.N0 && - i1 < shape.N1 && - i2 < shape.N2 && - i3 < shape.N3 && - i4 < shape.N4 && - i5 < shape.N5 && - i6 < shape.N6 && - i7 < shape.N7 ; - - if ( ! 
ok ) { - AssertShapeBoundsAbort< ExecutionSpace > - ::apply( ShapeType::rank , - shape.N0 , shape.N1 , shape.N2 , shape.N3 , - shape.N4 , shape.N5 , shape.N6 , shape.N7 , - arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 ); - } -} - -#if defined( KOKKOS_EXPRESSION_CHECK ) -#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6); -#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7); -#else -#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */ -#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */ -#endif - - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -// Specialization and optimization for the Rank 0 shape. 
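[Editor's note] The KOKKOS_ASSERT_SHAPE_BOUNDS_* macros deleted just above expand to a call of assert_shape_bounds() only when KOKKOS_EXPRESSION_CHECK is defined, so index checking can be compiled out entirely in optimized builds. A minimal stand-alone sketch of that pattern follows; every name in it (ExampleShape, EXAMPLE_EXPRESSION_CHECK, etc.) is illustrative and is not part of the deleted header.

// Sketch of a compile-time-toggled bounds check, mirroring the
// KOKKOS_ASSERT_SHAPE_BOUNDS_* / KOKKOS_EXPRESSION_CHECK idiom above.
#include <sstream>
#include <stdexcept>
#include <cstddef>

struct ExampleShape { std::size_t N0, N1; };   // two dynamic extents

inline void example_assert_bounds(const ExampleShape& s,
                                  std::size_t i0, std::size_t i1)
{
  if (i0 >= s.N0 || i1 >= s.N1) {
    std::ostringstream msg;
    msg << "index (" << i0 << "," << i1 << ") out of bounds ("
        << s.N0 << "," << s.N1 << ")";
    throw std::runtime_error(msg.str());
  }
}

#if defined(EXAMPLE_EXPRESSION_CHECK)
#define EXAMPLE_ASSERT_BOUNDS_2(S, I0, I1) example_assert_bounds(S, I0, I1);
#else
#define EXAMPLE_ASSERT_BOUNDS_2(S, I0, I1) /* no-op when checking is disabled */
#endif

int main()
{
  ExampleShape s = { 4, 5 };
  EXAMPLE_ASSERT_BOUNDS_2(s, 3, 4)   // checked only if EXAMPLE_EXPRESSION_CHECK is defined
  return 0;
}

As in the deleted macros, the call site is written once and the cost of the check is decided at compile time.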
- -template < unsigned ScalarSize > -struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 0 }; - enum { rank = 0 }; - - enum { N0 = 1 }; - enum { N1 = 1 }; - enum { N2 = 1 }; - enum { N3 = 1 }; - enum { N4 = 1 }; - enum { N5 = 1 }; - enum { N6 = 1 }; - enum { N7 = 1 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - {} -}; - -//---------------------------------------------------------------------------- -// All-static dimension array - -template < unsigned ScalarSize , - unsigned Rank , - unsigned s0 , - unsigned s1 , - unsigned s2 , - unsigned s3 , - unsigned s4 , - unsigned s5 , - unsigned s6 , - unsigned s7 > -struct Shape { - - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 0 }; - enum { rank = Rank }; - - enum { N0 = s0 }; - enum { N1 = s1 }; - enum { N2 = s2 }; - enum { N3 = s3 }; - enum { N4 = s4 }; - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - {} -}; - -// 1 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , - unsigned Rank , - unsigned s1 , - unsigned s2 , - unsigned s3 , - unsigned s4 , - unsigned s5 , - unsigned s6 , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 1 }; - enum { rank = Rank }; - - unsigned N0 ; - - enum { N1 = s1 }; - enum { N2 = s2 }; - enum { N3 = s3 }; - enum { N4 = s4 }; - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - { s.N0 = n0 ; } -}; - -// 2 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , unsigned Rank , - unsigned s2 , - unsigned s3 , - unsigned s4 , - unsigned s5 , - unsigned s6 , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 2 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - - enum { N2 = s2 }; - enum { N3 = s3 }; - enum { N4 = s4 }; - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - { s.N0 = n0 ; s.N1 = n1 ; } -}; - -// 3 == dynamic_rank <= rank <= 8 -template < unsigned Rank , unsigned ScalarSize , - unsigned s3 , - unsigned s4 , - unsigned s5 , - unsigned s6 , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7> -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 3 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - - enum { N3 = s3 }; - enum { N4 = s4 }; - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; } -}; - -// 4 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , unsigned Rank , - unsigned s4 , - unsigned s5 , - unsigned s6 , - 
unsigned s7 > -struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 4 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - unsigned N3 ; - - enum { N4 = s4 }; - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , - unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; } -}; - -// 5 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , unsigned Rank , - unsigned s5 , - unsigned s6 , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 5 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - unsigned N3 ; - unsigned N4 ; - - enum { N5 = s5 }; - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , - unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 ) - { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; } -}; - -// 6 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , unsigned Rank , - unsigned s6 , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 6 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - unsigned N3 ; - unsigned N4 ; - unsigned N5 ; - - enum { N6 = s6 }; - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , - unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 ) - { - s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; - s.N4 = n4 ; s.N5 = n5 ; - } -}; - -// 7 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize , unsigned Rank , - unsigned s7 > -struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 7 }; - enum { rank = Rank }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - unsigned N3 ; - unsigned N4 ; - unsigned N5 ; - unsigned N6 ; - - enum { N7 = s7 }; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , - unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 ) - { - s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; - s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; - } -}; - -// 8 == dynamic_rank <= rank <= 8 -template < unsigned ScalarSize > -struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 > -{ - enum { scalar_size = ScalarSize }; - enum { rank_dynamic = 8 }; - enum { rank = 8 }; - - unsigned N0 ; - unsigned N1 ; - unsigned N2 ; - unsigned N3 ; - unsigned N4 ; - unsigned N5 ; - unsigned N6 ; - unsigned N7 ; - - KOKKOS_INLINE_FUNCTION - static - void assign( Shape & s , - unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 , - unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 ) - { - s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; - s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ; - } -}; - -//---------------------------------------------------------------------------- - -template< class ShapeType , unsigned N , - unsigned R = ShapeType::rank_dynamic > -struct ShapeInsert ; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 0 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 
1 , - N , - ShapeType::N0 , - ShapeType::N1 , - ShapeType::N2 , - ShapeType::N3 , - ShapeType::N4 , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 1 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - N , - ShapeType::N1 , - ShapeType::N2 , - ShapeType::N3 , - ShapeType::N4 , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 2 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - N , - ShapeType::N2 , - ShapeType::N3 , - ShapeType::N4 , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 3 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - 0 , - N , - ShapeType::N3 , - ShapeType::N4 , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 4 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - 0 , - 0 , - N , - ShapeType::N4 , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 5 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - 0 , - 0 , - 0 , - N , - ShapeType::N5 , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 6 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - 0 , - 0 , - 0 , - 0 , - N , - ShapeType::N6 > type ; -}; - -template< class ShapeType , unsigned N > -struct ShapeInsert< ShapeType , N , 7 > -{ - typedef Shape< ShapeType::scalar_size , - ShapeType::rank + 1 , - 0 , - 0 , - 0 , - 0 , - 0 , - 0 , - 0 , - N > type ; -}; - -//---------------------------------------------------------------------------- - -template< class DstShape , class SrcShape , - unsigned DstRankDynamic = DstShape::rank_dynamic , - bool DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) > -struct ShapeCompatible { enum { value = false }; }; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 8 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 7 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 6 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 5 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 4 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - 
unsigned(DstShape::N4) == unsigned(SrcShape::N4) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 3 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N3) == unsigned(SrcShape::N3) && - unsigned(DstShape::N4) == unsigned(SrcShape::N4) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 2 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N2) == unsigned(SrcShape::N2) && - unsigned(DstShape::N3) == unsigned(SrcShape::N3) && - unsigned(DstShape::N4) == unsigned(SrcShape::N4) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 1 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N1) == unsigned(SrcShape::N1) && - unsigned(DstShape::N2) == unsigned(SrcShape::N2) && - unsigned(DstShape::N3) == unsigned(SrcShape::N3) && - unsigned(DstShape::N4) == unsigned(SrcShape::N4) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -template< class DstShape , class SrcShape > -struct ShapeCompatible< DstShape , SrcShape , 0 , true > -{ - enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) && - unsigned(DstShape::N0) == unsigned(SrcShape::N0) && - unsigned(DstShape::N1) == unsigned(SrcShape::N1) && - unsigned(DstShape::N2) == unsigned(SrcShape::N2) && - unsigned(DstShape::N3) == unsigned(SrcShape::N3) && - unsigned(DstShape::N4) == unsigned(SrcShape::N4) && - unsigned(DstShape::N5) == unsigned(SrcShape::N5) && - unsigned(DstShape::N6) == unsigned(SrcShape::N6) && - unsigned(DstShape::N7) == unsigned(SrcShape::N7) }; -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< unsigned ScalarSize , unsigned Rank , - unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , - unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 , - typename iType > -KOKKOS_INLINE_FUNCTION -size_t dimension( - const Shape & shape , - const iType & r ) -{ - return 0 == r ? shape.N0 : ( - 1 == r ? shape.N1 : ( - 2 == r ? shape.N2 : ( - 3 == r ? shape.N3 : ( - 4 == r ? shape.N4 : ( - 5 == r ? shape.N5 : ( - 6 == r ? shape.N6 : ( - 7 == r ? 
shape.N7 : 1 ))))))); -} - -template< unsigned ScalarSize , unsigned Rank , - unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 , - unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 > -size_t cardinality_count( - const Shape & shape ) -{ - return shape.N0 * shape.N1 * shape.N2 * shape.N3 * - shape.N4 * shape.N5 * shape.N6 * shape.N7 ; -} - -//---------------------------------------------------------------------------- - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_CORESHAPE_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/kokkos/kokkos/core/src/impl/Kokkos_StaticAssert.hpp deleted file mode 100644 index f1017c3..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_StaticAssert.hpp +++ /dev/null @@ -1,79 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_STATICASSERT_HPP -#define KOKKOS_STATICASSERT_HPP - -namespace Kokkos { -namespace Impl { - -template < bool , class T = void > -struct StaticAssert ; - -template< class T > -struct StaticAssert< true , T > { - typedef T type ; - static const bool value = true ; -}; - -template < class A , class B > -struct StaticAssertSame ; - -template < class A > -struct StaticAssertSame { typedef A type ; }; - -template < class A , class B > -struct StaticAssertAssignable ; - -template < class A > -struct StaticAssertAssignable { typedef A type ; }; - -template < class A > -struct StaticAssertAssignable< const A , A > { typedef const A type ; }; - -} // namespace Impl -} // namespace Kokkos - -#endif /* KOKKOS_STATICASSERT_HPP */ - - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Timer.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Timer.hpp deleted file mode 100644 index 700653b..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Timer.hpp +++ /dev/null @@ -1,115 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_IMPLWALLTIME_HPP -#define KOKKOS_IMPLWALLTIME_HPP - -#include - -#ifdef _MSC_VER -#undef KOKKOS_USE_LIBRT -#include -#else -#ifdef KOKKOS_USE_LIBRT -#include -#else -#include -#endif -#endif - -namespace Kokkos { -namespace Impl { - -/** \brief Time since construction */ - -class Timer { -private: - #ifdef KOKKOS_USE_LIBRT - struct timespec m_old; - #else - struct timeval m_old ; - #endif - Timer( const Timer & ); - Timer & operator = ( const Timer & ); -public: - - inline - void reset() { - #ifdef KOKKOS_USE_LIBRT - clock_gettime(&m_old); - #else - gettimeofday( & m_old , ((struct timezone *) NULL ) ); - #endif - } - - inline - ~Timer() {} - - inline - Timer() { reset(); } - - inline - double seconds() const - { - #ifdef KOKKOS_USE_LIBRT - struct timespec m_new; - clock_gettime(&m_new); - - return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) + - ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 ); - #else - struct timeval m_new ; - - ::gettimeofday( & m_new , ((struct timezone *) NULL ) ); - - return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) + - ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 ); - #endif - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Traits.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Traits.hpp deleted file mode 100644 index 459a769..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Traits.hpp +++ /dev/null @@ -1,274 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOSTRAITS_HPP -#define KOKKOSTRAITS_HPP - -#include -#include - -namespace Kokkos { -namespace Impl { - -/* C++11 conformal compile-time type traits utilities. - * Prefer to use C++11 when portably available. - */ -//---------------------------------------------------------------------------- -// C++11 Helpers: - -template < class T , T v > -struct integral_constant -{ - static const T value = v ; - typedef T value_type; - typedef integral_constant type; - KOKKOS_INLINE_FUNCTION operator T() { return v ; } -}; - -typedef integral_constant false_type ; -typedef integral_constant true_type ; - -//---------------------------------------------------------------------------- -// C++11 Type relationships: - -template< class X , class Y > struct is_same : public false_type {}; -template< class X > struct is_same : public true_type {}; - -//---------------------------------------------------------------------------- -// C++11 Type properties: - -template struct is_const : public false_type {}; -template struct is_const : public true_type {}; -template struct is_const : public true_type {}; - -//---------------------------------------------------------------------------- -// C++11 Type transformations: - -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T & type; }; - -template struct add_const { typedef const T type; }; -template struct add_const { typedef const T & type; }; -template struct add_const { typedef const T type; }; -template struct add_const { typedef const T & type; }; - -template struct remove_reference { typedef T type ; }; -template struct remove_reference< T & > { typedef T type ; }; -template struct remove_reference< const T & > { typedef const T type ; }; - -//---------------------------------------------------------------------------- -// C++11 Other type generators: - -template< bool , class T , class F > -struct condition { typedef F type ; }; - -template< class T , class F > -struct condition { typedef T type ; }; - -template< bool , class = void > -struct enable_if ; - -template< class T > -struct enable_if< true , T > { typedef T type ; }; - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -// Other traits - -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- - -template< class , class T = void > -struct enable_if_type { typedef T type ; }; - -//---------------------------------------------------------------------------- - -template< bool B > -struct bool_ : public integral_constant {}; - -template< unsigned I > -struct unsigned_ : public integral_constant {}; - -template< int I > -struct int_ : public integral_constant {}; - -//---------------------------------------------------------------------------- -// if_ - -template < bool Cond , typename TrueType , typename FalseType> -struct if_c -{ - enum { value = Cond }; - - typedef FalseType type; - - - typedef typename remove_const< - typename remove_reference::type >::type value_type ; - - typedef typename add_const::type const_value_type ; - - static KOKKOS_INLINE_FUNCTION - 
const_value_type & select( const_value_type & v ) { return v ; } - - static KOKKOS_INLINE_FUNCTION - value_type & select( value_type & v ) { return v ; } - - template< class T > - static KOKKOS_INLINE_FUNCTION - value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } - - - template< class T > - static KOKKOS_INLINE_FUNCTION - const_value_type & select( const T & , const_value_type & v ) { return v ; } - - template< class T > - static KOKKOS_INLINE_FUNCTION - value_type & select( const T & , value_type & v ) { return v ; } -}; - -template -struct if_c< true , TrueType , FalseType > -{ - enum { value = true }; - - typedef TrueType type; - - - typedef typename remove_const< - typename remove_reference::type >::type value_type ; - - typedef typename add_const::type const_value_type ; - - static KOKKOS_INLINE_FUNCTION - const_value_type & select( const_value_type & v ) { return v ; } - - static KOKKOS_INLINE_FUNCTION - value_type & select( value_type & v ) { return v ; } - - template< class T > - static KOKKOS_INLINE_FUNCTION - value_type & select( const T & ) { value_type * ptr(0); return *ptr ; } - - - template< class F > - static KOKKOS_INLINE_FUNCTION - const_value_type & select( const_value_type & v , const F & ) { return v ; } - - template< class F > - static KOKKOS_INLINE_FUNCTION - value_type & select( value_type & v , const F & ) { return v ; } -}; - - -template -struct if_ : public if_c {}; - -//---------------------------------------------------------------------------- - -template -struct is_power_of_two -{ - enum type { value = (N > 0) && !(N & (N-1)) }; -}; - -template < size_t N , bool OK = is_power_of_two::value > -struct power_of_two ; - -template < size_t N > -struct power_of_two -{ - enum type { value = 1+ power_of_two<(N>>1),true>::value }; -}; - -template <> -struct power_of_two<2,true> -{ - enum type { value = 1 }; -}; - -template <> -struct power_of_two<1,true> -{ - enum type { value = 0 }; -}; - -//---------------------------------------------------------------------------- - -template< typename T , T v , bool NonZero = ( v != T(0) ) > -struct integral_nonzero_constant -{ - static const T value = v ; - typedef T value_type ; - typedef integral_nonzero_constant type ; - KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {} -}; - -template< typename T , T zero > -struct integral_nonzero_constant -{ - const T value ; - typedef T value_type ; - typedef integral_nonzero_constant type ; - KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {} -}; - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOSTRAITS_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_Utility.hpp b/kokkos/kokkos/core/src/impl/Kokkos_Utility.hpp deleted file mode 100644 index d80324c..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_Utility.hpp +++ /dev/null @@ -1,246 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
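[Editor's note] The deleted Kokkos_Traits.hpp above supplies pre-C++11 stand-ins for the standard type traits (integral_constant, is_same, enable_if, if_c, is_power_of_two, integral_nonzero_constant). A short usage sketch of that style of compile-time dispatch follows, under the assumption that simplified re-implementations are acceptable for illustration; it is not the deleted header's exact interface.

// Minimal sketch of how traits like the deleted enable_if / is_power_of_two
// are typically used.  Simplified re-implementations, illustration only.
#include <cstdio>

template <bool, class T = void> struct enable_if {};
template <class T> struct enable_if<true, T> { typedef T type; };

template <unsigned N>
struct is_power_of_two { enum { value = (N > 0) && !(N & (N - 1)) }; };

// Only instantiable when BlockSize is a power of two:
template <unsigned BlockSize>
typename enable_if<is_power_of_two<BlockSize>::value, unsigned>::type
round_up(unsigned n) { return (n + BlockSize - 1) & ~(BlockSize - 1); }

int main()
{
  std::printf("%u\n", round_up<8>(13));   // prints 16
  // round_up<6>(13);  // would not compile: 6 is not a power of two
  return 0;
}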
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_UTILITY_HPP -#define KOKKOS_UTILITY_HPP - -#include - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template < bool , class T , class F > struct or_ ; - -template < class T , class F > struct or_ { typedef T type ; }; -template < class T , class F > struct or_ { typedef F type ; }; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< typename T , typename TS1 , typename TS2 = TS1 > -union UnionPair -{ -private: - typedef typename or_< sizeof(T) == sizeof(TS2) , TS2 , void >::type ts2_type ; - typedef typename or_< sizeof(T) == sizeof(TS1) , TS1 , ts2_type >::type ts_type ; -public: - - typedef T first_type ; - typedef ts_type second_type ; - - first_type first ; - second_type second ; - - KOKKOS_INLINE_FUNCTION - UnionPair() {} - - KOKKOS_INLINE_FUNCTION - UnionPair( const second_type & rhs ) : second(rhs) {} - - KOKKOS_INLINE_FUNCTION - static - second_type * cast( first_type * const ptr ) - { return reinterpret_cast( ptr ); } - - KOKKOS_INLINE_FUNCTION - static - const second_type * cast( const first_type * const ptr ) - { return reinterpret_cast( ptr ); } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type * const ptr ) - { return reinterpret_cast( ptr ); } - - KOKKOS_INLINE_FUNCTION - static - second_type & cast( first_type & ptr ) - { return reinterpret_cast( ptr ); } - - KOKKOS_INLINE_FUNCTION - static - const second_type & cast( const first_type & ptr ) - { return reinterpret_cast( ptr ); } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type & ptr ) - { return reinterpret_cast( ptr ); } -}; - - -template< typename T > -union UnionPair -{ - typedef T 
first_type ; - typedef T second_type ; - - first_type first ; - second_type second ; - - KOKKOS_INLINE_FUNCTION - UnionPair() {} - - KOKKOS_INLINE_FUNCTION - UnionPair( const first_type & rhs ) : first(rhs) {} - - KOKKOS_INLINE_FUNCTION - static - second_type * cast( first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type * cast( const first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - second_type & cast( first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type & cast( const first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type & ptr ) { return ptr ; } -}; - -template< typename T , typename TS2 > -union UnionPair -{ - typedef T first_type ; - typedef T second_type ; - - first_type first ; - second_type second ; - - KOKKOS_INLINE_FUNCTION - UnionPair() {} - - KOKKOS_INLINE_FUNCTION - UnionPair( const first_type & rhs ) : first(rhs) {} - - KOKKOS_INLINE_FUNCTION - static - second_type * cast( first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type * cast( const first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - second_type & cast( first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type & cast( const first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type & ptr ) { return ptr ; } -}; - - -template< typename T , typename TS1 > -union UnionPair -{ - typedef T first_type ; - typedef T second_type ; - - first_type first ; - second_type second ; - - KOKKOS_INLINE_FUNCTION - UnionPair() {} - - KOKKOS_INLINE_FUNCTION - UnionPair( const first_type & rhs ) : first(rhs) {} - - KOKKOS_INLINE_FUNCTION - static - second_type * cast( first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type * cast( const first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type * const ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - second_type & cast( first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - const second_type & cast( const first_type & ptr ) { return ptr ; } - - KOKKOS_INLINE_FUNCTION - static - volatile second_type * cast( volatile first_type & ptr ) { return ptr ; } -}; - -} -} - -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_UTILITY_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_ViewDefault.hpp b/kokkos/kokkos/core/src/impl/Kokkos_ViewDefault.hpp deleted file mode 100644 index c8279e4..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_ViewDefault.hpp +++ /dev/null @@ -1,656 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. 
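[Editor's note] The UnionPair template deleted above pairs a value type with a size-matched "shadow" type so the same storage can be addressed through either member (for example, viewing a floating-point value through an integer of the same width). A simplified sketch of that idea follows; the names are illustrative, and like the original it relies on the type-punning-through-a-union idiom.

// Sketch of the UnionPair idea: one storage location, two views of it.
// Reading the inactive member is type punning; the deleted code depends
// on the same idiom.
#include <cstdio>
#include <stdint.h>

union DoubleBits {
  double   first;    // the value type
  uint64_t second;   // a size-matched integral alias of the same storage
};

int main()
{
  DoubleBits u;
  u.first = 1.0;
  std::printf("bits of 1.0: 0x%016llx\n",
              (unsigned long long) u.second);  // 0x3ff0000000000000
  return 0;
}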
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_VIEWDEFAULT_HPP -#define KOKKOS_VIEWDEFAULT_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template<> -struct ViewAssignment< LayoutDefault , LayoutDefault , void > -{ - typedef LayoutDefault Specialize ; - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-1 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 1 ) - ), unsigned >::type i0 ) - { - typedef ViewTraits dst_traits ; - - assert_shape_bounds( src.m_shape , 1 , i0 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - dst.m_ptr_on_device = src.m_ptr_on_device + i0 ; - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-2 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 2 ) - ), unsigned >::type i0 , - const unsigned i1 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape , 2 , i0 , i1 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) 
{ - dst.m_ptr_on_device = src.m_ptr_on_device + i0 + src.m_stride.value * i1 ; - } - else { - dst.m_ptr_on_device = src.m_ptr_on_device + i1 + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-3 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 3 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 3, i0, i1, i2 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 )); - } - else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i2 + src.m_shape.N2 * ( - i1 ) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-4 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 4 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 , - const unsigned i3 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 4, i0, i1, i2, i3 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 + src.m_shape.N2 * ( - i3 ))); - } - else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i3 + src.m_shape.N3 * ( - i2 + src.m_shape.N2 * ( - i1 )) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-5 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 5 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 , - const unsigned i3 , - const unsigned i4 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 5, i0, i1, i2, i3, i4); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 + src.m_shape.N2 * ( - i3 + src.m_shape.N3 * ( - i4 )))); - } 
- else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i4 + src.m_shape.N4 * ( - i3 + src.m_shape.N3 * ( - i2 + src.m_shape.N2 * ( - i1 ))) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-6 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 6 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 , - const unsigned i3 , - const unsigned i4 , - const unsigned i5 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 6, i0, i1, i2, i3, i4, i5); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 + src.m_shape.N2 * ( - i3 + src.m_shape.N3 * ( - i4 + src.m_shape.N4 * ( - i5 ))))); - } - else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i5 + src.m_shape.N5 * ( - i4 + src.m_shape.N4 * ( - i3 + src.m_shape.N3 * ( - i2 + src.m_shape.N2 * ( - i1 )))) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-7 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 7 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 , - const unsigned i3 , - const unsigned i4 , - const unsigned i5 , - const unsigned i6 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 7, i0, i1, i2, i3, i4, i5, i6 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 + src.m_shape.N2 * ( - i3 + src.m_shape.N3 * ( - i4 + src.m_shape.N4 * ( - i5 + src.m_shape.N5 * ( - i6 )))))); - } - else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i6 + src.m_shape.N6 * ( - i5 + src.m_shape.N5 * ( - i4 + src.m_shape.N4 * ( - i3 + src.m_shape.N3 * ( - i2 + src.m_shape.N2 * ( - i1 ))))) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-0 from Rank-8 */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if< ( - ViewAssignable< ViewTraits , - ViewTraits >::assignable_value && - ( ViewTraits::rank == 0 ) && - ( ViewTraits::rank == 8 ) - ), unsigned >::type i0 , - const unsigned i1 , - const unsigned i2 , - const unsigned i3 , - const unsigned i4 , 
- const unsigned i5 , - const unsigned i6 , - const unsigned i7 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - enum { is_left = is_same< typename src_traits::array_layout , LayoutLeft >::value }; - - assert_shape_bounds( src.m_shape, 8, i0, i1, i2, i3, i4, i5, i6, i7 ); - - ViewTracking< dst_traits >::decrement( dst.m_ptr_on_device ); - - if ( is_left ) { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i0 + src.m_stride.value * ( - i1 + src.m_shape.N1 * ( - i2 + src.m_shape.N2 * ( - i3 + src.m_shape.N3 * ( - i4 + src.m_shape.N4 * ( - i5 + src.m_shape.N5 * ( - i6 + src.m_shape.N6 * i7 )))))); - } - else { - dst.m_ptr_on_device = - src.m_ptr_on_device + - i7 + src.m_shape.N7 * ( - i6 + src.m_shape.N6 * ( - i5 + src.m_shape.N5 * ( - i4 + src.m_shape.N4 * ( - i3 + src.m_shape.N3 * ( - i2 + src.m_shape.N2 * ( - i1 )))))) + i0 * src.m_stride.value ; - } - - ViewTracking< dst_traits >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-1 array from range of Rank-1 array, either layout */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM , - typename iType > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const std::pair & range , - typename enable_if< ( - ViewAssignable< ViewTraits , ViewTraits >::assignable_value - && - ( ViewTraits::rank == 1 ) - && - ( ViewTraits::rank == 1 ) - && - ( ViewTraits::rank_dynamic == 1 ) - ) >::type * = 0 ) - { - typedef ViewTraits traits_type ; - typedef typename traits_type::shape_type shape_type ; - - ViewTracking< traits_type >::decrement( dst.m_ptr_on_device ); - - dst.m_shape.N0 = 0 ; - dst.m_ptr_on_device = 0 ; - - if ( range.first < range.second ) { - assert_shape_bounds( src.m_shape , 1 , range.first ); - assert_shape_bounds( src.m_shape , 1 , range.second - 1 ); - - dst.m_shape.N0 = range.second - range.first ; - dst.m_ptr_on_device = src.m_ptr_on_device + range.first ; - - ViewTracking< traits_type >::increment( dst.m_ptr_on_device ); - } - } - - //------------------------------------ - /** \brief Extract Rank-1 array from LayoutLeft Rank-2 array. */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const ALL & , - const typename enable_if< ( - ViewAssignable< ViewTraits , ViewTraits >::assignable_value - && - is_same< typename ViewTraits::array_layout , LayoutLeft >::value - && - ( ViewTraits::rank == 2 ) - && - ( ViewTraits::rank == 1 ) - && - ( ViewTraits::rank_dynamic == 1 ) - ), unsigned >::type i1 ) - { - typedef ViewTraits traits_type ; - - ViewTracking< traits_type >::decrement( dst.m_ptr_on_device ); - - dst.m_shape.N0 = src.m_shape.N0 ; - dst.m_ptr_on_device = src.m_ptr_on_device + src.m_stride.value * i1 ; - - ViewTracking< traits_type >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract Rank-1 array from LayoutRight Rank-2 array. 
*/ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const unsigned i0 , - const typename enable_if< ( - ViewAssignable< ViewTraits , ViewTraits >::assignable_value - && - is_same< typename ViewTraits::array_layout , LayoutRight >::value - && - ( ViewTraits::rank == 2 ) - && - ( ViewTraits::rank == 1 ) - && - ( ViewTraits::rank_dynamic == 1 ) - ), ALL >::type & ) - { - typedef ViewTraits traits_type ; - - ViewTracking< traits_type >::decrement( dst.m_ptr_on_device ); - - dst.m_shape.N0 = src.m_shape.N1 ; - dst.m_ptr_on_device = src.m_ptr_on_device + src.m_stride.value * i0 ; - - ViewTracking< traits_type >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Extract LayoutRight Rank-N array from range of LayoutRight Rank-N array */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM , - typename iType > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const std::pair & range , - typename enable_if< ( - ViewAssignable< ViewTraits , ViewTraits >::value - && - Impl::is_same< typename ViewTraits::array_layout , LayoutRight >::value - && - ( ViewTraits::rank > 1 ) - && - ( ViewTraits::rank_dynamic > 0 ) - )>::type * = 0 ) - { - typedef ViewTraits traits_type ; - typedef typename traits_type::shape_type shape_type ; - typedef typename View::stride_type stride_type ; - - ViewTracking< traits_type >::decrement( dst.m_ptr_on_device ); - - shape_type ::assign( dst.m_shape, 0, 0, 0, 0, 0, 0, 0, 0 ); - stride_type::assign( dst.m_stride , 0 ); - dst.m_ptr_on_device = 0 ; - - if ( range.first < range.second ) { - assert_shape_bounds( src.m_shape , 8 , range.first , 0,0,0,0,0,0,0); - assert_shape_bounds( src.m_shape , 8 , range.second - 1 , 0,0,0,0,0,0,0); - - shape_type::assign( dst.m_shape, range.second - range.first , - src.m_shape.N1 , src.m_shape.N2 , src.m_shape.N3 , - src.m_shape.N4 , src.m_shape.N5 , src.m_shape.N6 , src.m_shape.N7 ); - - stride_type::assign( dst.m_stride , src.m_stride.value ); - - dst.m_ptr_on_device = src.m_ptr_on_device + range.first * src.m_stride.value ; - - ViewTracking< traits_type >::increment( dst.m_ptr_on_device ); - } - } - - //------------------------------------ - /** \brief Extract rank-2 from rank-2 array */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM , - typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const std::pair & range0 , - const std::pair & range1 , - typename enable_if< ( - ViewAssignable< ViewTraits , ViewTraits >::value - && - ViewTraits::rank == 2 - && - ViewTraits::rank_dynamic == 2 - ) >::type * = 0 ) - { - typedef ViewTraits traits_type ; - typedef typename traits_type::shape_type shape_type ; - enum { left = is_same< typename traits_type::array_layout , LayoutLeft >::value }; - - ViewTracking< traits_type >::decrement( dst.m_ptr_on_device ); - - dst.m_shape.N0 = 0 ; - dst.m_shape.N1 = 0 ; - dst.m_stride.value = 0 ; - dst.m_ptr_on_device = 0 ; - - if ( range0.first < range0.second && range1.first < range1.second ) { - assert_shape_bounds( src.m_shape , 2 , range0.first , range1.first ); - assert_shape_bounds( src.m_shape , 2 , range0.second - 1 , range1.second - 1 ); - - dst.m_shape.N0 = range0.second - range0.first ; - dst.m_shape.N1 = range1.second - range1.first ; - dst.m_stride 
= src.m_stride ; - - if ( left ) { - // operator: dst.m_ptr_on_device[ i0 + dst.m_stride * i1 ] - dst.m_ptr_on_device = src.m_ptr_on_device + range0.first + dst.m_stride.value * range1.first ; - } - else { - // operator: dst.m_ptr_on_device[ i0 * dst.m_stride + i1 ] - dst.m_ptr_on_device = src.m_ptr_on_device + range0.first * dst.m_stride.value + range1.first ; - } - - ViewTracking< traits_type >::increment( dst.m_ptr_on_device ); - } - } - - //------------------------------------ - /** \brief Deep copy data from compatible value type, layout, rank, and specialization. - * Check the dimensions and allocation lengths at runtime. - */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - inline static - void deep_copy( const View & dst , - const View & src , - const typename Impl::enable_if<( - Impl::is_same< typename ViewTraits::scalar_type , - typename ViewTraits::non_const_scalar_type >::value - && - Impl::is_same< typename ViewTraits::array_layout , - typename ViewTraits::array_layout >::value - && - ( unsigned(ViewTraits::rank) == unsigned(ViewTraits::rank) ) - )>::type * = 0 ) - { - typedef typename ViewTraits::memory_space dst_memory_space ; - typedef typename ViewTraits::memory_space src_memory_space ; - - if ( dst.m_ptr_on_device != src.m_ptr_on_device ) { - - Impl::assert_shapes_are_equal( dst.m_shape , src.m_shape ); - - const size_t nbytes = dst.m_shape.scalar_size * capacity( dst.m_shape , dst.m_stride ); - - DeepCopy< dst_memory_space , src_memory_space >( dst.m_ptr_on_device , src.m_ptr_on_device , nbytes ); - } - } -}; - -//---------------------------------------------------------------------------- - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_VIEWDEFAULT_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/kokkos/kokkos/core/src/impl/Kokkos_ViewSupport.hpp deleted file mode 100644 index 29d195b..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_ViewSupport.hpp +++ /dev/null @@ -1,510 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_VIEWSUPPORT_HPP -#define KOKKOS_VIEWSUPPORT_HPP - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief Evaluate if LHS = RHS view assignment is allowed. */ -template< class ViewLHS , class ViewRHS > -struct ViewAssignable -{ - // Same memory space. - // Same value type. - // Compatible 'const' qualifier - // Cannot assign managed = unmannaged - enum { assignable_value = - ( is_same< typename ViewLHS::value_type , - typename ViewRHS::value_type >::value - || - is_same< typename ViewLHS::value_type , - typename ViewRHS::const_value_type >::value ) - && - is_same< typename ViewLHS::memory_space , - typename ViewRHS::memory_space >::value - && - ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) ) - }; - - enum { assignable_shape = - // Compatible shape and matching layout: - ( ShapeCompatible< typename ViewLHS::shape_type , - typename ViewRHS::shape_type >::value - && - is_same< typename ViewLHS::array_layout , - typename ViewRHS::array_layout >::value ) - || - // Matching layout, same rank, and LHS dynamic rank - ( is_same< typename ViewLHS::array_layout , - typename ViewRHS::array_layout >::value - && - int(ViewLHS::rank) == int(ViewRHS::rank) - && - int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) ) - || - // Both rank-0, any shape and layout - ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 ) - || - // Both rank-1 and LHS is dynamic rank-1, any shape and layout - ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 && - int(ViewLHS::rank_dynamic) == 1 ) - }; - - enum { value = assignable_value && assignable_shape }; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class ShapeType , class LayoutType , class Enable = void > -class LayoutStride ; - -/* Arrays with rank <= 1 have no stride */ -template< class ShapeType , class LayoutType > -class LayoutStride< ShapeType , LayoutType , - typename enable_if< ShapeType::rank <= 1 >::type > -{ -public: - - enum { dynamic = false }; - enum { value = 0 }; - - KOKKOS_INLINE_FUNCTION static - void assign( LayoutStride & , const unsigned ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_no_padding( LayoutStride & , const ShapeType & ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_with_padding( LayoutStride & , const ShapeType & ) {} -}; - -/* Array with LayoutLeft and 0 == rank_dynamic have static stride that are is not padded. 
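[Editor's note] The ViewAssignable trait above is a compile-time gate: an enum that is non-zero only when the value types, memory spaces, and managedness of the two views are compatible, which the assignment machinery then consumes through enable_if. A reduced sketch of that pattern using the standard type traits follows; the trait, space tag, and member names are hypothetical stand-ins, not the Kokkos implementation.

```cpp
// Compile-time assignability gate in the style of ViewAssignable (sketch).
#include <type_traits>

struct HostSpace {};  // stand-in memory-space tag

struct HostViewInt      { typedef int       value_type; typedef HostSpace memory_space; int*       ptr; };
struct HostViewConstInt { typedef const int value_type; typedef HostSpace memory_space; const int* ptr; };

template <class LHS, class RHS>
struct assignable {
  enum {
    value =
      // same value type, or LHS is the const-qualified version of RHS
      ( std::is_same<typename LHS::value_type, typename RHS::value_type>::value ||
        std::is_same<typename LHS::value_type, const typename RHS::value_type>::value )
      &&
      // same memory space
      std::is_same<typename LHS::memory_space, typename RHS::memory_space>::value
  };
};

// Participates in overload resolution only when the trait accepts the pair,
// e.g. assign(const_view, nonconst_view) compiles, the reverse does not.
template <class LHS, class RHS>
typename std::enable_if<assignable<LHS, RHS>::value>::type
assign(LHS& dst, const RHS& src) { dst.ptr = src.ptr; }  // hypothetical shallow copy
```

The LayoutStride specializations that follow use the same enable_if dispatch, selecting a compile-time or runtime stride from the shape's rank and rank_dynamic.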
*/ -template< class ShapeType > -class LayoutStride< ShapeType , LayoutLeft , - typename enable_if<( - ( 1 < ShapeType::rank ) && - ( 0 == ShapeType::rank_dynamic ) - )>::type > -{ -public: - - enum { dynamic = false }; - enum { value = ShapeType::N0 }; - - KOKKOS_INLINE_FUNCTION static - void assign( LayoutStride & , const unsigned ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_no_padding( LayoutStride & , const ShapeType & ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_with_padding( LayoutStride & , const ShapeType & ) {} -}; - -/* Array with LayoutRight and 1 >= rank_dynamic have static stride that is not padded */ -template< class ShapeType > -class LayoutStride< ShapeType , LayoutRight , - typename enable_if<( - ( 1 < ShapeType::rank ) && - ( 1 >= ShapeType::rank_dynamic ) - )>::type > -{ -public: - - enum { dynamic = false }; - enum { value = ShapeType::N1 * ShapeType::N2 * ShapeType::N3 * - ShapeType::N4 * ShapeType::N5 * ShapeType::N6 * ShapeType::N7 }; - - KOKKOS_INLINE_FUNCTION static - void assign( LayoutStride & , const unsigned ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_no_padding( LayoutStride & , const ShapeType & ) {} - - KOKKOS_INLINE_FUNCTION static - void assign_with_padding( LayoutStride & , const ShapeType & ) {} -}; - - -/* Otherwise array has runtime stride that is padded. */ -template< class ShapeType , class LayoutType , class Enable > -class LayoutStride -{ -public: - - enum { dynamic = true }; - - unsigned value ; - - KOKKOS_INLINE_FUNCTION static - void assign( LayoutStride & stride , const unsigned n ) { stride.value = n ; } - - KOKKOS_INLINE_FUNCTION static - void assign_no_padding( LayoutStride & vs , const ShapeType & sh ) - { - enum { left = is_same< LayoutType , LayoutLeft >::value }; - - // Left layout arrays are aligned on the first dimension. - // Right layout arrays are aligned on blocks of the 2-8th dimensions. - vs.value = ShapeType::rank <= 1 ? 0 : ( - left ? sh.N0 - : sh.N1 * sh.N2 * sh.N3 * sh.N4 * sh.N5 * sh.N6 * sh.N7 ); - } - - KOKKOS_INLINE_FUNCTION static - void assign_with_padding( LayoutStride & vs , const ShapeType & sh ) - { - enum { div = MEMORY_ALIGNMENT / ShapeType::scalar_size }; - enum { mod = MEMORY_ALIGNMENT % ShapeType::scalar_size }; - enum { align = 0 == mod ? div : 0 }; - - assign_no_padding( vs , sh ); - - if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < vs.value ) { - - const unsigned count_mod = vs.value % ( div ? div : 1 ); - - if ( count_mod ) { vs.value += align - count_mod ; } - } - } -}; - -template< class ShapeType , class LayoutType > -KOKKOS_INLINE_FUNCTION -size_t capacity( const ShapeType & shape , - const LayoutStride< ShapeType , LayoutType > & stride ) -{ - enum { left = is_same< LayoutType , LayoutLeft >::value }; - - return ShapeType::rank <= 1 ? size_t(shape.N0) : ( - left ? 
size_t( stride.value * shape.N1 * shape.N2 * shape.N3 * shape.N4 * shape.N5 * shape.N6 * shape.N7 ) - : size_t( stride.value * shape.N0 )); -} - -template< typename iType , class ShapeType , class LayoutType > -KOKKOS_INLINE_FUNCTION -void stride( iType * const s , const ShapeType & shape , - const LayoutStride< ShapeType , LayoutType > & stride ) -{ - enum { rank = ShapeType::rank }; - enum { left = is_same< LayoutType , LayoutLeft >::value }; - - if ( 0 < rank ) { - if ( 1 == rank ) { - s[0] = 1 ; - } - else if ( left ) { - s[0] = 1 ; - s[1] = stride.value ; - if ( 2 < rank ) { s[2] = s[1] * shape.N1 ; } - if ( 3 < rank ) { s[3] = s[2] * shape.N2 ; } - if ( 4 < rank ) { s[4] = s[3] * shape.N3 ; } - if ( 5 < rank ) { s[5] = s[4] * shape.N4 ; } - if ( 6 < rank ) { s[6] = s[5] * shape.N5 ; } - if ( 7 < rank ) { s[7] = s[6] * shape.N6 ; } - } - else { - s[rank-1] = 1 ; - if ( 7 < rank ) { s[6] = s[7] * shape.N7 ; } - if ( 6 < rank ) { s[5] = s[6] * shape.N6 ; } - if ( 5 < rank ) { s[4] = s[5] * shape.N5 ; } - if ( 4 < rank ) { s[3] = s[4] * shape.N4 ; } - if ( 3 < rank ) { s[2] = s[3] * shape.N3 ; } - if ( 2 < rank ) { s[1] = s[2] * shape.N2 ; } - s[0] = stride.value ; - } - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -/** \brief View tracking increment/decrement only happens when - * view memory is managed and executing in the host space. - */ -template< class ViewTraits , class Enable = void > -struct ViewTracking { - KOKKOS_INLINE_FUNCTION static void increment( const void * ) {} - KOKKOS_INLINE_FUNCTION static void decrement( const void * ) {} -}; - -template< class ViewTraits > -struct ViewTracking< ViewTraits , - typename enable_if<( - ViewTraits::is_managed && - Impl::is_same< HostSpace , ExecutionSpace >::value - )>::type > -{ - typedef typename ViewTraits::memory_space memory_space ; - - KOKKOS_INLINE_FUNCTION static void increment( const void * ptr ) - { memory_space::increment( ptr ); } - - KOKKOS_INLINE_FUNCTION static void decrement( const void * ptr ) - { memory_space::decrement( ptr ); } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template< class DstMemorySpace , class SrcMemorySpace > -struct DeepCopy ; - -template< class OutputView , unsigned Rank = OutputView::Rank > -struct ViewInit -{ - typedef typename OutputView::device_type device_type ; - typedef typename OutputView::scalar_type scalar_type ; - typedef typename device_type::size_type size_type ; - - const OutputView output ; - - explicit ViewInit( const OutputView & arg_out ) : output( arg_out ) - { parallel_for( output.dimension_0() , *this ); } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_type i0 ) const - { - const scalar_type default_value = scalar_type(); - - for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) { - for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) { - for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) { - for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) { - for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) { - for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) { - for ( size_type i7 = 0 ; i7 < output.dimension_7() ; 
++i7 ) { - new (&output.at(i0,i1,i2,i3,i4,i5,i6,i7)) scalar_type(default_value) ; - }}}}}}} - } -}; - -template< class OutputView > -struct ViewInit< OutputView , 1 > -{ - typedef typename OutputView::device_type device_type ; - typedef typename OutputView::value_type value_type ; - typedef typename device_type::size_type size_type ; - - const OutputView output ; - - explicit ViewInit( const OutputView & arg_out ) : output( arg_out ) - { parallel_for( output.dimension_0() , *this ); } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_type i0 ) const - { - value_type default_value = value_type(); - new (&output(i0)) value_type(default_value) ; - } -}; - -template< class OutputView > -struct ViewInit< OutputView , 0 > -{ - typedef typename OutputView::device_type device_type ; - typedef typename OutputView::value_type value_type ; - typedef typename device_type::size_type size_type ; - - const OutputView output ; - - explicit ViewInit( const OutputView & arg_out ) : output( arg_out ) - { parallel_for( 1 , *this ); } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_type /*i0*/ ) const - { - value_type default_value = value_type(); - new (&(*output)) value_type(default_value) ; - } -}; - -template< class Device > -struct ViewInitialize -{ - template< class ViewType > - inline explicit ViewInitialize( const ViewType & view ) - { ViewInit init( view ); } -}; - -template< class OutputView , class InputView , unsigned Rank = OutputView::Rank > -struct ViewRemap -{ - typedef typename OutputView::device_type device_type ; - typedef typename device_type::size_type size_type ; - - const OutputView output ; - const InputView input ; - const size_type n0 ; - const size_type n1 ; - const size_type n2 ; - const size_type n3 ; - const size_type n4 ; - const size_type n5 ; - const size_type n6 ; - const size_type n7 ; - - ViewRemap( const OutputView & arg_out , const InputView & arg_in ) - : output( arg_out ), input( arg_in ) - , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) ) - , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) ) - , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) ) - , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) ) - , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) ) - , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) ) - , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) ) - , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) ) - { - parallel_for( n0 , *this ); - } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_type i0 ) const - { - for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) { - for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) { - for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) { - for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) { - for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) { - for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) { - for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) { - output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7); - }}}}}}} - } -}; - -template< class OutputView , class InputView > -struct ViewRemap< OutputView , InputView , 0 > -{ - typedef typename OutputView::value_type value_type ; - typedef typename OutputView::memory_space dst_space ; - typedef typename InputView ::memory_space src_space ; - - ViewRemap( const OutputView & arg_out , const InputView & arg_in ) - { - DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() 
, - arg_in.ptr_on_device() , - sizeof(value_type) ); - } -}; - -template< class OutputView , unsigned Rank = OutputView::Rank > -struct ViewFill -{ - typedef typename OutputView::device_type device_type ; - typedef typename OutputView::const_value_type const_value_type ; - typedef typename device_type::size_type size_type ; - - const OutputView output ; - const_value_type input ; - - ViewFill( const OutputView & arg_out , const_value_type & arg_in ) - : output( arg_out ), input( arg_in ) - { - parallel_for( output.dimension_0() , *this ); - } - - KOKKOS_INLINE_FUNCTION - void operator()( const size_type i0 ) const - { - for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) { - for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) { - for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) { - for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) { - for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) { - for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) { - for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) { - output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ; - }}}}}}} - } -}; - -template< class OutputView > -struct ViewFill< OutputView , 0 > -{ - typedef typename OutputView::device_type device_type ; - typedef typename OutputView::const_value_type const_value_type ; - typedef typename OutputView::memory_space dst_space ; - - ViewFill( const OutputView & arg_out , const_value_type & arg_in ) - { - DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in , - sizeof(const_value_type) ); - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */ - - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/kokkos/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp deleted file mode 100644 index 71bc244..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp +++ /dev/null @@ -1,414 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_VIEWTILELEFT_HPP -#define KOKKOS_VIEWTILELEFT_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -struct LayoutTileLeftFast ; -struct LayoutTileLeftSlow ; - -template< typename ScalarType , unsigned N0 , unsigned N1 , - class RankDynamic , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ScalarType , ScalarType , - LayoutTileLeft , unsigned_<2> , RankDynamic , - MemorySpace , MemoryTraits > -{ typedef LayoutTileLeftFast type ; }; - -template< typename ScalarType , unsigned N0 , unsigned N1 , - class RankDynamic , class MemorySpace , class MemoryTraits > -struct ViewSpecialize< ScalarType , ScalarType , - LayoutTileLeft , unsigned_<2> , RankDynamic , - MemorySpace , MemoryTraits > -{ typedef LayoutTileLeftSlow type ; }; - -//---------------------------------------------------------------------------- - -template<> -struct ViewAssignment< LayoutTileLeftFast , void , void > -{ -private: - - template< class DT , class DL , class DD , class DM > - inline - void allocate( View & dst , const std::string label ) - { - typedef View DstViewType ; - typedef typename DstViewType::memory_space memory_space ; - - ViewTracking< DstViewType >::decrement( dst.m_ptr_on_device ); - - dst.m_ptr_on_device = (typename DstViewType::value_type *) - memory_space::allocate( label , - typeid(typename DstViewType::value_type) , - sizeof(typename DstViewType::value_type) , - dst.capacity() ); - - ViewInitialize< typename DstViewType::device_type > init( dst ); - } - -public: - - template< class DT , class DL , class DD , class DM > - inline - ViewAssignment( View & dst , - const typename enable_if< ViewTraits::is_managed , std::string >::type & label , - const size_t n0 , - const size_t n1 , - const size_t = 0 , - const size_t = 0 , - const size_t = 0 , - const size_t = 0 , - const size_t = 0 , - const size_t = 0 ) - { - typedef View DstViewType ; - - dst.m_shape.N0 = n0 ; - dst.m_shape.N1 = n1 ; - dst.m_tile_N0 = ( n0 + DstViewType::MASK_0 ) >> DstViewType::SHIFT_0 ; - - allocate( dst , label ); - } - - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - ViewAssignment( View & dst , - const View & src , - typename enable_if< - is_same< View , - typename View::HostMirror >::value - >::type * = 0 ) - { - dst.m_shape = src.m_shape ; - dst.m_tile_N0 = src.m_tile_N0 ; - allocate( dst , "mirror" ); - } -}; - -//---------------------------------------------------------------------------- - -template<> -struct ViewAssignment< LayoutTileLeftFast , LayoutTileLeftFast, void > -{ - /** \brief Assign compatible views */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL 
, class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const typename enable_if<( - ViewAssignable< ViewTraits , ViewTraits >::value - )>::type * = 0 ) - { - typedef View DstViewType ; - typedef typename DstViewType::shape_type shape_type ; - typedef typename DstViewType::memory_space memory_space ; - typedef typename DstViewType::memory_traits memory_traits ; - - ViewTracking< DstViewType >::decrement( dst.m_ptr_on_device ); - - shape_type::assign( dst.m_shape, src.m_shape.N0 , src.m_shape.N1 ); - - dst.m_tile_N0 = src.m_tile_N0 ; - dst.m_ptr_on_device = src.m_ptr_on_device ; - - ViewTracking< DstViewType >::increment( dst.m_ptr_on_device ); - } - - //------------------------------------ - /** \brief Deep copy data from compatible value type, layout, rank, and specialization. - * Check the dimensions and allocation lengths at runtime. - */ - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - inline static - void deep_copy( const View & dst , - const View & src , - const typename Impl::enable_if<( - Impl::is_same< typename ViewTraits::value_type , - typename ViewTraits::non_const_value_type >::value - && - Impl::is_same< typename ViewTraits::array_layout , - typename ViewTraits::array_layout >::value - && - ( unsigned(ViewTraits::rank) == unsigned(ViewTraits::rank) ) - )>::type * = 0 ) - { - typedef ViewTraits dst_traits ; - typedef ViewTraits src_traits ; - - if ( dst.m_ptr_on_device != src.m_ptr_on_device ) { - - Impl::assert_shapes_are_equal( dst.m_shape , src.m_shape ); - - const size_t n_dst = sizeof(typename dst_traits::scalar_type) * dst.capacity(); - const size_t n_src = sizeof(typename src_traits::scalar_type) * src.capacity(); - - Impl::assert_counts_are_equal( n_dst , n_src ); - - DeepCopy< typename dst_traits::memory_space , - typename src_traits::memory_space >( dst.m_ptr_on_device , src.m_ptr_on_device , n_dst ); - } - } -}; - -//---------------------------------------------------------------------------- - -template<> -struct ViewAssignment< LayoutDefault , LayoutTileLeftFast, void > -{ - /** \brief Extracting a single tile from a tiled view */ - - template< class DT , class DL , class DD , class DM , - class ST , class SL , class SD , class SM > - KOKKOS_INLINE_FUNCTION - ViewAssignment( View & dst , - const View & src , - const unsigned i0 , - const typename enable_if<( - is_same< View , - typename View::tile_type >::value - ), unsigned >::type i1 ) - { - typedef View DstViewType ; - typedef typename DstViewType::shape_type shape_type ; - typedef typename DstViewType::memory_space memory_space ; - typedef typename DstViewType::memory_traits memory_traits ; - - ViewTracking< DstViewType >::decrement( dst.m_ptr_on_device ); - - enum { N0 = SL::N0 }; - enum { N1 = SL::N1 }; - enum { SHIFT_0 = power_of_two::value }; - enum { MASK_0 = N0 - 1 }; - enum { SHIFT_1 = power_of_two::value }; - - const unsigned NT0 = ( src.dimension_0() + MASK_0 ) >> SHIFT_0 ; - - dst.m_ptr_on_device = src.m_ptr_on_device + (( i0 + i1 * NT0 ) << ( SHIFT_0 + SHIFT_1 )); - - ViewTracking< DstViewType >::increment( dst.m_ptr_on_device ); - } -}; - -} /* namespace Impl */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template< class DataType , class Arg1Type , class Arg2Type , class Arg3Type > -class View< DataType , Arg1Type , Arg2Type , 
Arg3Type , Impl::LayoutTileLeftFast > - : public ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type > -{ -private: - template< class , class , class > friend struct Impl::ViewAssignment ; - - typedef ViewTraits< DataType , Arg1Type , Arg2Type , Arg3Type > traits ; - - typedef Impl::ViewAssignment alloc ; - - typedef Impl::ViewAssignment assign ; - - typename traits::value_type * m_ptr_on_device ; - typename traits::shape_type m_shape ; - unsigned m_tile_N0 ; - - typedef typename traits::array_layout layout ; - - enum { SHIFT_0 = Impl::power_of_two::value }; - enum { SHIFT_1 = Impl::power_of_two::value }; - enum { MASK_0 = layout::N0 - 1 }; - enum { MASK_1 = layout::N1 - 1 }; - -public: - - typedef Impl::LayoutTileLeftFast specialize ; - - typedef View< typename traits::const_data_type , - typename traits::array_layout , - typename traits::device_type , - typename traits::memory_traits > const_type ; - - typedef View< typename traits::non_const_data_type , - typename traits::array_layout , - typename traits::device_type::host_mirror_device_type , - void > HostMirror ; - - enum { Rank = 2 }; - - KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_shape ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_shape.N0 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_shape.N1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return 1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return 1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return 1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return 1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return 1 ; } - KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return 1 ; } - - KOKKOS_INLINE_FUNCTION - View() : m_ptr_on_device(0) {} - - KOKKOS_INLINE_FUNCTION - ~View() { Impl::ViewTracking< traits >::decrement( m_ptr_on_device ); } - - KOKKOS_INLINE_FUNCTION - View( const View & rhs ) : m_ptr_on_device(0) { (void)assign( *this , rhs ); } - - KOKKOS_INLINE_FUNCTION - View & operator = ( const View & rhs ) { (void)assign( *this , rhs ); return *this ; } - - //------------------------------------ - // Array allocator and member access operator: - - View( const std::string & label , const size_t n0 , const size_t n1 ) - : m_ptr_on_device(0) { (void)alloc( *this , label , n0 , n1 ); } - - template< typename iType0 , typename iType1 > - KOKKOS_INLINE_FUNCTION - typename traits::value_type & operator()( const iType0 & i0 , const iType1 & i1 ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - - // Use care to insert necessary parentheses as the - // shift operators have lower precedence than the arithmatic operators. 
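[Editor's note] As the comment notes, the shift operators bind more loosely than + and *, so the indexing expression that follows is fully parenthesized: a tile offset scaled by the tile size, plus an intra-tile offset. Written out as a standalone sketch for a hypothetical 8x8 tile (the constants and function name are illustrative; the class above derives SHIFT_0/SHIFT_1 and the masks from the LayoutTileLeft template arguments):

```cpp
// Tile-offset arithmetic for power-of-two tiles (illustrative constants).
#include <cstddef>

enum { SHIFT_0 = 3, SHIFT_1 = 3 };                              // 8 x 8 tile
enum { MASK_0 = (1 << SHIFT_0) - 1, MASK_1 = (1 << SHIFT_1) - 1 };

// tiles_n0: number of tiles along dimension 0 (the m_tile_N0 member above).
inline std::size_t tiled_offset(std::size_t tiles_n0, std::size_t i0, std::size_t i1)
{
  const std::size_t tile_index  = (i0 >> SHIFT_0) + tiles_n0 * (i1 >> SHIFT_1);
  const std::size_t tile_size   = std::size_t(1) << (SHIFT_0 + SHIFT_1);
  const std::size_t within_tile = (i0 & MASK_0) + ((i1 & MASK_1) << SHIFT_0);
  return tile_index * tile_size + within_tile;
}
```

The capacity() member above is the same quantity taken over whole tiles: the total number of tiles shifted left by (SHIFT_0 + SHIFT_1).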
- - return m_ptr_on_device[ - // ( ( Tile offset ) * ( Tile size ) ) - + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) - // ( Offset within tile ) - + ( (i0 & MASK_0) + ((i1 & MASK_1)< - KOKKOS_INLINE_FUNCTION - typename traits::value_type & - at( const iType0 & i0 , const iType1 & i1 , const int , const int , - const int , const int , const int , const int ) const - { - KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device ); - KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_shape, i0,i1 ); - - // Use care to insert necessary parentheses as the - // shift operators have lower precedence than the arithmatic operators. - - return m_ptr_on_device[ - // ( ( Tile offset ) * ( Tile size ) ) - + ( ( (i0>>SHIFT_0) + m_tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) - // ( Offset within tile ) - + ( (i0 & MASK_0) + ((i1 & MASK_1)< - tile_type ; - - KOKKOS_INLINE_FUNCTION - typename traits::value_type * ptr_on_device() const { return m_ptr_on_device ; } - - KOKKOS_INLINE_FUNCTION - size_t tiles_in_dimension_0() const { return m_tile_N0 ; } - - KOKKOS_INLINE_FUNCTION - size_t tiles_in_dimension_1() const { return ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ; } - - - template< typename iType > - KOKKOS_INLINE_FUNCTION - size_t global_to_tile_index_0( const iType & global_i0 ) const - { return global_i0 >> SHIFT_0 ; } - - template< typename iType > - KOKKOS_INLINE_FUNCTION - size_t global_to_tile_index_1( const iType & global_i1 ) const - { return global_i1 >> SHIFT_1 ; } - - - template< typename iType > - KOKKOS_INLINE_FUNCTION - size_t global_to_local_tile_index_0( const iType & global_i0 ) const - { return global_i0 & MASK_0 ; } - - template< typename iType > - KOKKOS_INLINE_FUNCTION - size_t global_to_local_tile_index_1( const iType & global_i1 ) const - { return global_i1 & MASK_1 ; } - - - //------------------------------------ - - KOKKOS_INLINE_FUNCTION - typename traits::size_type capacity() const - { - return ( m_tile_N0 * ( ( m_shape.N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 ); - } -}; - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_hwloc.cpp b/kokkos/kokkos/core/src/impl/Kokkos_hwloc.cpp deleted file mode 100644 index 0399f43..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_hwloc.cpp +++ /dev/null @@ -1,897 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#define DEBUG_PRINT 0 - -#include -#include - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -int host_thread_binding( const std::pair team_topo , - std::pair thread_coord[] ) -{ - const std::pair current = hwloc::get_this_thread_coordinate(); - const int thread_count = team_topo.first * team_topo.second ; - - int i = 0 ; - - // Match one of the requests: - for ( i = 0 ; i < thread_count && current != thread_coord[i] ; ++i ); - - if ( thread_count == i ) { - // Match the NUMA request: - for ( i = 0 ; i < thread_count && current.first != thread_coord[i].first ; ++i ); - } - - if ( thread_count == i ) { - // Match any unclaimed request: - for ( i = 0 ; i < thread_count && ~0u == thread_coord[i].first ; ++i ); - } - - if ( i < thread_count ) { - if ( ! hwloc::bind_this_thread( thread_coord[i] ) ) i = thread_count ; - } - - if ( i < thread_count ) { - -#if DEBUG_PRINT - if ( current != thread_coord[i] ) { - std::cout << " host_thread_binding(" - << team_topo.first << "x" << team_topo.second - << ") rebinding from (" - << current.first << "," - << current.second - << ") to (" - << thread_coord[i].first << "," - << thread_coord[i].second - << ")" << std::endl ; - } -#endif - - thread_coord[i].first = ~0u ; - thread_coord[i].second = ~0u ; - } - - return i < thread_count ? 
i : -1 ; -} - - -void host_thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - std::pair thread_coord[] ) -{ - const std::pair base( core_topo.first - core_use.first , - core_topo.second - core_use.second ); - - for ( unsigned thread_rank = 0 , team_rank = 0 ; team_rank < team_topo.first ; ++team_rank ) { - for ( unsigned worker_rank = 0 ; worker_rank < team_topo.second ; ++worker_rank , ++thread_rank ) { - - unsigned team_in_numa_count = 0 ; - unsigned team_in_numa_rank = 0 ; - - { // Distribute teams among NUMA regions: - // team_count = k * bin + ( #NUMA - k ) * ( bin + 1 ) - const unsigned bin = team_topo.first / core_use.first ; - const unsigned bin1 = bin + 1 ; - const unsigned k = core_use.first * bin1 - team_topo.first ; - const unsigned part = k * bin ; - - if ( team_rank < part ) { - thread_coord[ thread_rank ].first = base.first + team_rank / bin ; - team_in_numa_rank = team_rank % bin ; - team_in_numa_count = bin ; - } - else { - thread_coord[ thread_rank ].first = base.first + k + ( team_rank - part ) / bin1 ; - team_in_numa_rank = ( team_rank - part ) % bin1 ; - team_in_numa_count = bin1 ; - } - } - - { // Distribute workers to cores within this NUMA region: - // worker_in_numa_count = k * bin + ( (#CORE/NUMA) - k ) * ( bin + 1 ) - const unsigned worker_in_numa_count = team_in_numa_count * team_topo.second ; - const unsigned worker_in_numa_rank = team_in_numa_rank * team_topo.second + worker_rank ; - - const unsigned bin = worker_in_numa_count / core_use.second ; - const unsigned bin1 = bin + 1 ; - const unsigned k = core_use.second * bin1 - worker_in_numa_count ; - const unsigned part = k * bin ; - - thread_coord[ thread_rank ].second = base.second + - ( ( worker_in_numa_rank < part ) - ? ( worker_in_numa_rank / bin ) - : ( k + ( worker_in_numa_rank - part ) / bin1 ) ); - } - }} - -#if DEBUG_PRINT - - std::cout << "Kokkos::host_thread_mapping (unrotated)" << std::endl ; - - for ( unsigned g = 0 , t = 0 ; g < team_topo.first ; ++g ) { - std::cout << " team[" << g - << "] on numa[" << thread_coord[t].first - << "] cores(" ; - for ( unsigned w = 0 ; w < team_topo.second ; ++w , ++t ) { - std::cout << " " << thread_coord[t].second ; - } - std::cout << " )" << std::endl ; - } - -#endif - -} - -void host_thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - const std::pair master_coord , - std::pair thread_coord[] ) -{ - const unsigned thread_count = team_topo.first * team_topo.second ; - const unsigned core_base = core_topo.second - core_use.second ; - - host_thread_mapping( team_topo , core_use , core_topo , thread_coord ); - - // The master core should be thread #0 so rotate all coordinates accordingly ... - - const std::pair offset - ( ( thread_coord[0].first < master_coord.first ? master_coord.first - thread_coord[0].first : 0 ) , - ( thread_coord[0].second < master_coord.second ? 
master_coord.second - thread_coord[0].second : 0 ) ); - - for ( unsigned i = 0 ; i < thread_count ; ++i ) { - thread_coord[i].first = ( thread_coord[i].first + offset.first ) % core_use.first ; - thread_coord[i].second = core_base + ( thread_coord[i].second + offset.second - core_base ) % core_use.second ; - } - -#if DEBUG_PRINT - - std::cout << "Kokkos::host_thread_mapping (rotated)" << std::endl ; - - for ( unsigned g = 0 , t = 0 ; g < team_topo.first ; ++g ) { - std::cout << " team[" << g - << "] on numa[" << thread_coord[t].first - << "] cores(" ; - for ( unsigned w = 0 ; w < team_topo.second ; ++w , ++t ) { - std::cout << " " << thread_coord[t].second ; - } - std::cout << " )" << std::endl ; - } - -#endif - -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace hwloc { - -std::pair use_core_topology( const unsigned thread_count ) -{ - const unsigned hwloc_numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned hwloc_cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned hwloc_threads_per_core = Kokkos::hwloc::get_available_threads_per_core(); - const unsigned hwloc_capacity = hwloc_numa_count * hwloc_cores_per_numa * hwloc_threads_per_core ; - - if ( hwloc_capacity < thread_count ) { - std::ostringstream msg ; - - msg << "Kokkos::hwloc::use_core_topology FAILED : Requested more cores or threads than HWLOC reports are available " - << " numa_count(" << hwloc_numa_count << ") , cores_per_numa(" << hwloc_cores_per_numa << ")" - << " capacity(" << hwloc_capacity << ")" ; - Kokkos::Impl::throw_runtime_exception( msg.str() ); - } - - const std::pair core_topo( hwloc_numa_count , hwloc_cores_per_numa ); - - // Start by assuming use of all available cores - std::pair use_core_topo = core_topo ; - - if ( thread_count <= ( core_topo.first - 1 ) * core_topo.second ) { - // Can spawn all requested threads on their own core within fewer NUMA regions of cores. - use_core_topo.first = ( thread_count + core_topo.second - 1 ) / core_topo.second ; - } - - if ( thread_count <= core_topo.first * ( core_topo.second - 1 ) ) { - // Can spawn all requested threads on their own core and have excess core. - use_core_topo.second = ( thread_count + core_topo.first - 1 ) / core_topo.first ; - } - - if ( core_topo.first * core_topo.second < thread_count && - thread_count <= core_topo.first * ( core_topo.second - 1 ) * hwloc_threads_per_core ) { - // Will oversubscribe cores and can omit one core - --use_core_topo.second ; - } - - return use_core_topo ; -} - -int thread_binding( const std::pair team_topo , - std::pair thread_coord[] ) -{ - const std::pair current = hwloc::get_this_thread_coordinate(); - const int thread_count = team_topo.first * team_topo.second ; - - int i = 0 ; - - // Match one of the requests: - for ( i = 0 ; i < thread_count && current != thread_coord[i] ; ++i ); - - if ( thread_count == i ) { - // Match the NUMA request: - for ( i = 0 ; i < thread_count && current.first != thread_coord[i].first ; ++i ); - } - - if ( thread_count == i ) { - // Match any unclaimed request: - for ( i = 0 ; i < thread_count && ~0u == thread_coord[i].first ; ++i ); - } - - if ( i < thread_count ) { - if ( ! 
hwloc::bind_this_thread( thread_coord[i] ) ) i = thread_count ; - } - - if ( i < thread_count ) { - -#if DEBUG_PRINT - if ( current != thread_coord[i] ) { - std::cout << " host_thread_binding(" - << team_topo.first << "x" << team_topo.second - << ") rebinding from (" - << current.first << "," - << current.second - << ") to (" - << thread_coord[i].first << "," - << thread_coord[i].second - << ")" << std::endl ; - } -#endif - - thread_coord[i].first = ~0u ; - thread_coord[i].second = ~0u ; - } - - return i < thread_count ? i : -1 ; -} - - -void thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - std::pair thread_coord[] ) -{ - const std::pair base( core_topo.first - core_use.first , - core_topo.second - core_use.second ); - - for ( unsigned thread_rank = 0 , team_rank = 0 ; team_rank < team_topo.first ; ++team_rank ) { - for ( unsigned worker_rank = 0 ; worker_rank < team_topo.second ; ++worker_rank , ++thread_rank ) { - - unsigned team_in_numa_count = 0 ; - unsigned team_in_numa_rank = 0 ; - - { // Distribute teams among NUMA regions: - // team_count = k * bin + ( #NUMA - k ) * ( bin + 1 ) - const unsigned bin = team_topo.first / core_use.first ; - const unsigned bin1 = bin + 1 ; - const unsigned k = core_use.first * bin1 - team_topo.first ; - const unsigned part = k * bin ; - - if ( team_rank < part ) { - thread_coord[ thread_rank ].first = base.first + team_rank / bin ; - team_in_numa_rank = team_rank % bin ; - team_in_numa_count = bin ; - } - else { - thread_coord[ thread_rank ].first = base.first + k + ( team_rank - part ) / bin1 ; - team_in_numa_rank = ( team_rank - part ) % bin1 ; - team_in_numa_count = bin1 ; - } - } - - { // Distribute workers to cores within this NUMA region: - // worker_in_numa_count = k * bin + ( (#CORE/NUMA) - k ) * ( bin + 1 ) - const unsigned worker_in_numa_count = team_in_numa_count * team_topo.second ; - const unsigned worker_in_numa_rank = team_in_numa_rank * team_topo.second + worker_rank ; - - const unsigned bin = worker_in_numa_count / core_use.second ; - const unsigned bin1 = bin + 1 ; - const unsigned k = core_use.second * bin1 - worker_in_numa_count ; - const unsigned part = k * bin ; - - thread_coord[ thread_rank ].second = base.second + - ( ( worker_in_numa_rank < part ) - ? ( worker_in_numa_rank / bin ) - : ( k + ( worker_in_numa_rank - part ) / bin1 ) ); - } - }} - -#if DEBUG_PRINT - - std::cout << "Kokkos::hwloc::thread_mapping (unrotated)" << std::endl ; - - for ( unsigned g = 0 , t = 0 ; g < team_topo.first ; ++g ) { - std::cout << " team[" << g - << "] on numa[" << thread_coord[t].first - << "] cores(" ; - for ( unsigned w = 0 ; w < team_topo.second ; ++w , ++t ) { - std::cout << " " << thread_coord[t].second ; - } - std::cout << " )" << std::endl ; - } - -#endif - -} - -void thread_mapping( const std::pair team_topo , - const std::pair core_use , - const std::pair core_topo , - const std::pair master_coord , - std::pair thread_coord[] ) -{ - const unsigned thread_count = team_topo.first * team_topo.second ; - const unsigned core_base = core_topo.second - core_use.second ; - - thread_mapping( team_topo , core_use , core_topo , thread_coord ); - - // The master core should be thread #0 so rotate all coordinates accordingly ... - - const std::pair offset - ( ( thread_coord[0].first < master_coord.first ? master_coord.first - thread_coord[0].first : 0 ) , - ( thread_coord[0].second < master_coord.second ? 
master_coord.second - thread_coord[0].second : 0 ) ); - - for ( unsigned i = 0 ; i < thread_count ; ++i ) { - thread_coord[i].first = ( thread_coord[i].first + offset.first ) % core_use.first ; - thread_coord[i].second = core_base + ( thread_coord[i].second + offset.second - core_base ) % core_use.second ; - } - -#if DEBUG_PRINT - - std::cout << "Kokkos::hwloc::thread_mapping (rotated)" << std::endl ; - - for ( unsigned g = 0 , t = 0 ; g < team_topo.first ; ++g ) { - std::cout << " team[" << g - << "] on numa[" << thread_coord[t].first - << "] cores(" ; - for ( unsigned w = 0 ; w < team_topo.second ; ++w , ++t ) { - std::cout << " " << thread_coord[t].second ; - } - std::cout << " )" << std::endl ; - } - -#endif - -} - -} /* namespace hwloc */ -} /* namespace Kokkos */ - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#if defined( KOKKOS_HAVE_HWLOC ) - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ -/* Third Party Libraries */ - -/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */ -#include - -#define REQUIRED_HWLOC_API_VERSION 0x000010300 - -#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION -#error "Requires http://www.open-mpi.org/projects/hwloc/ Version 1.3 or greater" -#endif - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace hwloc { -namespace { - -enum { MAX_CORE = 1024 }; - -std::pair s_core_topology(0,0); -unsigned s_core_capacity(0); -hwloc_topology_t s_hwloc_topology(0); -hwloc_bitmap_t s_hwloc_location(0); -hwloc_bitmap_t s_process_binding(0); -hwloc_bitmap_t s_core[ MAX_CORE ]; - -struct Sentinel { - ~Sentinel(); - Sentinel(); -}; - -void sentinel() -{ static Sentinel self ; } - -Sentinel::~Sentinel() -{ - hwloc_topology_destroy( s_hwloc_topology ); - hwloc_bitmap_free( s_process_binding ); - hwloc_bitmap_free( s_hwloc_location ); -} - -Sentinel::Sentinel() -{ - s_core_topology = std::pair(0,0); - s_core_capacity = 0 ; - s_hwloc_topology = 0 ; - s_hwloc_location = 0 ; - s_process_binding = 0 ; - - for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ; - - hwloc_topology_init( & s_hwloc_topology ); - hwloc_topology_load( s_hwloc_topology ); - - s_hwloc_location = hwloc_bitmap_alloc(); - s_process_binding = hwloc_bitmap_alloc(); - - hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); - - // Choose a hwloc object type for the NUMA level, which may not exist. - - hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ; - - { - // Object types to search, in order. - static const hwloc_obj_type_t candidate_root_type[] = - { HWLOC_OBJ_NODE /* NUMA region */ - , HWLOC_OBJ_SOCKET /* hardware socket */ - , HWLOC_OBJ_MACHINE /* local machine */ - }; - - enum { CANDIDATE_ROOT_TYPE_COUNT = - sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) }; - - for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) { - if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) { - root_type = candidate_root_type[k] ; - } - } - } - - // Determine which of these 'root' types are available to this process. - // The process may have been bound (e.g., by MPI) to a subset of these root types. 
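[Editor's note] Both mapping routines above rely on the same even-binning arithmetic: T items are split over B bins so that k bins hold 'bin' items and the remaining B - k bins hold 'bin + 1', i.e. T == k*bin + (B-k)*(bin+1). A minimal sketch of that computation; the function name is illustrative:

```cpp
// Which bin does item r land in when T items are spread over B bins as
// evenly as possible?  Mirrors the bin/bin1/k/part arithmetic used above.
#include <cassert>

inline unsigned which_bin(unsigned r, unsigned T, unsigned B)
{
  assert(B > 0 && r < T);
  const unsigned bin  = T / B;          // smaller bin size (rounded down)
  const unsigned bin1 = bin + 1;
  const unsigned k    = B * bin1 - T;   // number of bins of size 'bin'
  const unsigned part = k * bin;        // items covered by the small bins
  return (r < part) ? (r / bin) : (k + (r - part) / bin1);
}
```

thread_mapping applies this twice: once to spread teams over the NUMA regions in use, then again to spread each team's workers over that region's cores, before rotating the coordinates so the master thread keeps rank 0.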
- // Determine current location of the master (calling) process> - - hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc(); - - hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD ); - - const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type ); - - unsigned root_base = max_root ; - unsigned root_count = 0 ; - unsigned core_per_root = 0 ; - unsigned pu_per_core = 0 ; - bool symmetric = true ; - - for ( unsigned i = 0 ; i < max_root ; ++i ) { - - const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i ); - - if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { - - ++root_count ; - - // Remember which root (NUMA) object the master thread is running on. - // This will be logical NUMA rank #0 for this process. - - if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) { - root_base = i ; - } - - // Count available cores: - - const unsigned max_core = - hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , - root->allowed_cpuset , - HWLOC_OBJ_CORE ); - - unsigned core_count = 0 ; - - for ( unsigned j = 0 ; j < max_core ; ++j ) { - - const hwloc_obj_t core = - hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , - root->allowed_cpuset , - HWLOC_OBJ_CORE , j ); - - // If process' cpuset intersects core's cpuset then process can access this core. - // Must use intersection instead of inclusion because the Intel-Phi - // MPI may bind the process to only one of the core's hyperthreads. - // - // Assumption: if the process can access any hyperthread of the core - // then it has ownership of the entire core. - // This assumes that it would be performance-detrimental - // to spawn more than one MPI process per core and use nested threading. - - if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { - - ++core_count ; - - const unsigned pu_count = - hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , - core->allowed_cpuset , - HWLOC_OBJ_PU ); - - if ( pu_per_core == 0 ) pu_per_core = pu_count ; - - // Enforce symmetry by taking the minimum: - - pu_per_core = std::min( pu_per_core , pu_count ); - - if ( pu_count != pu_per_core ) symmetric = false ; - } - } - - if ( 0 == core_per_root ) core_per_root = core_count ; - - // Enforce symmetry by taking the minimum: - - core_per_root = std::min( core_per_root , core_count ); - - if ( core_count != core_per_root ) symmetric = false ; - } - } - - s_core_topology.first = root_count ; - s_core_topology.second = core_per_root ; - s_core_capacity = pu_per_core ; - - // Fill the 's_core' array for fast mapping from a core coordinate to the - // hwloc cpuset object required for thread location querying and binding. 
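[Editor's note] For reference, a minimal self-contained sketch of the hwloc calls this Sentinel builds on: load the topology, enumerate cores, and bind the calling thread to one of them. Error handling is trimmed, the function name is illustrative, and 'core->cpuset' is used here where the code above reads the hwloc-1.x 'allowed_cpuset' member.

```cpp
// Bind the calling thread to core number 'core_rank' (sketch, not Kokkos API).
#include <hwloc.h>

int bind_to_core(unsigned core_rank)
{
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);

  int result = -1;
  const int ncore = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_CORE);

  if ((int) core_rank < ncore) {
    const hwloc_obj_t core = hwloc_get_obj_by_type(topo, HWLOC_OBJ_CORE, core_rank);
    if (core) {
      // Bind only the calling thread, and fail rather than approximate.
      result = hwloc_set_cpubind(topo, core->cpuset,
                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
    }
  }

  // The binding is applied to the thread by the OS, so it survives this.
  hwloc_topology_destroy(topo);
  return result;  // 0 on success, -1 otherwise
}
```

The code above avoids the per-call topology load by caching the topology and the core cpusets in the Sentinel's static state instead.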
- - for ( unsigned i = 0 ; i < max_root ; ++i ) { - - const unsigned root_rank = ( i + root_base ) % max_root ; - - const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank ); - - if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) { - - const unsigned max_core = - hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology , - root->allowed_cpuset , - HWLOC_OBJ_CORE ); - - unsigned core_count = 0 ; - - for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) { - - const hwloc_obj_t core = - hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology , - root->allowed_cpuset , - HWLOC_OBJ_CORE , j ); - - if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) { - - s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ; - - ++core_count ; - } - } - } - } - - hwloc_bitmap_free( proc_cpuset_location ); - - if ( ! symmetric ) { - std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology." - << std::endl ; - } -} - - -inline -void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap ) -{ - s << "{" ; - for ( int i = hwloc_bitmap_first( bitmap ) ; - -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) { - s << " " << i ; - } - s << " }" ; -} - -} // namespace - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -bool available() -{ return true ; } - -unsigned get_available_numa_count() -{ sentinel(); return s_core_topology.first ; } - -unsigned get_available_cores_per_numa() -{ sentinel(); return s_core_topology.second ; } - -unsigned get_available_threads_per_core() -{ sentinel(); return s_core_capacity ; } - - -std::pair -get_core_topology() -{ sentinel(); return s_core_topology ; } - -unsigned -get_core_capacity() -{ sentinel(); return s_core_capacity ; } - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -unsigned bind_this_thread( - const unsigned coordinate_count , - std::pair coordinate[] ) -{ - unsigned i = 0 ; - - try { - const std::pair current = get_this_thread_coordinate(); - - // Match one of the requests: - for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i ); - - if ( coordinate_count == i ) { - // Match the first request (typically NUMA): - for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i ); - } - - if ( coordinate_count == i ) { - // Match any unclaimed request: - for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first ; ++i ); - } - - if ( coordinate_count == i || ! bind_this_thread( coordinate[i] ) ) { - // Failed to bind: - i = ~0u ; - } - - if ( i < coordinate_count ) { - -#if DEBUG_PRINT - if ( current != coordinate[i] ) { - std::cout << " host_thread_binding: rebinding from (" - << current.first << "," - << current.second - << ") to (" - << coordinate[i].first << "," - << coordinate[i].second - << ")" << std::endl ; - } -#endif - - coordinate[i].first = ~0u ; - coordinate[i].second = ~0u ; - } - } - catch( ... 
) { - i = ~0u ; - } - - return i ; -} - - -bool bind_this_thread( const std::pair coord ) -{ - sentinel(); - -#if DEBUG_PRINT - - std::cout << "Kokkos::bind_this_thread() at " ; - - hwloc_get_last_cpu_location( s_hwloc_topology , - s_hwloc_location , HWLOC_CPUBIND_THREAD ); - - print_bitmap( std::cout , s_hwloc_location ); - - std::cout << " to " ; - - print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] ); - - std::cout << std::endl ; - -#endif - - // As safe and fast as possible. - // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'. - return coord.first < s_core_topology.first && - coord.second < s_core_topology.second && - 0 == hwloc_set_cpubind( s_hwloc_topology , - s_core[ coord.second + coord.first * s_core_topology.second ] , - HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); -} - -bool unbind_this_thread() -{ - sentinel(); - -#define HWLOC_DEBUG_PRINT 0 - -#if HWLOC_DEBUG_PRINT - - std::cout << "Kokkos::unbind_this_thread() from " ; - - hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); - - print_bitmap( std::cout , s_hwloc_location ); - -#endif - - const bool result = - s_hwloc_topology && - 0 == hwloc_set_cpubind( s_hwloc_topology , - s_process_binding , - HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT ); - -#if HWLOC_DEBUG_PRINT - - std::cout << " to " ; - - hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD ); - - print_bitmap( std::cout , s_hwloc_location ); - - std::cout << std::endl ; - -#endif - - return result ; - -#undef HWLOC_DEBUG_PRINT - -} - -//---------------------------------------------------------------------------- - -std::pair get_this_thread_coordinate() -{ - sentinel(); - - const unsigned n = s_core_topology.first * s_core_topology.second ; - - std::pair coord(0,0); - - // Using the pre-allocated 's_hwloc_location' to avoid memory - // allocation by this thread. This call is NOT thread-safe. - hwloc_get_last_cpu_location( s_hwloc_topology , - s_hwloc_location , HWLOC_CPUBIND_THREAD ); - - unsigned i = 0 ; - - while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ; - - if ( i < n ) { - coord.first = i / s_core_topology.second ; - coord.second = i % s_core_topology.second ; - } - else { - std::ostringstream msg ; - msg << "Kokkos::get_this_thread_coordinate() FAILED :" ; - - if ( 0 != s_process_binding && 0 != s_hwloc_location ) { - msg << " cpu_location" ; - print_bitmap( msg , s_hwloc_location ); - msg << " is not a member of the process_cpu_set" ; - print_bitmap( msg , s_process_binding ); - } - else { - msg << " not initialized" ; - } - throw std::runtime_error( msg.str() ); - } - return coord ; -} - -//---------------------------------------------------------------------------- - -} /* namespace hwloc */ -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#else /* ! 
defined( KOKKOS_HAVE_HWLOC ) */ - -namespace Kokkos { -namespace hwloc { - -bool available() { return false ; } - -unsigned get_available_numa_count() { return 1 ; } -unsigned get_available_cores_per_numa() { return 1 ; } -unsigned get_available_threads_per_core() { return 1 ; } - -unsigned bind_this_thread( const unsigned , std::pair[] ) -{ return ~0 ; } - -bool bind_this_thread( const std::pair ) -{ return false ; } - -bool unbind_this_thread() -{ return true ; } - -std::pair get_this_thread_coordinate() -{ return std::pair(0,0); } - -std::pair get_core_topology() -{ return std::pair(1,1); } - -unsigned get_core_capacity() -{ return 1 ; } - -} // namespace hwloc -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif - - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_spinwait.cpp b/kokkos/kokkos/core/src/impl/Kokkos_spinwait.cpp deleted file mode 100644 index d9377cc..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_spinwait.cpp +++ /dev/null @@ -1,90 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#if ! 
defined( KOKKOS_DISABLE_ASM ) && \ - ( defined( __GNUC__ ) || \ - defined( __GNUG__ ) || \ - defined( __INTEL_COMPILER__ ) ) - -#ifndef __arm__ -/* Pause instruction to prevent excess processor bus usage */ -#define YIELD asm volatile("pause\n":::"memory") -#else -/* No-operation instruction to idle the thread. */ -#define YIELD asm volatile("nop") -#endif - -#elif ! defined( KOKKOS_HAVE_WINTHREAD ) - -#include - -#define YIELD sched_yield() - -#else - -#include - -#define YIELD Sleep(0) - -#endif - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void spinwait( volatile int & flag , const int value ) -{ - while ( value == flag ) { - YIELD ; - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - diff --git a/kokkos/kokkos/core/src/impl/Kokkos_spinwait.hpp b/kokkos/kokkos/core/src/impl/Kokkos_spinwait.hpp deleted file mode 100644 index f2b42e9..0000000 --- a/kokkos/kokkos/core/src/impl/Kokkos_spinwait.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos: Manycore Performance-Portable Multidimensional Arrays -// Copyright (2012) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -namespace Kokkos { -namespace Impl { - -void spinwait( volatile int & flag , const int value ); - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#undef KOKKOS_YIELD - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ - diff --git a/kokkos/kokkos/linalg/src/Kokkos_CrsMatrix.hpp b/kokkos/kokkos/linalg/src/Kokkos_CrsMatrix.hpp index 9a55fb4..ecf0187 100644 --- a/kokkos/kokkos/linalg/src/Kokkos_CrsMatrix.hpp +++ b/kokkos/kokkos/linalg/src/Kokkos_CrsMatrix.hpp @@ -53,18 +53,9 @@ #include #include -#include -#include -#ifdef KOKKOS_HAVE_CUDA -# include -#endif -#include +#include #include -#include - -#ifndef _OPENMP -#include -#endif // ! _OPENMP +#include #ifdef KOKKOS_USE_CUSPARSE # include @@ -95,19 +86,19 @@ namespace Kokkos { /// /// Here is an example loop over the entries in the row: /// \code -/// typedef typename SparseRowView::scalar_type scalar_type; +/// typedef typename SparseRowView::value_type value_type; /// typedef typename SparseRowView::ordinal_type ordinal_type; /// /// SparseRowView A_i = ...; /// const int numEntries = A_i.length; /// for (int k = 0; k < numEntries; ++k) { -/// scalar_type A_ij = A_i.value (k); +/// value_type A_ij = A_i.value (k); /// ordinal_type j = A_i.colidx (k); /// // ... do something with A_ij and j ... /// } /// \endcode /// -/// MatrixType must provide the \c scalar_type and \c ordinal_type +/// MatrixType must provide the \c value_type and \c ordinal_type /// typedefs. In addition, it must make sense to use SparseRowView to /// view a row of MatrixType. In particular, the values and column /// indices of a row must be accessible using the values @@ -119,13 +110,13 @@ namespace Kokkos { template struct SparseRowView { //! The type of the values in the row. - typedef typename MatrixType::scalar_type scalar_type; + typedef typename MatrixType::value_type value_type; //! The type of the column indices in the row. typedef typename MatrixType::ordinal_type ordinal_type; private: //! Array of values in the row. - scalar_type* values_; + value_type* values_; //! Array of (local) column indices in the row. ordinal_type* colidx_; //! Stride between successive entries in the row. @@ -140,7 +131,7 @@ struct SparseRowView { /// each of the above arrays. /// \param count [in] Number of entries in the row. KOKKOS_INLINE_FUNCTION - SparseRowView (scalar_type* const values, + SparseRowView (value_type* const values, ordinal_type* const colidx, const int stride, const int count) : @@ -159,7 +150,7 @@ struct SparseRowView { /// "Entry i" is not necessarily the entry with column index i, nor /// does i necessarily correspond to the (local) row index. KOKKOS_INLINE_FUNCTION - scalar_type& value (const int& i) const { + value_type& value (const int& i) const { return values_[i*stride_]; } @@ -184,13 +175,13 @@ struct SparseRowView { template struct SparseRowViewConst { //! The type of the values in the row. - typedef const typename MatrixType::nonconst_scalar_type scalar_type; + typedef const typename MatrixType::nonconst_value_type value_type; //! The type of the column indices in the row. typedef const typename MatrixType::nonconst_ordinal_type ordinal_type; private: //! Array of values in the row. - scalar_type* values_; + value_type* values_; //! Array of (local) column indices in the row. ordinal_type* colidx_; //! Stride between successive entries in the row. 
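The scalar_type -> value_type rename above is purely a typedef change; the strided access pattern of SparseRowView / SparseRowViewConst is untouched. As a minimal, self-contained sketch of what value(i) does with the stride and of the usage loop quoted in the class documentation (the struct, member, and function names here are simplified illustrative stand-ins, not the real class):

struct row_view_sketch {
  // Simplified stand-ins for the value_type* / ordinal_type* members.
  const double* values;
  const int*    colidx;
  int           stride;   // 1 when a row's entries are contiguous (CSR-like)
  int           length;   // number of entries in the row

  double value     (int i) const { return values[ i * stride ]; }
  int    column_of (int i) const { return colidx[ i * stride ]; }
};

// Row dot-product with a dense vector x, mirroring the documented loop:
inline double row_dot( const row_view_sketch& row , const double* x )
{
  double sum = 0.0;
  for ( int k = 0 ; k < row.length ; ++k )
    sum += row.value(k) * x[ row.column_of(k) ];
  return sum;
}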
@@ -205,7 +196,7 @@ struct SparseRowViewConst { /// each of the above arrays. /// \param count [in] Number of entries in the row. KOKKOS_INLINE_FUNCTION - SparseRowViewConst (scalar_type* const values, + SparseRowViewConst (value_type* const values, ordinal_type* const colidx, const int stride, const int count) : @@ -224,7 +215,7 @@ struct SparseRowViewConst { /// "Entry i" is not necessarily the entry with column index i, nor /// does i necessarily correspond to the (local) row index. KOKKOS_INLINE_FUNCTION - scalar_type& value (const int& i) const { + value_type& value (const int& i) const { return values_[i*stride_]; } @@ -260,24 +251,12 @@ template HostMirror; @@ -323,11 +302,11 @@ class CrsMatrix { //! Type of the "row map" (which contains the offset for each row's data). typedef typename StaticCrsGraphType::row_map_type row_map_type; //! Kokkos Array type of the entries (values) in the sparse matrix. - typedef Kokkos::View values_type; + typedef Kokkos::View values_type; //! Const version of the type of the entries in the sparse matrix. - typedef typename values_type::const_scalar_type const_scalar_type; + typedef typename values_type::const_value_type const_value_type; //! Nonconst version of the type of the entries in the sparse matrix. - typedef typename values_type::non_const_scalar_type non_const_scalar_type; + typedef typename values_type::non_const_value_type non_const_value_type; #ifdef KOKKOS_USE_CUSPARSE cusparseHandle_t cusparse_handle; @@ -845,87 +824,6 @@ generateHostGraph ( OrdinalType nrows, } -// FIXME (mfh 09 Aug 2013) These "shuffle" operations need to move -// into kokkos/core, because they are fundamental to Kokkos and not -// specific to sparse matrices. -// -// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs -// or other GPUs. We provide a generic definition (which is trivial -// and doesn't do what it claims to do) because we don't actually use -// this function unless we are on a suitable GPU, with a suitable -// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow" -// internal parameter depends both on the Device and the Scalar type, -// and it controls whether shfl_down() gets called.) 
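// A sketch of the warp-level tree reduction that shfl_down exists for
// (assumed usage, not code from this header; the helper name is
// illustrative): when ThreadsPerRow > 1 on a Kepler-or-newer GPU, each
// lane repeatedly adds the partial sum held by the lane 'delta' positions
// further along its width-sized group, halving delta until lane 0 holds
// the group's total.  The trivial generic definition below returns its
// input unchanged, so this path is only exercised where the shuffle
// intrinsic is real.
KOKKOS_INLINE_FUNCTION
double reduce_within_row( double my_partial , const int width )
{
  for ( int delta = width / 2 ; delta > 0 ; delta /= 2 ) {
    my_partial += shfl_down( my_partial , delta , width );
  }
  return my_partial ;  // the row's full sum, held by lane 0 of the group
}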
-template -KOKKOS_INLINE_FUNCTION -Scalar shfl_down(const Scalar &val, const int& delta, const int& width){ - return val; -} - -template<> -KOKKOS_INLINE_FUNCTION -unsigned int shfl_down(const unsigned int &val, const int& delta, const int& width){ -#ifdef __CUDA_ARCH__ - #if (__CUDA_ARCH__ >= 300) - unsigned int tmp1 = val; - int tmp = *reinterpret_cast(&tmp1); - tmp = __shfl_down(tmp,delta,width); - return *reinterpret_cast(&tmp); - #else - return val; - #endif -#else - return val; -#endif -} - -template<> -KOKKOS_INLINE_FUNCTION -int shfl_down(const int &val, const int& delta, const int& width){ -#ifdef __CUDA_ARCH__ - #if (__CUDA_ARCH__ >= 300) - return __shfl_down(val,delta,width); - #else - return val; - #endif -#else - return val; -#endif -} - -template<> -KOKKOS_INLINE_FUNCTION -float shfl_down(const float &val, const int& delta, const int& width){ -#ifdef __CUDA_ARCH__ - #if (__CUDA_ARCH__ >= 300) - return __shfl_down(val,delta,width); - #else - return val; - #endif -#else - return val; -#endif -} - -template<> -KOKKOS_INLINE_FUNCTION -double shfl_down(const double &val, const int& delta, const int& width){ -#ifdef __CUDA_ARCH__ - #if (__CUDA_ARCH__ >= 300) - int lo = __double2loint(val); - int hi = __double2hiint(val); - lo = __shfl_down(lo,delta,width); - hi = __shfl_down(hi,delta,width); - return __hiloint2double(hi,lo); - #else - return val; - #endif -#else - return val; -#endif -} - - template range_values; + typedef typename CrsMatrix::non_const_value_type value_type ; + typedef typename Kokkos::View range_values; CoeffVector1 beta; CoeffVector2 alpha; @@ -954,7 +852,7 @@ struct MV_MultiplyFunctor { void strip_mine (const size_type i, const size_type kk) const { const size_type iRow = i / ThreadsPerRow; const int lane = i % ThreadsPerRow; - scalar_type sum[UNROLL]; + value_type sum[UNROLL]; // FIXME (mfh 29 Sep 2013) These pragmas ("ivdep", "unroll", and // "loop count") should be protected by macros that identify the // compilers which support them. @@ -965,7 +863,7 @@ struct MV_MultiplyFunctor { #pragma unroll for (size_type k = 0 ; k < UNROLL ; ++k) { // NOTE (mfh 09 Aug 2013) This requires that assignment from int - // (in this case, 0) to scalar_type be defined. It's not for + // (in this case, 0) to value_type be defined. It's not for // types like arprec and dd_real. 
// // mfh 29 Sep 2013: On the other hand, arprec and dd_real won't @@ -984,7 +882,7 @@ struct MV_MultiplyFunctor { #pragma loop count (15) #pragma unroll for (size_type iEntry = lane; iEntry < row.length; iEntry += ThreadsPerRow) { - const scalar_type val = row.value(iEntry); + const value_type val = row.value(iEntry); const size_type ind = row.colidx(iEntry); #pragma unroll @@ -999,7 +897,7 @@ struct MV_MultiplyFunctor { #pragma loop count (15) #pragma unroll for(size_type iEntry = lane ; iEntry < row.length ; iEntry+=ThreadsPerRow) { - const scalar_type val = row.value(iEntry); + const value_type val = row.value(iEntry); const size_type ind = row.colidx(iEntry); #pragma unroll @@ -1061,7 +959,7 @@ struct MV_MultiplyFunctor { void strip_mine_1 (const size_type i) const { const size_type iRow = i/ThreadsPerRow; const int lane = i%ThreadsPerRow; - scalar_type sum = 0; + value_type sum = 0; if(doalpha != -1) { const SparseRowView row = m_A.row(iRow); @@ -1216,8 +1114,8 @@ struct MV_MultiplyFunctor { struct MV_MultiplySingleFunctor { typedef typename CrsMatrix::device_type device_type ; typedef typename CrsMatrix::ordinal_type size_type ; - typedef typename CrsMatrix::non_const_scalar_type scalar_type ; - typedef typename Kokkos::View range_values; + typedef typename CrsMatrix::non_const_value_type value_type ; + typedef typename Kokkos::View range_values; CoeffVector1 beta; CoeffVector2 alpha; @@ -1230,7 +1128,7 @@ struct MV_MultiplyFunctor { void operator()(const size_type i) const { const size_type iRow = i/ThreadsPerRow; const int lane = i%ThreadsPerRow; - scalar_type sum = 0; + value_type sum = 0; if (doalpha != -1) { const SparseRowView row = m_A.row(iRow); @@ -1301,33 +1199,33 @@ struct MV_MultiplyFunctor { if (y.dimension_1() != numVecs) { std::ostringstream msg; msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and x do not match\n"; - msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" - << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" - << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" - << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" - << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; + msg << "\t Labels are: y(" << y.label() << ") b(" + << betav.label() << ") a(" + << alphav.label() << ") x(" + << A.values.label() << ") x(" + << x.label() << ")\n"; msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") x(" << x.dimension_0() << "," << x.dimension_1() << ")\n"; Impl::throw_runtime_exception( msg.str() ); } if (numRows > y.dimension_0()) { std::ostringstream msg; msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of y and A do not match\n"; - msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" - << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" - << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" - << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" - << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; + msg << "\t Labels are: y(" << y.label() << ") b(" + << betav.label() << ") a(" + << alphav.label() << ") x(" + << A.values.label() << ") x(" + << x.label() << ")\n"; msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") A(" << A.numCols() << "," << A.numRows() << 
")\n"; Impl::throw_runtime_exception( msg.str() ); } if (numCols > x.dimension_0()) { std::ostringstream msg; msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of x and A do not match\n"; - msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" - << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" - << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" - << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" - << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; + msg << "\t Labels are: y(" << y.label() << ") b(" + << betav.label() << ") a(" + << alphav.label() << ") x(" + << A.values.label() << ") x(" + << x.label() << ")\n"; msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") A(" << A.numCols() << "," << A.numRows() << ")\n"; Impl::throw_runtime_exception( msg.str() ); } @@ -1335,11 +1233,11 @@ struct MV_MultiplyFunctor { if (betav.dimension_0()!=numVecs) { std::ostringstream msg; msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and b do not match\n"; - msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" - << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" - << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" - << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" - << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; + msg << "\t Labels are: y(" << y.label() << ") b(" + << betav.label() << ") a(" + << alphav.label() << ") x(" + << A.values.label() << ") x(" + << x.label() << ")\n"; msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") b(" << betav.dimension_0() << ")\n"; Impl::throw_runtime_exception( msg.str() ); } @@ -1348,11 +1246,11 @@ struct MV_MultiplyFunctor { if(alphav.dimension_0()!=numVecs) { std::ostringstream msg; msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of x and b do not match\n"; - msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" - << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" - << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" - << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" - << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; + msg << "\t Labels are: y(" << y.label() << ") b(" + << betav.label() << ") a(" + << alphav.label() << ") x(" + << A.values.label() << ") x(" + << x.label() << ")\n"; msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") b(" << betav.dimension_0() << ")\n"; Impl::throw_runtime_exception( msg.str() ); } @@ -1420,22 +1318,22 @@ struct MV_MultiplyFunctor { typedef View< typename DomainVector::const_data_type , typename DomainVector::array_layout , typename DomainVector::device_type , - Kokkos::MemoryRandomRead > + Kokkos::MemoryRandomAccess > DomainVectorType; typedef View< typename CoeffVector1::const_data_type , typename CoeffVector1::array_layout , typename CoeffVector1::device_type , - Kokkos::MemoryRandomRead > + Kokkos::MemoryRandomAccess > CoeffVector1Type; typedef View< typename CoeffVector2::const_data_type , typename CoeffVector2::array_layout , typename CoeffVector2::device_type , - Kokkos::MemoryRandomRead > + 
Kokkos::MemoryRandomAccess > CoeffVector2Type; - typedef CrsMatrix::value> op ; + ThreadsPerRow::value> op ; const typename CrsMatrixType::ordinal_type nrow = A.numRows(); op.m_A = A ; op.m_x = x ; @@ -1454,12 +1352,12 @@ struct MV_MultiplyFunctor { op.beta = betav; op.alpha = alphav; op.n = x.dimension(1); - Kokkos::parallel_for(nrow*ThreadsPerRow::value , op); + Kokkos::parallel_for("SPMV n-rhs",nrow*ThreadsPerRow::value , op); #else // NOT KOKKOS_FAST_COMPILE MV_MultiplyFunctor::value> op ; + ThreadsPerRow::value> op ; int numVecs = x.dimension_1(); CoeffVector1 beta = betav; @@ -1468,7 +1366,7 @@ struct MV_MultiplyFunctor { if (doalpha != 2) { alpha = CoeffVector2("CrsMatrix::auto_a", numVecs); typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha); - typename CoeffVector2::scalar_type s_a = (typename CoeffVector2::scalar_type) doalpha; + typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha; for (int i = 0; i < numVecs; ++i) h_a(i) = s_a; @@ -1479,7 +1377,7 @@ struct MV_MultiplyFunctor { if (dobeta != 2) { beta = CoeffVector1("CrsMatrix::auto_b", numVecs); typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta); - typename CoeffVector1::scalar_type s_b = (typename CoeffVector1::scalar_type) dobeta; + typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta; for(int i = 0; i < numVecs; i++) h_b(i) = s_b; @@ -1494,8 +1392,8 @@ struct MV_MultiplyFunctor { op.beta = beta; op.alpha = alpha; op.n = x.dimension_1(); - Kokkos::parallel_for (nrow * ThreadsPerRow::value, op); + Kokkos::parallel_for ("SPMV n-rhs",nrow * ThreadsPerRow::value, op); #endif // KOKKOS_FAST_COMPILE } } @@ -1532,22 +1430,20 @@ struct MV_MultiplyFunctor { typedef View< typename DomainVector::const_data_type , typename DomainVector::array_layout , typename DomainVector::device_type , - Kokkos::MemoryRandomRead > + Kokkos::MemoryRandomAccess > DomainVectorType; typedef View< typename CoeffVector1::const_data_type , typename CoeffVector1::array_layout , - typename CoeffVector1::device_type , - Kokkos::MemoryRandomRead > + typename CoeffVector1::device_type> CoeffVector1Type; typedef View< typename CoeffVector2::const_data_type , typename CoeffVector2::array_layout , - typename CoeffVector2::device_type , - Kokkos::MemoryRandomRead > + typename CoeffVector2::device_type> CoeffVector2Type; - typedef CrsMatrix::value> op ; + ,ThreadsPerRow::value> op ; const typename CrsMatrixType::ordinal_type nrow = A.numRows(); op.m_A = A ; op.m_x = x ; @@ -1567,14 +1463,14 @@ struct MV_MultiplyFunctor { op.beta = betav; op.alpha = alphav; op.n = x.dimension(1); - Kokkos::parallel_for (nrow * ThreadsPerRow::value, op); + Kokkos::parallel_for ("SPMV",nrow * ThreadsPerRow::value, op); #else // NOT KOKKOS_FAST_COMPILE MV_MultiplySingleFunctor::value> op; + ThreadsPerRow::value> op; int numVecs = x.dimension_1(); CoeffVector1 beta = betav; @@ -1583,7 +1479,7 @@ struct MV_MultiplyFunctor { if(doalpha!=2) { alpha = CoeffVector2("CrsMatrix::auto_a", numVecs); typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha); - typename CoeffVector2::scalar_type s_a = (typename CoeffVector2::scalar_type) doalpha; + typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha; for(int i = 0; i < numVecs; i++) h_a(i) = s_a; @@ -1593,7 +1489,7 @@ struct MV_MultiplyFunctor { if(dobeta!=2) { beta = CoeffVector1("CrsMatrix::auto_b", numVecs); typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta); - typename 
CoeffVector1::scalar_type s_b = (typename CoeffVector1::scalar_type) dobeta; + typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta; for(int i = 0; i < numVecs; i++) h_b(i) = s_b; @@ -1607,8 +1503,8 @@ struct MV_MultiplyFunctor { op.beta = beta; op.alpha = alpha; op.n = x.dimension_1(); - Kokkos::parallel_for (nrow * ThreadsPerRow::value, op); + Kokkos::parallel_for ("SPMV",nrow * ThreadsPerRow::value, op); #endif // KOKKOS_FAST_COMPILE } } @@ -1711,7 +1607,7 @@ struct MV_MultiplyFunctor { return; } #endif // KOKKOS_USE_MKL - typedef Kokkos::View aVector; + typedef Kokkos::View aVector; aVector a; return MV_Multiply (a, y, a, A, x, 0, 1); @@ -1720,7 +1616,7 @@ struct MV_MultiplyFunctor { template void MV_Multiply (const RangeVector& y, - typename DomainVector::const_scalar_type s_a, + typename DomainVector::const_value_type s_a, const CrsMatrix& A, const DomainVector& x) { @@ -1734,7 +1630,7 @@ struct MV_MultiplyFunctor { return; } #endif // KOKKOS_USE_MKL - typedef Kokkos::View aVector; + typedef Kokkos::View aVector; aVector a; const int numVecs = x.dimension_1(); @@ -1746,20 +1642,16 @@ struct MV_MultiplyFunctor { if (s_a != 0) { a = aVector("a", numVecs); - typename aVector::HostMirror h_a = Kokkos::create_mirror_view (a); - for (int i = 0; i < numVecs; ++i) { - h_a(i) = s_a; - } - Kokkos::deep_copy(a, h_a); + Kokkos::deep_copy(a, s_a); return MV_Multiply (a, y, a, A, x, 0, 2); } } template void - MV_Multiply (typename RangeVector::const_scalar_type s_b, + MV_Multiply (typename RangeVector::const_value_type s_b, const RangeVector& y, - typename DomainVector::const_scalar_type s_a, + typename DomainVector::const_value_type s_a, const CrsMatrix& A, const DomainVector& x) { @@ -1773,7 +1665,7 @@ struct MV_MultiplyFunctor { return; } #endif // KOKKOS_USE_MKL - typedef Kokkos::View aVector; + typedef Kokkos::View aVector; aVector a; aVector b; int numVecs = x.dimension_1(); diff --git a/kokkos/kokkos/linalg/src/Kokkos_MV.hpp b/kokkos/kokkos/linalg/src/Kokkos_MV.hpp index 7084dbd..9138a20 100644 --- a/kokkos/kokkos/linalg/src/Kokkos_MV.hpp +++ b/kokkos/kokkos/linalg/src/Kokkos_MV.hpp @@ -1,20 +1,8 @@ #ifndef KOKKOS_MULTIVECTOR_H_ #define KOKKOS_MULTIVECTOR_H_ -#include - -#include -#include - -#ifdef KOKKOS_HAVE_OPENMP -#include -#endif -#ifdef KOKKOS_HAVE_CUDA -#include -#endif -#include -#include #include +#include namespace Kokkos { @@ -32,7 +20,7 @@ struct MultiVectorDynamic{ #endif typedef typename Kokkos::View type ; typedef typename Kokkos::View const_type ; - typedef typename Kokkos::View random_read_type ; + typedef typename Kokkos::View random_read_type ; MultiVectorDynamic() {} ~MultiVectorDynamic() {} }; @@ -43,7 +31,7 @@ struct MultiVectorStatic{ typedef typename device::array_layout layout; typedef typename Kokkos::View type ; typedef typename Kokkos::View const_type ; - typedef typename Kokkos::View random_read_type ; + typedef typename Kokkos::View random_read_type ; MultiVectorStatic() {} ~MultiVectorStatic() {} }; @@ -56,7 +44,6 @@ struct MultiVectorStatic{ template struct MV_MulScalarFunctor { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; RVector m_r; @@ -78,7 +65,6 @@ struct MV_MulScalarFunctor template struct MV_MulScalarFunctorSelf { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; XVector m_x; @@ -95,16 +81,16 @@ struct MV_MulScalarFunctorSelf } }; -template -RVector MV_MulScalar( const RVector & r, const typename Kokkos::View & a, 
const XVector & x) +template +RVector MV_MulScalar( const RVector & r, const typename Kokkos::View & a, const XVector & x) { - typedef typename Kokkos::View aVector; + typedef typename Kokkos::View aVector; if(r==x) { MV_MulScalarFunctorSelf op ; op.m_x = x ; op.m_a = a ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalar",x.dimension(0) , op ); return r; } @@ -113,19 +99,19 @@ RVector MV_MulScalar( const RVector & r, const typename Kokkos::View -struct MV_MulScalarFunctor +struct MV_MulScalarFunctor { typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; RVector m_r; typename XVector::const_type m_x ; - typename XVector::scalar_type m_a ; + typename XVector::value_type m_a ; size_type n; MV_MulScalarFunctor() {n=1;} //-------------------------------------------------------------------------- @@ -140,13 +126,13 @@ struct MV_MulScalarFunctor }; template -struct MV_MulScalarFunctorSelf +struct MV_MulScalarFunctorSelf { typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; XVector m_x; - typename XVector::scalar_type m_a ; + typename XVector::value_type m_a ; size_type n; //-------------------------------------------------------------------------- @@ -160,23 +146,23 @@ struct MV_MulScalarFunctorSelf }; template -RVector MV_MulScalar( const RVector & r, const typename XVector::scalar_type &a, const XVector & x) +RVector MV_MulScalar( const RVector & r, const typename XVector::value_type &a, const XVector & x) { if(r==x) { - MV_MulScalarFunctorSelf op ; + MV_MulScalarFunctorSelf op ; op.m_x = x ; op.m_a = a ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalar",x.dimension(0) , op ); return r; } - MV_MulScalarFunctor op ; + MV_MulScalarFunctor op ; op.m_r = r ; op.m_x = x ; op.m_a = a ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalar",x.dimension(0) , op ); return r; } /*------------------------------------------------------------------------------------------ @@ -189,7 +175,6 @@ RVector MV_MulScalar( const RVector & r, const typename XVector::scalar_type &a, template struct MV_AddUnrollFunctor { - typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; RVector m_r ; @@ -257,7 +242,6 @@ for(size_type k=0;k struct MV_AddVectorFunctor { - typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; RVector m_r ; @@ -325,7 +309,7 @@ struct MV_AddVectorFunctor /* Variants of Functors with a and b being scalars. 
*/ template -struct MV_AddUnrollFunctor +struct MV_AddUnrollFunctor { typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; @@ -333,8 +317,8 @@ struct MV_AddUnrollFunctor -struct MV_AddVectorFunctor +struct MV_AddVectorFunctor { typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; @@ -401,8 +385,8 @@ struct MV_AddVectorFunctor", x.dimension(0) , op ); return r; } if(a==1&&b==-1) { @@ -484,7 +468,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<1,-1>", x.dimension(0) , op ); return r; } if(a==-1&&b==1) { @@ -495,7 +479,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<-1,1>", x.dimension(0) , op ); return r; } if(a==-1&&b==-1) { @@ -506,7 +490,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<-1,-1>", x.dimension(0) , op ); return r; } if(a*a!=1&&b==1) { @@ -517,7 +501,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<2,1>", x.dimension(0) , op ); return r; } if(a*a!=1&&b==-1) { @@ -528,7 +512,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<2,-1>", x.dimension(0) , op ); return r; } if(a==1&&b*b!=1) { @@ -539,7 +523,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<1,2>", x.dimension(0) , op ); return r; } if(a==-1&&b*b!=1) { @@ -550,7 +534,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<-1,2>", x.dimension(0) , op ); return r; } MV_AddUnrollFunctor op ; @@ -560,7 +544,7 @@ RVector MV_AddUnroll( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddUnroll<2,2>", x.dimension(0) , op ); return r; } @@ -621,7 +605,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<1,1>", x.dimension(0) , op ); return r; } if(a==1&&b==-1) { @@ -632,7 +616,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<1,-1>", x.dimension(0) , op ); return r; } if(a==-1&&b==1) { @@ -643,7 +627,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - 
Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<-1,1>", x.dimension(0) , op ); return r; } if(a==-1&&b==-1) { @@ -654,7 +638,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<-1,-1>", x.dimension(0) , op ); return r; } if(a*a!=1&&b==1) { @@ -665,7 +649,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<2,1>", x.dimension(0) , op ); return r; } if(a*a!=1&&b==-1) { @@ -676,7 +660,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<2,-1>", x.dimension(0) , op ); return r; } if(a==1&&b*b!=1) { @@ -687,7 +671,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<1,2>", x.dimension(0) , op ); return r; } if(a==-1&&b*b!=1) { @@ -698,7 +682,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<-1,2>", x.dimension(0) , op ); return r; } MV_AddVectorFunctor op ; @@ -708,7 +692,7 @@ RVector MV_AddVector( const RVector & r,const aVector &av,const XVector & x, op.m_a = av ; op.m_b = bv ; op.n = x.dimension(1); - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_AddVector<2,2>", x.dimension(0) , op ); return r; } @@ -723,9 +707,9 @@ RVector MV_Add( const RVector & r,const aVector &av,const XVector & x, return MV_AddVector( r,av,x,bv,y,a,b); if(x.dimension_1()==1) { - typedef View RVector1D; - typedef View XVector1D; - typedef View YVector1D; + typedef View RVector1D; + typedef View XVector1D; + typedef View YVector1D; RVector1D r_1d = Kokkos::subview< RVector1D >( r , ALL(),0 ); XVector1D x_1d = Kokkos::subview< XVector1D >( x , ALL(),0 ); @@ -741,9 +725,9 @@ template RVector MV_Add( const RVector & r, const XVector & x, const YVector & y) { if(x.dimension_1()==1) { - typedef View RVector1D; - typedef View XVector1D; - typedef View YVector1D; + typedef View RVector1D; + typedef View XVector1D; + typedef View YVector1D; RVector1D r_1d = Kokkos::subview< RVector1D >( r , ALL(),0 ); XVector1D x_1d = Kokkos::subview< XVector1D >( x , ALL(),0 ); @@ -752,7 +736,7 @@ RVector MV_Add( const RVector & r, const XVector & x, const YVector & y) V_Add(r_1d,x_1d,y_1d); return r; } else { - typename XVector::scalar_type a = 1.0; + typename XVector::value_type a = 1.0; return MV_Add(r,a,x,a,y,1,1); } } @@ -761,9 +745,9 @@ template RVector MV_Add( const RVector & r, const XVector & x, const bVector & bv, const YVector & y ) { if(x.dimension_1()==1) { - typedef View RVector1D; - typedef View XVector1D; - typedef View YVector1D; + typedef View RVector1D; + typedef View XVector1D; + typedef View YVector1D; RVector1D r_1d = Kokkos::subview< RVector1D >( r , ALL(),0 ); XVector1D x_1d = Kokkos::subview< XVector1D >( x , ALL(),0 ); @@ -779,7 +763,6 @@ RVector MV_Add( const RVector & r, const XVector & x, const bVector & bv, const template struct 
MV_DotProduct_Right_FunctorVector { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; typedef typename XVector::value_type value_type[]; size_type value_count; @@ -826,7 +809,6 @@ struct MV_DotProduct_Right_FunctorVector template struct MV_DotProduct_Right_FunctorUnroll { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; typedef typename XVector::value_type value_type[]; size_type value_count; @@ -875,7 +857,7 @@ rVector MV_Dot(const rVector &r, const XVector & x, const YVector & y, int n = - op.m_y = y; op.value_count = numVecs; - Kokkos::parallel_reduce( n , op, r ); + Kokkos::parallel_reduce("MV_Dot(>16)", n , op, r ); return r; } else @@ -885,7 +867,7 @@ rVector MV_Dot(const rVector &r, const XVector & x, const YVector & y, int n = - op.m_x = x; op.m_y = y; op.value_count = numVecs; - Kokkos::parallel_reduce( n , op, r ); + Kokkos::parallel_reduce("MV_Dot(16)", n , op, r ); break; } case 15: { @@ -1002,12 +984,12 @@ rVector MV_Dot(const rVector &r, const XVector & x, const YVector & y, int n = - break; } case 1: { - typedef View XVector1D; - typedef View YVector1D; + typedef View XVector1D; + typedef View YVector1D; XVector1D x_1d = Kokkos::subview< XVector1D >( x , ALL(),0 ); YVector1D y_1d = Kokkos::subview< YVector1D >( y , ALL(),0 ); - r[0] = V_Dot(x_1d,y_1d,n); + r[0] = V_Dot("V_Dot",x_1d,y_1d,n); break; } } @@ -1021,7 +1003,6 @@ rVector MV_Dot(const rVector &r, const XVector & x, const YVector & y, int n = - template struct V_MulScalarFunctor { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; RVector m_r; @@ -1039,7 +1020,6 @@ struct V_MulScalarFunctor template struct V_MulScalarFunctorSelf { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; XVector m_x; @@ -1053,15 +1033,15 @@ struct V_MulScalarFunctorSelf } }; -template -RVector V_MulScalar( const RVector & r, const typename Kokkos::View & a, const XVector & x) +template +RVector V_MulScalar( const RVector & r, const typename Kokkos::View & a, const XVector & x) { - typedef typename Kokkos::View aVector; + typedef typename Kokkos::View aVector; if(r==x) { V_MulScalarFunctorSelf op ; op.m_x = x ; op.m_a = a ; - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalarSelf", x.dimension(0) , op ); return r; } @@ -1069,19 +1049,18 @@ RVector V_MulScalar( const RVector & r, const typename Kokkos::View -struct V_MulScalarFunctor +struct V_MulScalarFunctor { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; RVector m_r; typename XVector::const_type m_x ; - typename XVector::scalar_type m_a ; + typename XVector::value_type m_a ; //-------------------------------------------------------------------------- KOKKOS_INLINE_FUNCTION @@ -1092,13 +1071,12 @@ struct V_MulScalarFunctor }; template -struct V_MulScalarFunctorSelf +struct V_MulScalarFunctorSelf { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; XVector m_x; - typename XVector::scalar_type m_a ; + typename XVector::value_type m_a ; //-------------------------------------------------------------------------- KOKKOS_INLINE_FUNCTION @@ -1110,38 +1088,37 @@ struct V_MulScalarFunctorSelf template -RVector V_MulScalar( const RVector & r, const typename XVector::scalar_type &a, const XVector & x) +RVector V_MulScalar( const RVector & r, const typename 
XVector::value_type &a, const XVector & x) { if(r==x) { - V_MulScalarFunctorSelf op ; + V_MulScalarFunctorSelf op ; op.m_x = x ; op.m_a = a ; - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalarSelf", x.dimension(0) , op ); return r; } - V_MulScalarFunctor op ; + V_MulScalarFunctor op ; op.m_r = r ; op.m_x = x ; op.m_a = a ; - Kokkos::parallel_for( x.dimension(0) , op ); + Kokkos::parallel_for("MV_MulScalar", x.dimension(0) , op ); return r; } template struct V_AddVectorFunctor { - typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; - typedef typename XVector::scalar_type scalar_type; + typedef typename XVector::value_type value_type; RVector m_r ; typename XVector::const_type m_x ; typename YVector::const_type m_y ; - const scalar_type m_a; - const scalar_type m_b; + const value_type m_a; + const value_type m_b; //-------------------------------------------------------------------------- - V_AddVectorFunctor(const RVector& r, const scalar_type& a,const XVector& x,const scalar_type& b,const YVector& y): + V_AddVectorFunctor(const RVector& r, const value_type& a,const XVector& x,const value_type& b,const YVector& y): m_r(r),m_x(x),m_y(y),m_a(a),m_b(b) { } @@ -1172,14 +1149,13 @@ struct V_AddVectorFunctor template struct V_AddVectorSelfFunctor { - typedef typename RVector::device_type device_type; typedef typename RVector::size_type size_type; - typedef typename XVector::scalar_type scalar_type; + typedef typename XVector::value_type value_type; RVector m_r ; typename XVector::const_type m_x ; - const scalar_type m_a; + const value_type m_a; - V_AddVectorSelfFunctor(const RVector& r, const scalar_type& a,const XVector& x): + V_AddVectorSelfFunctor(const RVector& r, const value_type& a,const XVector& x): m_r(r),m_x(x),m_a(a) { } @@ -1195,26 +1171,26 @@ struct V_AddVectorSelfFunctor } }; template -RVector V_AddVector( const RVector & r,const typename XVector::scalar_type &av,const XVector & x, - const typename XVector::scalar_type &bv, const YVector & y,int n=-1) +RVector V_AddVector( const RVector & r,const typename XVector::value_type &av,const XVector & x, + const typename XVector::value_type &bv, const YVector & y,int n=-1) { if(n == -1) n = x.dimension_0(); if(r.ptr_on_device()==x.ptr_on_device() && doalpha == 1) { V_AddVectorSelfFunctor f(r,bv,y); - parallel_for(n,f); + parallel_for("V_AddVectorSelf",n,f); } else if(r.ptr_on_device()==y.ptr_on_device() && dobeta == 1) { V_AddVectorSelfFunctor f(r,av,x); - parallel_for(n,f); + parallel_for("V_AddVectorSelf",n,f); } else { V_AddVectorFunctor f(r,av,x,bv,y); - parallel_for(n,f); + parallel_for("V_AddVector",n,f); } return r; } template -RVector V_AddVector( const RVector & r,const typename XVector::scalar_type &av,const XVector & x, - const typename YVector::scalar_type &bv, const YVector & y, int n = -1, +RVector V_AddVector( const RVector & r,const typename XVector::value_type &av,const XVector & x, + const typename YVector::value_type &bv, const YVector & y, int n = -1, int a=2,int b=2) { if(a==-1) { @@ -1264,7 +1240,7 @@ RVector V_Add( const RVector & r, const XVector & x, const YVector & y, int n=-1 } template -RVector V_Add( const RVector & r, const XVector & x, const typename XVector::scalar_type & bv, const YVector & y,int n=-1 ) +RVector V_Add( const RVector & r, const XVector & x, const typename XVector::value_type & bv, const YVector & y,int n=-1 ) { int b = 2; //if(bv == 0) b = 0; @@ -1274,7 +1250,7 @@ RVector V_Add( const RVector & r, const XVector & 
x, const typename XVector::sca } template -RVector V_Add( const RVector & r, const typename XVector::scalar_type & av, const XVector & x, const typename XVector::scalar_type & bv, const YVector & y,int n=-1 ) +RVector V_Add( const RVector & r, const typename XVector::value_type & av, const XVector & x, const typename XVector::value_type & bv, const YVector & y,int n=-1 ) { int a = 2; int b = 2; @@ -1291,9 +1267,8 @@ RVector V_Add( const RVector & r, const typename XVector::scalar_type & av, con template struct V_DotFunctor { - typedef typename XVector::device_type device_type; typedef typename XVector::size_type size_type; - typedef typename XVector::non_const_scalar_type value_type; + typedef typename XVector::non_const_value_type value_type; XVector m_x ; YVector m_y ; @@ -1307,28 +1282,15 @@ struct V_DotFunctor { sum+=m_x(i)*m_y(i); } - - KOKKOS_INLINE_FUNCTION - void init( volatile value_type &update) const - { - update = 0; - } - - KOKKOS_INLINE_FUNCTION - void join( volatile value_type &update , - const volatile value_type &source ) const - { - update += source ; - } }; template -typename XVector::scalar_type V_Dot( const XVector & x, const YVector & y, int n = -1) +typename XVector::value_type V_Dot( const XVector & x, const YVector & y, int n = -1) { V_DotFunctor f(x,y); if (n<0) n = x.dimension_0(); - typename XVector::non_const_scalar_type ret_val; - parallel_reduce(n,f,ret_val); + typename XVector::non_const_value_type ret_val; + parallel_reduce("V_Dot",n,f,ret_val); return ret_val; } }//end namespace Kokkos diff --git a/kokkos/src/Hex8_box_utils.hpp b/kokkos/src/Hex8_box_utils.hpp index f7dfb11..aec3c00 100644 --- a/kokkos/src/Hex8_box_utils.hpp +++ b/kokkos/src/Hex8_box_utils.hpp @@ -100,7 +100,7 @@ void get_hex8_node_coords_3d(Scalar x, Scalar y, Scalar z, } template -KOKKOS_INLINE_FUNCTION +inline void get_elem_nodes_and_coords(const simple_mesh_description& mesh, GlobalOrdinal elemID, @@ -161,7 +161,7 @@ get_elem_nodes_and_coords(const simple_mesh_description& mesh, } template -KOKKOS_INLINE_FUNCTION +inline void get_elem_nodes_and_coords(const simple_mesh_description& mesh, GlobalOrdinal elemID, diff --git a/kokkos/src/Kokkos_Types.hpp b/kokkos/src/Kokkos_Types.hpp index c7b3f16..232acf1 100644 --- a/kokkos/src/Kokkos_Types.hpp +++ b/kokkos/src/Kokkos_Types.hpp @@ -1,35 +1,9 @@ -#include -#ifdef KOKKOS_HAVE_PTHREAD - #include - typedef Kokkos::Threads host_device_type; - #ifndef KOKKOS_HAVE_CUDA - typedef Kokkos::Threads device_device_type; - #endif -#else - #ifdef KOKKOS_HAVE_OPENMP - #include - typedef Kokkos::OpenMP host_device_type; - #ifndef KOKKOS_HAVE_CUDA - typedef Kokkos::OpenMP device_device_type; - #endif - #else - #ifdef KOKKOS_HAVE_SERIAL - #include - typedef Kokkos::Serial host_device_type; - #ifndef KOKKOS_HAVE_CUDA - typedef Kokkos::Serial device_device_type; - #endif - #else - #error "No Kokkos Host Device defined" - #endif - #endif -#endif -#ifdef KOKKOS_HAVE_CUDA - #include - typedef Kokkos::Cuda device_device_type; -#endif +#include +#include +#include -#include +typedef Kokkos::DefaultHostExecutionSpace host_device_type; +typedef Kokkos::DefaultExecutionSpace device_device_type; typedef int GlobalOrdinal; typedef Kokkos::View v_global_ordinal; diff --git a/kokkos/src/Makefile b/kokkos/src/Makefile index bd28610..fd6e5b4 100644 --- a/kokkos/src/Makefile +++ b/kokkos/src/Makefile @@ -1,21 +1,16 @@ #----------------------------------------------------------------------- SHELL = /bin/sh -MPIPATH = /opt/mpi - - CXX = mpicxx CC = mpicc LINK = mpicxx -#Note: 
when using absolute paths KOKKOSPATH and KOKKOSPATH_INC should be the same -#KOKKOSPATH is using for copying source files into the Object directory -#KOKKOSPATH_INC is used during the actual compilation -KOKKOSPATH = ../kokkos -KOKKOSPATH_INC = ../../kokkos +# Kokkos Settings: -PWD = `pwd` +KOKKOS_DEVICES = OpenMP +KOKKOS_ARCH = SNB +# MiniFE Settings: MINIFE_TYPES = \ -DMINIFE_SCALAR=double \ -DMINIFE_LOCAL_ORDINAL=int \ @@ -23,101 +18,73 @@ MINIFE_TYPES = \ MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX # MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -CPPFLAGS = -O3 -I. -mavx -I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -fPIC -LINKFLAGS = -O3 -mavx - -#Use MPI -CPPFLAGS += -DHAVE_MPI -I$(MPIPATH)/include - -##Enable DEBUG -#CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -#LINKFLAGS += -g -#Enable Single Precision -#CPPFLAGS += -DPRECISION=1 - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp +MINIFE_INFO = 1 +MINIFE_KERNELS = 0 -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) -KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp -#Use OpenMP backend -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -fopenmp -LINKFLAGS += -fopenmp +#PATHS -##Use PThreads Backend -#KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -#KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -#CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -#USRLIB += -lpthread +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) -##Use HWLOC -#HWLOCPATH = ./ -#CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -#USRLIB += -L$(HWLOCPATH)/lib -lhwloc +MINIFE_PATH = ${MAKEFILE_PATH}.. -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) +KOKKOS_PATH ?= ../kokkos +KOKKOS_PATH_INC ?= ${KOKKOS_PATH} -SYSLIB = $(LIBMPI) $(INTELLIB) $(LIBIB) +all:generate_info miniFE.x +CXXFLAGS = -O3 -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. +LDFLAGS = -O3 -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. +MINIFE_INCLUDES = -I./ -I${MINIFE_PATH} -I${MINIFE_PATH}/src -I${MINIFE_PATH}/kokkos/linalg/src +MINIFE_INCLUDES += -I${MINIFE_PATH}/fem -I${MINIFE_PATH}/utils -I${MINIFE_PATH}/common +override CXXFLAGS += -DMPICH_IGNORE_CXX_SEEK -fPIC $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) ${MINIFE_INCLUDES} +override CXXFLAGS += -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -DUSE_MPI_WTIME +#Use MPI +override CXXFLAGS += -DHAVE_MPI -copy: - @if [ ! 
-d Obj ]; then mkdir Obj; fi - @echo '' > Obj/KokkosCore_config.h - @cp -p $(SRC) Obj - @cp -p $(KOKKOS_CPY) Obj - @cp Makefile Obj/Makefile - @cd Obj; ../get_common_files - @cd Obj; $(MAKE) all "OBJ = $(OBJ)" +include $(KOKKOS_PATH)/Makefile.kokkos +#Enable Single Precision +#override CXXFLAGS += -DPRECISION=1 -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o +SRC = $(wildcard $(MINIFE_PATH)/src/*.cpp) +SRC += $(MINIFE_PATH)/common/YAML_Doc.cpp $(MINIFE_PATH)/common/YAML_Element.cpp +SRC += $(wildcard $(MINIFE_PATH)/utils/*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)/src/*.hpp) +HEADERS += $(MINIFE_PATH)/common/YAML_Doc.hpp $(MINIFE_PATH)/common/YAML_Element.hpp +HEADERS += $(wildcard $(MINIFE_PATH)/utils/*.hpp) -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 +vpath %.cpp $(sort $(dir $(SRC))) -vpath %.cpp ../../utils +$(warning $(SRC)) +OBJ = $(notdir $(SRC:.cpp=.o)) +$(warning $(OBJ)) +$(warning $(HEADERS)) +$(warning $(KOKKOS_PATH)) +$(warning $(KOKKOS_CPPFLAGS)) -all:generate_info miniFE.x - -miniFE.x:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.x $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) +generate_info: + @${MINIFE_PATH}/common/generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" + -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" +miniFE.x:$(OBJ) $(KOKKOS_LINK_DEPENDS) generate_info $(HEADERS) + $(INSTRUMENT) $(LINK) $(LDFLAGS) $(KOKKOS_LDFLAGS) $(OBJ) $(KOKKOS_LIBS) -o miniFE.x test: ./run_test x -%.o:%.cpp *.hpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - clean: rm -rf *.o *.a miniFE.x *.linkinfo miniFE_info.hpp Obj realclean: clean rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $< -o $(notdir $@) + \ No newline at end of file diff --git a/kokkos/src/Makefile.cray b/kokkos/src/Makefile.cray deleted file mode 100644 index 3e475f5..0000000 --- a/kokkos/src/Makefile.cray +++ /dev/null @@ -1,143 +0,0 @@ -#----------------------------------------------------------------------- -SHELL = /bin/sh - -MPIPATH = /opt/mpi - - -CXX = `which CC` -CC = `which CC` -LINK = `which CC` - -MPI = yes -OMP = yes - -ifeq ($(SVN), yes) - KOKKOSPATH = /opt/Trilinos/packages/kokkos - KOKKOSPATH_INC = $(KOKKOSPATH) -else - KOKKOSPATH = ../kokkos - KOKKOSPATH_INC = ../../kokkos -endif - -HWLOCPATH = ./ - -PWD = `pwd` - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp - -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) -KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp - -ifeq ($(OMP),yes) -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -else -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -endif - -MINIFE_TYPES = \ - -DMINIFE_SCALAR=double \ - -DMINIFE_LOCAL_ORDINAL=int \ - -DMINIFE_GLOBAL_ORDINAL=int - -MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX -# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) - -CPPFLAGS = -O3 -I. 
-I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -fPIC -LINKFLAGS = -O3 - -ifeq ($(MPI), yes) -CPPFLAGS += -DHAVE_MPI -endif - -ifeq ($(OMP),yes) -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -LINKFLAGS += -else -CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -h nopragma=omp -h noomp -USRLIB += -endif - -ifeq ($(HWLOC),yes) -CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -USRLIB += -L$(HWLOCPATH)/lib -lhwloc -endif - -ifeq ($(ANSI_ALIAS), yes) -CPPFLAGS += -ansi-alias -LINKFLAGS += -ansi-alias -endif - -ifeq ($(DEBUG), yes) -CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -LINKFLAGS += -g -endif - -ifeq ($(LIBRT),yes) -CPPFLAGS += -DKOKKOS_USE_LIBRT -DPREC_TIMER -USRLIB += -lrt -endif - -SYSLIB = $(LIBMPI) $(INTELLIB) $(LIBIB) - - -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. - -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) - -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. - -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o - -copy: - @if [ ! -d Obj_cray ]; then mkdir Obj_cray; fi - @echo '' > Obj_cray/KokkosCore_config.h - @cp -p $(SRC) Obj_cray - @cp -p $(KOKKOS_CPY) Obj_cray - @cp Makefile.cray Obj_cray/Makefile - @cd Obj_cray; ../get_common_files - @cd Obj_cray; $(MAKE) all "OBJ = $(OBJ)" - - - - -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 - -vpath %.cpp ../../utils - -all:generate_info miniFE.cray - -miniFE.cray:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.cray $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) - -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" - -test: - ./run_test cray - -%.o:%.cpp *.hpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -clean: - rm -rf *.o *.a miniFE.cray *.linkinfo miniFE_info.hpp Obj_cray - -realclean: clean - rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* - diff --git a/kokkos/src/Makefile.cuda b/kokkos/src/Makefile.cuda deleted file mode 100644 index be9a303..0000000 --- a/kokkos/src/Makefile.cuda +++ /dev/null @@ -1,176 +0,0 @@ -#----------------------------------------------------------------------- -SHELL = /bin/sh - -MPIPATH = /opt/mpi - -CXX=nvcc -CC=nvcc -LINK = $(MPIPATH)/bin/mpicxx - -CUDA = yes -MPI = yes -CUDA_ARCH = sm_35 - -ifeq ($(SVN), yes) - KOKKOSPATH = /opt/Trilinos/packages/kokkos - KOKKOSPATH_INC = $(KOKKOSPATH) -else - #when taking relative paths the include path must be one level further down - #because it starts off in the Obj directory - KOKKOSPATH = ../kokkos - KOKKOSPATH_INC = ../../kokkos -endif - -HWLOCPATH = ./ -CUDAPATH = /usr/local/cuda - -PWD = `pwd` - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp - -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) -KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp - -ifeq ($(OMP),yes) -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -else -KOKKOS_SRC += $(shell cd 
$(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -endif - -ifeq ($(CUDA),yes) -KOKKOS_CUDASRC += $(shell cd $(KOKKOSPATH)/core/src/Cuda; ls *.cu;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Cuda/*.cu -endif - -MINIFE_TYPES = \ - -DMINIFE_SCALAR=double \ - -DMINIFE_LOCAL_ORDINAL=int \ - -DMINIFE_GLOBAL_ORDINAL=int - -MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX -# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) $(KOKKOS_CUDASRC:.cu=.o) - -CPPFLAGS = -m64 -O3 -I. -I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src/ -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -I$(MPIPATH)/include -arch=$(CUDA_ARCH) -maxrregcount=64 -x cu -Xcompiler -fPIC -restrict -LINKFLAGS = -m64 -O3 -L$(MPIPATH)/lib -L$(INTELPATH)/lib/intel64 -L$(CUDAPATH)/lib64 - -ifeq ($(MPI), yes) -CPPFLAGS += -DHAVE_MPI -I$(MPIPATH)/include -DGPU_MPI -endif - -ifeq ($(CUDA), yes) -CPPFLAGS += -DDEVICE=2 -DKOKKOS_HAVE_CUDA -endif - -ifeq ($(CUSPARSE), yes) -CPPFLAGS += -DKOKKOS_USE_CUSPARSE -USRLIB += -lcusparse -endif - -ifeq ($(CUBLAS), yes) -CPPFLAGS += -DKOKKOS_USE_CUBLAS -USRLIB += -lcublas -endif - -ifeq ($(AVX), yes) -CPPFLAGS += -Xcompiler -mavx -LINKFLAGS += -mavx -endif - -ifeq ($(OMP),yes) -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -Xcompiler -fopenmp -LINKFLAGS += -fopenmp -else -CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -USRLIB += -lpthread -endif - -ifeq ($(HWLOC),yes) -CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -USRLIB += -L$(HWLOCPATH)/lib -lhwloc -endif - -ifeq ($(RED_PREC), yes) -CPPFLAGS += --use_fast_math -endif - -ifeq ($(DEBUG), yes) -CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -LINKFLAGS += -g -endif - -ifeq ($(LIBRT),yes) -CPPFLAGS += -DKOKKOS_USE_LIBRT -DPREC_TIMER -USRLIB += -lrt -endif - -ifeq ($(CUDALDG), yes) -CPPFLAGS += -DKOKKOS_USE_LDG_INTRINSIC -endif - -SYSLIB = -lcuda -lcudart $(LIBMPI) $(INTELLIB) $(LIBIB) - - -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. - -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) - -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. - -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o - -copy: - @if [ ! 
-d Obj_cuda ]; then mkdir Obj_cuda; fi - @echo '' > Obj_cuda/KokkosCore_config.h - @cp -p $(SRC) Obj_cuda - @cp -p $(KOKKOS_CPY) Obj_cuda - @cp Makefile.cuda Obj_cuda/Makefile - @cd Obj_cuda; ../get_common_files - @cd Obj_cuda; $(MAKE) all "OBJ = $(OBJ)" - - - - -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 - -vpath %.cpp ../../utils - -all:generate_info miniFE.cuda - -miniFE.cuda:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.cuda $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) - -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" - -test: - ./run_test cuda -.SUFFIXES: .cu - -%.o:%.cpp *.hpp - $(CXX) $(CUDA_SWITCH) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< -.cu.o: - $(CXX) $(CUDA_SWITCH) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -clean: - rm -rf *.o *.a miniFE.cuda *.linkinfo miniFE_info.hpp Obj_cuda - -realclean: clean - rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* - diff --git a/kokkos/src/Makefile.intel b/kokkos/src/Makefile.intel deleted file mode 100644 index ed83848..0000000 --- a/kokkos/src/Makefile.intel +++ /dev/null @@ -1,176 +0,0 @@ -#----------------------------------------------------------------------- -SHELL = /bin/sh - -MPIPATH = /opt/mpi - - -CXX = mpiicpc -CC = mpiicc -LINK = mpiicpc - -AVX = yes -MPI = yes -OMP = yes -KNC = yes - -ifeq ($(SVN), yes) - KOKKOSPATH = /opt/Trilinos/packages/kokkos - KOKKOSPATH_INC = $(KOKKOSPATH) -else - #when taking relative paths the include path must be one level further down - #because it starts off in the Obj directory - KOKKOSPATH = ../kokkos - KOKKOSPATH_INC = ../../kokkos -endif - -HWLOCPATH = ./ - -PWD = `pwd` - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp - -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) -KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp - -ifeq ($(OMP),yes) -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -else -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -endif - -MINIFE_TYPES = \ - -DMINIFE_SCALAR=double \ - -DMINIFE_LOCAL_ORDINAL=int \ - -DMINIFE_GLOBAL_ORDINAL=int - -MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX -# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) - -CPPFLAGS = -O3 -I. 
-I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -fPIC -restrict -LINKFLAGS = -O3 - -ifeq ($(MPI), yes) -CPPFLAGS += -DHAVE_MPI -endif - -#Check for KNC compile -ifeq ($(KNC), yes) -CPPFLAGS += -mmic -LINKFLAGS += -mmic -override AVX = -endif - -ifeq ($(AVX), yes) -CPPFLAGS += -mavx -LINKFLAGS += -mavx -endif - -ifeq ($(OMP),yes) -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -fopenmp -LINKFLAGS += -fopenmp -else -CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -USRLIB += -lpthread -endif - -ifeq ($(HWLOC),yes) -CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -USRLIB += -L$(HWLOCPATH)/lib -lhwloc -endif - -ifeq ($(ANSI_ALIAS), yes) -CPPFLAGS += -ansi-alias -LINKFLAGS += -ansi-alias -endif - -ifeq ($(DEBUG), yes) -CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -LINKFLAGS += -g -endif - -ifeq ($(RED_PREC), yes) -ifeq ($(KNC), yes) -CPPFLAGS += -fimf-precision=low -fimf-domain-exclusion=15 -LINKFLAGS += -fimf-precision=low -fimf-domain-exclusion=15 -else -CCFLAGS += -mGLOB_default_function_attrs="use_approx_f64_divide=true" -LINKFLAGS += -mGLOB_default_function_attrs="use_approx_f64_divide=true" -endif -endif - -ifeq ($(GSUNROLL), yes) -ifeq ($(KNC), yes) -CPPFLAGS += -mGLOB_default_function_attrs="gather_scatter_loop_unroll=7; use_gather_scatter_hint=on" -LINKFLAGS += -mGLOB_default_function_attrs="gather_scatter_loop_unroll=7; use_gather_scatter_hint=on" -endif -endif - -ifeq ($(LIBRT),yes) -CPPFLAGS += -DKOKKOS_USE_LIBRT -DPREC_TIMER -USRLIB += -lrt -endif - -SYSLIB = $(LIBMPI) $(INTELLIB) $(LIBIB) - - -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. - -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) - -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. - -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o - -copy: - @if [ ! 
-d Obj_intel ]; then mkdir Obj_intel; fi - @echo '' > Obj_intel/KokkosCore_config.h - @cp -p $(SRC) Obj_intel - @cp -p $(KOKKOS_CPY) Obj_intel - @cp Makefile.intel Obj_intel/Makefile - @cd Obj_intel; ../get_common_files - @cd Obj_intel; $(MAKE) all "OBJ = $(OBJ)" - - - - -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 - -vpath %.cpp ../../utils - -all:generate_info miniFE.intel - -miniFE.intel:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.intel $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) - -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" - -test: - ./run_test intel - -%.o:%.cpp *.hpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -clean: - rm -rf *.o *.a miniFE.intel *.linkinfo miniFE_info.hpp Obj_intel - -realclean: clean - rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* - diff --git a/kokkos/src/Makefile.openmpi b/kokkos/src/Makefile.openmpi deleted file mode 100644 index 27b9988..0000000 --- a/kokkos/src/Makefile.openmpi +++ /dev/null @@ -1,151 +0,0 @@ -#----------------------------------------------------------------------- -SHELL = /bin/sh - -MPIPATH = /opt/mpi - - -CXX = mpicxx -CC = mpicc -LINK = mpicxx - -AVX = yes -MPI = yes -OMP = yes - -ifeq ($(SVN), yes) - KOKKOSPATH = /opt/Trilinos/packages/kokkos - KOKKOSPATH_INC = $(KOKKOSPATH) -else - #when taking relative paths the include path must be one level further down - #because it starts off in the Obj directory - KOKKOSPATH = ../kokkos - KOKKOSPATH_INC = ../../kokkos -endif - -HWLOCPATH = ./ - -PWD = `pwd` - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp - -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) -KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp - -ifeq ($(OMP),yes) -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -else -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -endif - -MINIFE_TYPES = \ - -DMINIFE_SCALAR=double \ - -DMINIFE_LOCAL_ORDINAL=int \ - -DMINIFE_GLOBAL_ORDINAL=int - -MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX -# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) - -CPPFLAGS = -O3 -I. 
-I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -fPIC -LINKFLAGS = -O3 - -ifeq ($(MPI), yes) -CPPFLAGS += -DHAVE_MPI -endif - -ifeq ($(AVX), yes) -CPPFLAGS += -mavx -LINKFLAGS += -mavx -endif - -ifeq ($(OMP),yes) -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -fopenmp -LINKFLAGS += -fopenmp -else -CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -USRLIB += -lpthread -endif - -ifeq ($(HWLOC),yes) -CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -USRLIB += -L$(HWLOCPATH)/lib -lhwloc -endif - -ifeq ($(ANSI_ALIAS), yes) -CPPFLAGS += -ansi-alias -LINKFLAGS += -ansi-alias -endif - -ifeq ($(DEBUG), yes) -CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -LINKFLAGS += -g -endif - -ifeq ($(LIBRT),yes) -CPPFLAGS += -DKOKKOS_USE_LIBRT -DPREC_TIMER -USRLIB += -lrt -endif - -SYSLIB = $(LIBMPI) $(INTELLIB) $(LIBIB) - - -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. - -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) - -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. - -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o - -copy: - @if [ ! -d Obj_openmpi ]; then mkdir Obj_openmpi; fi - @echo '' > Obj_openmpi/KokkosCore_config.h - @cp -p $(SRC) Obj_openmpi - @cp -p $(KOKKOS_CPY) Obj_openmpi - @cp Makefile.openmpi Obj_openmpi/Makefile - @cd Obj_openmpi; ../get_common_files - @cd Obj_openmpi; $(MAKE) all "OBJ = $(OBJ)" - - - - -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 - -vpath %.cpp ../../utils - -all:generate_info miniFE.openmpi - -miniFE.openmpi:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.openmpi $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) - -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" - -test: - ./run_test openmpi - -%.o:%.cpp *.hpp - $(CXX) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -clean: - rm -rf *.o *.a miniFE.openmpi *.linkinfo miniFE_info.hpp Obj_openmpi - -realclean: clean - rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* - diff --git a/kokkos/src/Makefile.titan b/kokkos/src/Makefile.titan deleted file mode 100644 index 230b277..0000000 --- a/kokkos/src/Makefile.titan +++ /dev/null @@ -1,180 +0,0 @@ -#----------------------------------------------------------------------- -SHELL = /bin/sh - -MPIPATH = /opt/mpi - -# CXX = nvcc --compiler-bindir `which CC` -# CC = nvcc --compiler-bindir `which CC` -# LINK = nvcc --compiler-bindir `which CC` - -CC_HOST = `which CC` -CXX = nvcc -CC = nvcc -LINK = nvcc - -CUDA = yes -MPI = yes -CUDA_ARCH = sm_35 - -ifeq ($(SVN), yes) - KOKKOSPATH = /opt/Trilinos/kokkos - KOKKOSPATH_INC = $(KOKKOSPATH) -else - #when taking relative paths the include path must be one level further down - #because it starts off in the Obj directory - KOKKOSPATH = ../kokkos - KOKKOSPATH_INC = ../../kokkos -endif - -HWLOCPATH = ./ - -PWD = `pwd` - -SRC = $(shell ls *.cpp;) -CPY = $(PWD)/*.cpp - -KOKKOS_SRC = $(shell cd $(KOKKOSPATH)/core/src/impl; ls *.cpp;) 
-KOKKOS_CPY = $(KOKKOSPATH)/core/src/impl/*.cpp - -ifeq ($(OMP),yes) -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/OpenMP; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/OpenMP/*.cpp -else -KOKKOS_SRC += $(shell cd $(KOKKOSPATH)/core/src/Threads; ls *.cpp;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Threads/*.cpp -endif - -ifeq ($(CUDA),yes) -KOKKOS_CUDASRC += $(shell cd $(KOKKOSPATH)/core/src/Cuda; ls *.cu;) -KOKKOS_CPY += $(KOKKOSPATH)/core/src/Cuda/*.cu -endif - -MINIFE_TYPES = \ - -DMINIFE_SCALAR=double \ - -DMINIFE_LOCAL_ORDINAL=int \ - -DMINIFE_GLOBAL_ORDINAL=int - -MINIFE_MATRIX_TYPE = -DMINIFE_CSR_MATRIX -# MINIFE_MATRIX_TYPE = -DMINIFE_ELL_MATRIX - -#----------------------------------------------------------------------- -OBJ = $(KOKKOS_SRC:.cpp=.o) $(KOKKOS_CUDASRC:.cu=.o) - -CPPFLAGS = --compiler-bindir $(CC_HOST) -m64 -O3 -I. -I../ -I../../utils -I../../fem $(MINIFE_TYPES) $(MINIFE_MATRIX_TYPE) -DMPICH_IGNORE_CXX_SEEK -I$(KOKKOSPATH_INC)/core/src/ -I$(KOKKOSPATH_INC)/containers/src -I$(KOKKOSPATH_INC)/linalg/src -I$(MPIPATH)/include -arch=$(CUDA_ARCH) -maxrregcount=64 -x cu -Xcompiler -fPIC -restrict -LINKFLAGS = --compiler-bindir $(CC_HOST) -m64 -O3 -L$(MPIPATH)/lib -L$(INTELPATH)/lib/intel64 - -ifeq ($(MPI), yes) -CPPFLAGS += -DHAVE_MPI -I$(MPIPATH)/include -DGPU_MPI -endif - -ifeq ($(CUDA), yes) -CPPFLAGS += -DDEVICE=2 -DKOKKOS_HAVE_CUDA -endif - -ifeq ($(CUSPARSE), yes) -CPPFLAGS += -DKOKKOS_USE_CUSPARSE -USRLIB += -lcusparse -endif - -ifeq ($(CUBLAS), yes) -CPPFLAGS += -DKOKKOS_USE_CUBLAS -USRLIB += -lcublas -endif - -ifeq ($(AVX), yes) -CPPFLAGS += -Xcompiler -mavx -LINKFLAGS += -mavx -endif - -ifeq ($(OMP),yes) -CPPFLAGS += -DKOKKOS_HAVE_OPENMP -Xcompiler -fopenmp -LINKFLAGS += -fopenmp -else -CPPFLAGS += -DKOKKOS_HAVE_PTHREAD -USRLIB += -lpthread -endif - -ifeq ($(HWLOC),yes) -CPPFLAGS += -DKOKKOS_HAVE_HWLOC -I$(HWLOCPATH)/include -USRLIB += -L$(HWLOCPATH)/lib -lhwloc -endif - -ifeq ($(RED_PREC), yes) -CPPFLAGS += --use_fast_math -endif - -ifeq ($(DEBUG), yes) -CPPFLAGS += -g -G -DKOKKOSARRAY_EXPRESSION_CHECK -DENABLE_TRACEBACK -LINKFLAGS += -g -endif - -ifeq ($(LIBRT),yes) -CPPFLAGS += -DKOKKOS_USE_LIBRT -DPREC_TIMER -USRLIB += -lrt -endif - -ifeq ($(CUDALDG), yes) -CPPFLAGS += -DKOKKOS_USE_LDG_INTRINSIC -endif - -SYSLIB = -lcuda -lcudart $(LIBMPI) $(INTELLIB) $(LIBIB) - - -# For debugging, the macro MINIFE_DEBUG will cause miniFE to dump a log file -# from each proc containing various information. -# This macro will also enable a somewhat expensive range-check on indices in -# the exchange_externals function. - -LDFLAGS = $(LINKFLAGS) -LIBS= $(USRLIB) $(SYSLIB) - -# The MPICH_IGNORE_CXX_SEEK macro is required for some mpich versions, -# such as the one on my cygwin machine. - -OBJ += BoxPartition.o YAML_Doc.o YAML_Element.o -OBJ += param_utils.o utils.o mytimer.o -OBJ += main.o - -copy: - @if [ ! 
-d Obj_titan ]; then mkdir Obj_titan; fi - @echo '' > Obj_titan/KokkosCore_config.h - @cp -p $(SRC) Obj_titan - @cp -p $(KOKKOS_CPY) Obj_titan - @cp Makefile.titan Obj_titan/Makefile - @cd Obj_titan; ../get_common_files - @cd Obj_titan; $(MAKE) all "OBJ = $(OBJ)" - - - - -MINIFE_INFO = 1 -MINIFE_KERNELS = 0 - -vpath %.cpp ../../utils - -all:generate_info miniFE.titan - -miniFE.titan:$(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) ../*.hpp generate_info - $(INSTRUMENT) $(LINK) $(CXXFLAGS) $(OBJ) $(MAIN_OBJ) $(OBJS) $(UTIL_OBJS) $(OPTIONAL_OBJS) -o ../miniFE.titan $(LDFLAGS) $(OPTIONAL_LIBS) $(LIBS) - -generate_info: - ./generate_info_header "$(CXX)" "$(CXXFLAGS)" "miniFE" "MINIFE" - -test: - ./run_test titan -.SUFFIXES: .cu - -%.o:%.cpp *.hpp - $(CXX) $(CUDA_SWITCH) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< -.cu.o: - $(CXX) $(CUDA_SWITCH) $(CXXFLAGS) $(CPPFLAGS) -DMINIFE_INFO=$(MINIFE_INFO) -DMINIFE_KERNELS=$(MINIFE_KERNELS) -c $< - -%.o:%.c *.h - $(CC) $(CFLAGS) $(CPPFLAGS) -c $< - -clean: - rm -rf *.o *.a *.x *.linkinfo miniFE_info.hpp Obj_titan - -realclean: clean - rm -f gmon.out gprof.* *~ *.yaml *.TVD.* *.mtx* *.vec* minife_debug* - diff --git a/kokkos/src/SparseMatrix_functions.hpp b/kokkos/src/SparseMatrix_functions.hpp index b5ad036..2ccfbdc 100644 --- a/kokkos/src/SparseMatrix_functions.hpp +++ b/kokkos/src/SparseMatrix_functions.hpp @@ -200,7 +200,6 @@ sum_in_symm_elem_matrix(size_t num, //std::cout<0 ? _A.rows[0] : 0; @@ -481,7 +480,7 @@ struct impose_dirichlet_functorB { {} //-------------------------------------------------------------------------- - KOKKOS_INLINE_FUNCTION + inline void operator()( const int i ) const { GlobalOrdinal row = _A.rows[i]; @@ -523,11 +522,11 @@ impose_dirichlet(typename MatrixType::ScalarType prescribed_value, GlobalOrdinal last_local_row = A.rows.size()>0 ? 
A.rows[A.rows.size()-1] : -1; impose_dirichlet_functorA fA(prescribed_value,A,b,bc_rows); - Kokkos::parallel_for(bc_rows.size(),fA); + Kokkos::parallel_for("impose_dirichlet_A",bc_rows.size(),fA); MatrixType::device_type::fence(); impose_dirichlet_functorB fB(prescribed_value,A,b,bc_rows); - Kokkos::parallel_for(A.rows.size(),fB); + Kokkos::parallel_for("impose_dirichlet_B",A.rows.size(),fB); MatrixType::device_type::fence(); } diff --git a/kokkos/src/Vector.hpp b/kokkos/src/Vector.hpp index 5b02637..08d8095 100644 --- a/kokkos/src/Vector.hpp +++ b/kokkos/src/Vector.hpp @@ -64,7 +64,7 @@ struct Vector { GlobalOrdinal startIndex; LocalOrdinal local_size; - Kokkos::vector coefs; + Kokkos::vector coefs; }; diff --git a/kokkos/src/Vector_functions.hpp b/kokkos/src/Vector_functions.hpp index 398753b..69de148 100644 --- a/kokkos/src/Vector_functions.hpp +++ b/kokkos/src/Vector_functions.hpp @@ -95,7 +95,7 @@ void sum_into_vector(size_t num_indices, GlobalOrdinal first = vec.startIndex; GlobalOrdinal last = first + vec.local_size - 1; - Kokkos::vector& vec_coefs = vec.coefs; + Kokkos::vector& vec_coefs = vec.coefs; for(size_t i=0; i last) continue; diff --git a/kokkos/src/exchange_externals.hpp b/kokkos/src/exchange_externals.hpp index e692a85..42ddf0d 100644 --- a/kokkos/src/exchange_externals.hpp +++ b/kokkos/src/exchange_externals.hpp @@ -47,9 +47,9 @@ template _x; - Kokkos::vector _elements_to_send; - Kokkos::vector _send_buffer; + Kokkos::vector _x; + Kokkos::vector _elements_to_send; + Kokkos::vector _send_buffer; exchange_externals_functor(const MatrixType& A, const VectorType& x):_x(x.coefs),_elements_to_send(A.elements_to_send),_send_buffer(A.send_buffer) { @@ -90,12 +90,12 @@ exchange_externals(MatrixType& A, int local_nrow = A.rows.size(); int num_neighbors = A.neighbors.size(); - const Kokkos::vector& recv_length = A.recv_length; - const Kokkos::vector& send_length = A.send_length; - const Kokkos::vector& neighbors = A.neighbors; - const Kokkos::vector& elements_to_send = A.elements_to_send; + const Kokkos::vector& recv_length = A.recv_length; + const Kokkos::vector& send_length = A.send_length; + const Kokkos::vector& neighbors = A.neighbors; + const Kokkos::vector& elements_to_send = A.elements_to_send; - Kokkos::vector& send_buffer = A.send_buffer; + Kokkos::vector& send_buffer = A.send_buffer; // // first post receives, these are immediate receives @@ -111,7 +111,7 @@ exchange_externals(MatrixType& A, // Externals are at end of locals // - Kokkos::vector& x_coefs = x.coefs; + Kokkos::vector& x_coefs = x.coefs; #ifndef GPU_MPI Scalar* x_external = x_coefs.h_view.ptr_on_device() + local_nrow; #else @@ -150,7 +150,7 @@ exchange_externals(MatrixType& A, send_buffer[i] = x.coefs[elements_to_send[i]]; }*/ exchange_externals_functor f(A,x); - Kokkos::parallel_for(total_to_be_sent,f); + Kokkos::parallel_for("exchange_externals",total_to_be_sent,f); // // Send to each neighbor @@ -222,12 +222,12 @@ begin_exchange_externals(MatrixType& A, int local_nrow = A.rows.size(); int num_neighbors = A.neighbors.size(); - const Kokkos::vector& recv_length = A.recv_length; - const Kokkos::vector& send_length = A.send_length; - const Kokkos::vector& neighbors = A.neighbors; - const Kokkos::vector& elements_to_send = A.elements_to_send; + const Kokkos::vector& recv_length = A.recv_length; + const Kokkos::vector& send_length = A.send_length; + const Kokkos::vector& neighbors = A.neighbors; + const Kokkos::vector& elements_to_send = A.elements_to_send; - Kokkos::vector send_buffer(elements_to_send.size(), 
0); + Kokkos::vector send_buffer(elements_to_send.size(), 0); // // first post receives, these are immediate receives @@ -243,7 +243,7 @@ begin_exchange_externals(MatrixType& A, // Externals are at end of locals // - Kokkos::vector& x_coefs = x.coefs; + Kokkos::vector& x_coefs = x.coefs; Scalar* x_external = &(x_coefs[local_nrow]); MPI_Datatype mpi_dtype = TypeTraits::mpi_type(); diff --git a/kokkos/src/generate_matrix_structure.hpp b/kokkos/src/generate_matrix_structure.hpp index d3eea45..ffef083 100644 --- a/kokkos/src/generate_matrix_structure.hpp +++ b/kokkos/src/generate_matrix_structure.hpp @@ -104,7 +104,7 @@ struct generate_matrix_structure_functor { box_dims[2] = box[2][1] - box[2][0]; } - KOKKOS_INLINE_FUNCTION + inline void operator() (const int &roffset) const{ int iz = roffset/(box_dims[1]*box_dims[0]) + box[2][0]; int iy = (roffset/box_dims[0])%box_dims[1] + box[1][0]; @@ -149,7 +149,7 @@ generate_matrix_structure(const simple_mesh_description functor(mesh,&A); - Kokkos::parallel_for(functor.box_dims[0]*functor.box_dims[1]*functor.box_dims[2],functor); + Kokkos::parallel_for("generate_matrix_structure",functor.box_dims[0]*functor.box_dims[1]*functor.box_dims[2],functor); host_device_type::fence(); for(int i=0;i= params.skip_device) device++; - } - - if((str = getenv("MV2_COMM_WORLD_LOCAL_RANK")) != NULL) { - cudaGetDeviceCount(&dev_count); - local_rank = atoi(str); - device = local_rank % params.num_devices; - - if(device >= params.skip_device) device++; - } - if((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")) != NULL) { - cudaGetDeviceCount(&dev_count); - local_rank = atoi(str); - device = local_rank % params.num_devices; - - if(device >= params.skip_device) device++; - } - - Kokkos::Cuda::host_mirror_device_type::initialize(params.numa,params.numthreads); - Kokkos::Cuda::SelectDevice select_device(device); - Kokkos::Cuda::initialize(select_device); -#endif - int numprocs = 1, myproc = 0; miniFE::initialize_mpi(argc, argv, numprocs, myproc); + + Kokkos::initialize(argc,argv); + if(myproc==0) { std::cout << "MiniFE Mini-App, Kokkos Peer Implementation" << std::endl; } @@ -126,10 +100,6 @@ int main(int argc, char** argv) { //make sure each processor has the same parameters: miniFE::broadcast_parameters(params); -#ifndef KOKKOS_HAVE_CUDA - device_device_type::initialize(params.numa,params.numthreads); -#endif - Box global_box = { 0, params.nx, 0, params.ny, 0, params.nz }; std::vector local_boxes(numprocs); @@ -164,7 +134,7 @@ int main(int argc, char** argv) { YAML_Doc doc("miniFE", MINIFE_VERSION, ".", osstr.str()); if (myproc == 0) { add_params_to_yaml(doc, params); - add_configuration_to_yaml(doc, numprocs, params.numthreads); + add_configuration_to_yaml(doc, numprocs); add_timestring_to_yaml(doc); } @@ -183,10 +153,9 @@ int main(int argc, char** argv) { doc.add("Total Program Time",total_time); doc.generateYAML(); } -#ifdef KOKKOS_HAVE_CUDA - host_device_type::finalize(); -#endif - device_device_type::finalize(); + + Kokkos::finalize(); + miniFE::finalize_mpi(); return return_code; @@ -210,7 +179,7 @@ void add_params_to_yaml(YAML_Doc& doc, miniFE::Parameters& params) } } -void add_configuration_to_yaml(YAML_Doc& doc, int numprocs, int numthreads) +void add_configuration_to_yaml(YAML_Doc& doc, int numprocs) { doc.get("Global Run Parameters")->add("number of processors", numprocs); diff --git a/kokkos/src/make_local_matrix.hpp b/kokkos/src/make_local_matrix.hpp index efb99a8..5d91b3b 100644 --- a/kokkos/src/make_local_matrix.hpp +++ b/kokkos/src/make_local_matrix.hpp @@ -80,7 
+80,7 @@ make_local_matrix(MatrixType& A) /////////////////////////////////////////// // Scan the indices and transform to local /////////////////////////////////////////// - Kokkos::vector& external_index = A.external_index; + Kokkos::vector& external_index = A.external_index; for(size_t i=0; i tmp_buffer(numprocs, 0); // Temp buffer space needed below + Kokkos::vector tmp_buffer(numprocs, 0); // Temp buffer space needed below // Build list of global index offset - Kokkos::vector global_index_offsets(numprocs, 0); + Kokkos::vector global_index_offsets(numprocs, 0); tmp_buffer[myproc] = start_row; // This is my start row @@ -127,7 +127,7 @@ make_local_matrix(MatrixType& A) MPI_SUM, MPI_COMM_WORLD); // Go through list of externals and find the processor that owns each - Kokkos::vector external_processor(num_external); + Kokkos::vector external_processor(num_external); for(LocalOrdinal i=0; i& external_local_index = A.external_local_index; + Kokkos::vector& external_local_index = A.external_local_index; external_local_index.on_host(); external_local_index.assign(num_external, -1); @@ -177,7 +177,7 @@ make_local_matrix(MatrixType& A) } } - Kokkos::vector new_external_processor(num_external, 0); + Kokkos::vector new_external_processor(num_external, 0); for(int i=0; i tmp_neighbors(numprocs, 0); + Kokkos::vector tmp_neighbors(numprocs, 0); int num_recv_neighbors = 0; int length = 1; @@ -231,7 +231,7 @@ make_local_matrix(MatrixType& A) /// /////////////////////////////////////////////////////////////////////// - Kokkos::vector recv_list; + Kokkos::vector recv_list; recv_list.push_back(new_external_processor[0]); for(LocalOrdinal i=1; i send_list(num_send_neighbors, 0); + Kokkos::vector send_list(num_send_neighbors, 0); // // first post receives, these are immediate receives @@ -308,7 +308,7 @@ make_local_matrix(MatrixType& A) // order given by 'external_local_index' // - Kokkos::vector new_external(num_external); + Kokkos::vector new_external(num_external); for(LocalOrdinal i=0; i lengths(num_recv_neighbors); + Kokkos::vector lengths(num_recv_neighbors); ++MPI_MY_TAG; @@ -332,9 +332,9 @@ make_local_matrix(MatrixType& A) &request[i]); } - Kokkos::vector& neighbors = A.neighbors; - Kokkos::vector& recv_length = A.recv_length; - Kokkos::vector& send_length = A.send_length; + Kokkos::vector& neighbors = A.neighbors; + Kokkos::vector& recv_length = A.recv_length; + Kokkos::vector& send_length = A.send_length; neighbors.resize(num_recv_neighbors, 0); A.request.resize(num_recv_neighbors); diff --git a/kokkos/src/perform_element_loop.hpp b/kokkos/src/perform_element_loop.hpp index b55b4a7..93662a9 100644 --- a/kokkos/src/perform_element_loop.hpp +++ b/kokkos/src/perform_element_loop.hpp @@ -57,7 +57,7 @@ struct perform_element_loop_functor { // ElemData _elem_data; //-------------------------------------------------------------------------- - KOKKOS_INLINE_FUNCTION + inline void operator()( const int i ) const { ElemData elem_data;// = _elem_data; @@ -116,7 +116,7 @@ perform_element_loop(const simple_mesh_description& mesh, compute_gradient_values(elem_data.grad_vals); struct perform_element_loop_functor f(&A,&b,mesh,h_elemIDs,elem_data); - Kokkos::parallel_for(h_elemIDs.dimension_0(),f); + Kokkos::parallel_for("perform_element_loop",h_elemIDs.dimension_0(),f); device_device_type::fence(); }
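
Note: the recurring pattern in the source hunks above is (1) a string label added as the first argument of Kokkos::parallel_for and (2) plain functors whose operator()(const int) is dispatched over a 1-D index range. The sketch below is a minimal, self-contained illustration of that pattern only; the functor name and the View it touches are hypothetical and not taken from miniFE.

    #include <Kokkos_Core.hpp>

    // Hypothetical functor standing in for the miniFE functors
    // (impose_dirichlet_functorB, generate_matrix_structure_functor, ...):
    // state captured by value, per-index work done in operator().
    struct scale_functor {
      Kokkos::View<double*> x;
      double alpha;

      scale_functor(Kokkos::View<double*> x_, double alpha_)
        : x(x_), alpha(alpha_) {}

      // The patch switches these operators from KOKKOS_INLINE_FUNCTION to plain
      // `inline` because they only run on host backends here; the macro remains
      // the portable spelling when a device backend may be enabled.
      KOKKOS_INLINE_FUNCTION
      void operator()(const int i) const { x(i) *= alpha; }
    };

    void scale(Kokkos::View<double*> x, double alpha) {
      // The label is the new first argument added throughout the patch; it is
      // the name that Kokkos profiling tools report for this kernel.
      Kokkos::parallel_for("scale_vector", x.extent(0), scale_functor(x, alpha));
      Kokkos::fence();  // the patch fences via MatrixType::device_type::fence()
    }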
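
The main.cpp hunk replaces the hand-rolled, backend-specific setup (CUDA device selection from MV2_COMM_WORLD_LOCAL_RANK / OMPI_COMM_WORLD_LOCAL_RANK and explicit per-backend ::initialize / ::finalize calls) with a single Kokkos::initialize / Kokkos::finalize pair bracketed by MPI. A minimal sketch of that ordering follows, assuming nothing about miniFE's own wrappers (initialize_mpi, miniFE_main, ...):

    #include <mpi.h>
    #include <Kokkos_Core.hpp>
    #include <iostream>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);          // miniFE does this via miniFE::initialize_mpi
      Kokkos::initialize(argc, argv);  // one call covers OpenMP, Threads, CUDA, ...
      {
        int rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        if (rank == 0) {
          Kokkos::print_configuration(std::cout);  // reports the active backend
        }
        // ... problem setup, assembly, and solve would go here ...
      }
      Kokkos::finalize();              // finalize Kokkos before MPI, as the patch does
      MPI_Finalize();
      return 0;
    }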