diff --git a/regtest/basic/rt42c-cuda/plumed.dat b/regtest/basic/rt42c-cuda/plumed.dat deleted file mode 100644 index eff3b9d96e..0000000000 --- a/regtest/basic/rt42c-cuda/plumed.dat +++ /dev/null @@ -1,10 +0,0 @@ -LOAD FILE=../../../../src/cudaCoord/Coordination.so -gpu: CUDACOORDINATION GROUPA=1-108 R_0=1 -cpu: COORDINATION GROUPA=1-108 R_0=1 NOPBC - -diff: CUSTOM ARG=gpu,cpu FUNC=y-x PERIODIC=NO - -PRINT ARG=diff FILE=colvar FMT=%8.4f STRIDE=1 - -DUMPDERIVATIVES ARG=gpu,cpu FILE=deriv FMT=%8.4f -FLUSH STRIDE=1 diff --git a/regtest/basic/rt42c-cuda/Makefile b/regtest/basic/rtCUDACOORDINATION-nopbc/Makefile similarity index 100% rename from regtest/basic/rt42c-cuda/Makefile rename to regtest/basic/rtCUDACOORDINATION-nopbc/Makefile diff --git a/regtest/basic/rt42c-cuda/colvar.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/colvar.reference similarity index 100% rename from regtest/basic/rt42c-cuda/colvar.reference rename to regtest/basic/rtCUDACOORDINATION-nopbc/colvar.reference diff --git a/regtest/basic/rt42c-cuda/config b/regtest/basic/rtCUDACOORDINATION-nopbc/config similarity index 100% rename from regtest/basic/rt42c-cuda/config rename to regtest/basic/rtCUDACOORDINATION-nopbc/config diff --git a/regtest/basic/rt42c-cuda/deriv.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/deriv.reference similarity index 100% rename from regtest/basic/rt42c-cuda/deriv.reference rename to regtest/basic/rtCUDACOORDINATION-nopbc/deriv.reference diff --git a/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat b/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat new file mode 100644 index 0000000000..f69a00cbf3 --- /dev/null +++ b/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat @@ -0,0 +1,19 @@ +LOAD FILE=../../../../src/cudaCoord/Coordination.so +gpu: CUDACOORDINATION GROUPA=1-108 R_0=1 NOPBC +gpu512: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=512 NOPBC +gpu256: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=256 NOPBC +gpu128: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=128 NOPBC +gpu64: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=64 NOPBC +cpu: COORDINATION GROUPA=1-108 R_0=1 NOPBC + +diff: CUSTOM ARG=gpu,cpu FUNC=y-x PERIODIC=NO +diff512: CUSTOM ARG=gpu512,cpu FUNC=y-x PERIODIC=NO +diff256: CUSTOM ARG=gpu256,cpu FUNC=y-x PERIODIC=NO +diff128: CUSTOM ARG=gpu128,cpu FUNC=y-x PERIODIC=NO +diff64: CUSTOM ARG=gpu64,cpu FUNC=y-x PERIODIC=NO + +PRINT ARG=diff FILE=colvar FMT=%8.4f STRIDE=1 +PRINT ARG=diff512,diff256,diff128,diff64 FILE=threadsDifferences FMT=%8.4f STRIDE=1 + +DUMPDERIVATIVES ARG=gpu,cpu FILE=deriv FMT=%8.4f +FLUSH STRIDE=1 diff --git a/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference new file mode 100644 index 0000000000..b533fd5def --- /dev/null +++ b/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference @@ -0,0 +1,109 @@ +#! 
FIELDS time diff512 diff256 diff128 diff64 + 0.000000 0.0000 0.0000 0.0000 0.0000 + 1.000000 0.0000 0.0000 0.0000 0.0000 + 2.000000 0.0000 0.0000 0.0000 0.0000 + 3.000000 0.0000 0.0000 0.0000 0.0000 + 4.000000 0.0000 0.0000 0.0000 0.0000 + 5.000000 0.0000 0.0000 0.0000 0.0000 + 6.000000 0.0000 0.0000 0.0000 0.0000 + 7.000000 0.0000 0.0000 0.0000 0.0000 + 8.000000 0.0000 0.0000 0.0000 0.0000 + 9.000000 0.0000 0.0000 0.0000 0.0000 + 10.000000 0.0000 0.0000 0.0000 0.0000 + 11.000000 0.0000 0.0000 0.0000 0.0000 + 12.000000 0.0000 0.0000 0.0000 0.0000 + 13.000000 0.0000 0.0000 0.0000 0.0000 + 14.000000 0.0000 0.0000 0.0000 0.0000 + 15.000000 0.0000 0.0000 0.0000 0.0000 + 16.000000 0.0000 0.0000 0.0000 0.0000 + 17.000000 0.0000 0.0000 0.0000 0.0000 + 18.000000 0.0000 0.0000 0.0000 0.0000 + 19.000000 0.0000 0.0000 0.0000 0.0000 + 20.000000 0.0000 0.0000 0.0000 0.0000 + 21.000000 0.0000 0.0000 0.0000 0.0000 + 22.000000 0.0000 0.0000 0.0000 0.0000 + 23.000000 0.0000 0.0000 0.0000 0.0000 + 24.000000 0.0000 0.0000 0.0000 0.0000 + 25.000000 0.0000 0.0000 0.0000 0.0000 + 26.000000 0.0000 0.0000 0.0000 0.0000 + 27.000000 0.0000 0.0000 0.0000 0.0000 + 28.000000 0.0000 0.0000 0.0000 0.0000 + 29.000000 0.0000 0.0000 0.0000 0.0000 + 30.000000 0.0000 0.0000 0.0000 0.0000 + 31.000000 0.0000 0.0000 0.0000 0.0000 + 32.000000 0.0000 0.0000 0.0000 0.0000 + 33.000000 0.0000 0.0000 0.0000 0.0000 + 34.000000 0.0000 0.0000 0.0000 0.0000 + 35.000000 0.0000 0.0000 0.0000 0.0000 + 36.000000 0.0000 0.0000 0.0000 0.0000 + 37.000000 0.0000 0.0000 0.0000 0.0000 + 38.000000 0.0000 0.0000 0.0000 0.0000 + 39.000000 0.0000 0.0000 0.0000 0.0000 + 40.000000 0.0000 0.0000 0.0000 0.0000 + 41.000000 0.0000 0.0000 0.0000 0.0000 + 42.000000 0.0000 0.0000 0.0000 0.0000 + 43.000000 0.0000 0.0000 0.0000 0.0000 + 44.000000 0.0000 0.0000 0.0000 0.0000 + 45.000000 0.0000 0.0000 0.0000 0.0000 + 46.000000 0.0000 0.0000 0.0000 0.0000 + 47.000000 0.0000 0.0000 0.0000 0.0000 + 48.000000 0.0000 0.0000 0.0000 0.0000 + 49.000000 0.0000 0.0000 0.0000 0.0000 + 50.000000 0.0000 0.0000 0.0000 0.0000 + 51.000000 0.0000 0.0000 0.0000 0.0000 + 52.000000 0.0000 0.0000 0.0000 0.0000 + 53.000000 0.0000 0.0000 0.0000 0.0000 + 54.000000 0.0000 0.0000 0.0000 0.0000 + 55.000000 0.0000 0.0000 0.0000 0.0000 + 56.000000 0.0000 0.0000 0.0000 0.0000 + 57.000000 0.0000 0.0000 0.0000 0.0000 + 58.000000 0.0000 0.0000 0.0000 0.0000 + 59.000000 0.0000 0.0000 0.0000 0.0000 + 60.000000 0.0000 0.0000 0.0000 0.0000 + 61.000000 0.0000 0.0000 0.0000 0.0000 + 62.000000 0.0000 0.0000 0.0000 0.0000 + 63.000000 0.0000 0.0000 0.0000 0.0000 + 64.000000 0.0000 0.0000 0.0000 0.0000 + 65.000000 0.0000 0.0000 0.0000 0.0000 + 66.000000 0.0000 0.0000 0.0000 0.0000 + 67.000000 0.0000 0.0000 0.0000 0.0000 + 68.000000 0.0000 0.0000 0.0000 0.0000 + 69.000000 0.0000 0.0000 0.0000 0.0000 + 70.000000 0.0000 0.0000 0.0000 0.0000 + 71.000000 0.0000 0.0000 0.0000 0.0000 + 72.000000 0.0000 0.0000 0.0000 0.0000 + 73.000000 0.0000 0.0000 0.0000 0.0000 + 74.000000 0.0000 0.0000 0.0000 0.0000 + 75.000000 0.0000 0.0000 0.0000 0.0000 + 76.000000 0.0000 0.0000 0.0000 0.0000 + 77.000000 0.0000 0.0000 0.0000 0.0000 + 78.000000 0.0000 0.0000 0.0000 0.0000 + 79.000000 0.0000 0.0000 0.0000 0.0000 + 80.000000 0.0000 0.0000 0.0000 0.0000 + 81.000000 0.0000 0.0000 0.0000 0.0000 + 82.000000 0.0000 0.0000 0.0000 0.0000 + 83.000000 0.0000 0.0000 0.0000 0.0000 + 84.000000 0.0000 0.0000 0.0000 0.0000 + 85.000000 0.0000 0.0000 0.0000 0.0000 + 86.000000 0.0000 0.0000 0.0000 0.0000 + 87.000000 0.0000 0.0000 0.0000 0.0000 + 
88.000000 0.0000 0.0000 0.0000 0.0000 + 89.000000 0.0000 0.0000 0.0000 0.0000 + 90.000000 0.0000 0.0000 0.0000 0.0000 + 91.000000 0.0000 0.0000 0.0000 0.0000 + 92.000000 0.0000 0.0000 0.0000 0.0000 + 93.000000 0.0000 0.0000 0.0000 0.0000 + 94.000000 0.0000 0.0000 0.0000 0.0000 + 95.000000 0.0000 0.0000 0.0000 0.0000 + 96.000000 0.0000 0.0000 0.0000 0.0000 + 97.000000 0.0000 0.0000 0.0000 0.0000 + 98.000000 0.0000 0.0000 0.0000 0.0000 + 99.000000 0.0000 0.0000 0.0000 0.0000 + 100.000000 0.0000 0.0000 0.0000 0.0000 + 101.000000 0.0000 0.0000 0.0000 0.0000 + 102.000000 0.0000 0.0000 0.0000 0.0000 + 103.000000 0.0000 0.0000 0.0000 0.0000 + 104.000000 0.0000 0.0000 0.0000 0.0000 + 105.000000 0.0000 0.0000 0.0000 0.0000 + 106.000000 0.0000 0.0000 0.0000 0.0000 + 107.000000 0.0000 0.0000 0.0000 0.0000 diff --git a/regtest/basic/rt42c-cuda/trajectory.xyz b/regtest/basic/rtCUDACOORDINATION-nopbc/trajectory.xyz similarity index 100% rename from regtest/basic/rt42c-cuda/trajectory.xyz rename to regtest/basic/rtCUDACOORDINATION-nopbc/trajectory.xyz diff --git a/src/cudaCoord/Coordination.cu b/src/cudaCoord/Coordination.cu index acf961dfa7..5726de8dd3 100644 --- a/src/cudaCoord/Coordination.cu +++ b/src/cudaCoord/Coordination.cu @@ -41,8 +41,7 @@ using std::cerr; //#define vdbg(...) namespace PLMD { -namespace colvar { - + namespace colvar { //+PLUMEDOC COLVAR CUDACOORDINATION /* Calculate coordination numbers. Like coordination, but on nvdia gpu and with no swithcing function @@ -130,19 +129,20 @@ class CudaCoordination : public Colvar { CUDAHELPERS::memoryHolder cudaCoordination; CUDAHELPERS::memoryHolder cudaDerivatives; CUDAHELPERS::memoryHolder cudaDerivatives_sparse; - CUDAHELPERS::memoryHolder cudaDerivatives_sparserow; + CUDAHELPERS::memoryHolder cudaDerivatives_sparserows; CUDAHELPERS::memoryHolder cudaDerivatives_sparsecols; CUDAHELPERS::memoryHolder cudaVirial; - CUDAHELPERS::memoryHolder reductionMemoryDerivatives; + CUDAHELPERS::memoryHolder resultDerivatives; CUDAHELPERS::memoryHolder bufferDerivatives; CUDAHELPERS::memoryHolder reductionMemoryVirial; CUDAHELPERS::memoryHolder reductionMemoryCoord; - CUDAHELPERS::memoryHolder ones; + cusparseHandle_t sparseMDevHandle; + cusparseDnVecDescr_t outDevDescr; + cudaStream_t streamDerivatives; cudaStream_t streamVirial; cudaStream_t streamCoordination; - cusparseDnVecDescr_t outVecDescr; unsigned maxNumThreads=512; SwitchingFunction switchingFunction; rationalSwitchParameters switchingParameters; @@ -166,17 +166,26 @@ PLUMED_REGISTER_ACTION(CudaCoordination,"CUDACOORDINATION") void CudaCoordination::setUpPermanentGPUMemory(){ auto nat = getPositions().size(); //coordinates values are updated at each step - cudaCoords.resize(3*nat); - + if ((3*nat) != cudaCoords.size()){ + cudaCoords.resize(3*nat); + if(resultDerivatives.size()!=0) + cusparseDestroyDnVec(outDevDescr); + resultDerivatives.resize(3*nat); + cusparseCreateDnVec(&outDevDescr, + 3*nat, + resultDerivatives.pointer(), + CUDA_R_64F); + } //the neighbour list will be updated at each request of prepare auto pairList = nl->getClosePairs(); const unsigned nn=nl->size(); cudaPairList.resize(2*nn); - cudaPairList.copyToCuda(pairList.data()); - reductionMemoryDerivatives.resize(3*nat); - cusparseCreateDnVec(&outVecDescr, + //streamDerivatives makes the copy async + cudaPairList.copyToCuda(pairList.data(),streamDerivatives); + resultDerivatives.resize(3*nat); + cusparseCreateDnVec(&outDevDescr, 3*nat, - reductionMemoryDerivatives.pointer(), + resultDerivatives.pointer(), CUDA_R_64F); } @@ -191,21 
+200,28 @@ void CudaCoordination::prepare() { requestAtoms(nl->getReducedAtomList()); setUpPermanentGPUMemory(); invalidateList=false; - if(getExchangeStep()) error("Neighbor lists should be updated on exchange steps - choose a NL_STRIDE which divides the exchange stride!"); + if(getExchangeStep()) + error("Neighbor lists should be updated on exchange steps - choose a " + "NL_STRIDE which divides the exchange stride!"); } - if(getExchangeStep()) firsttime=true; + if(getExchangeStep()) + firsttime=true; } } + void CudaCoordination::registerKeywords( Keywords& keys ) { Colvar::registerKeywords(keys); keys.addFlag("SERIAL",false,"Perform the calculation in serial - for debug purpose"); - keys.addFlag("PAIR",false,"Pair only 1st element of the 1st group with 1st element in the second, etc"); + keys.addFlag("PAIR",false,"Pair only 1st element of the 1st group with 1st " + "element in the second, etc"); keys.addFlag("NLIST",false,"Use a neighbor list to speed up the calculation"); keys.add("optional","NL_CUTOFF","The cutoff for the neighbor list"); - keys.add("optional","NL_STRIDE","The frequency with which we are updating the atoms in the neighbor list"); + keys.add("optional","NL_STRIDE","The frequency with which we are updating the " + "atoms in the neighbor list"); keys.add("optional","THREADS","The upper limit of the number of threads"); keys.add("atoms","GROUPA","First list of atoms"); - keys.add("atoms","GROUPB","Second list of atoms (if empty, N*(N-1)/2 pairs in GROUPA are counted)"); + keys.add("atoms","GROUPB","Second list of atoms (if empty, N*(N-1)/2 pairs in " + "GROUPA are counted)"); keys.add("compulsory","NN","6","The n parameter of the switching function "); keys.add("compulsory","MM","0","The m parameter of the switching function; 0 implies 2*NN"); keys.add("compulsory","D_0","0.0","The d_0 parameter of the switching function"); @@ -215,16 +231,15 @@ void CudaCoordination::registerKeywords( Keywords& keys ) { //these constant will be used within the kernels __constant__ double cu_epsilon; -__device__ double pcuda_fastpow(double base, int expo) { +__device__ double pcuda_fastpow(double base,int expo) { if(expo<0) { expo=-expo; base=1.0/base; } double result = 1.0; while (expo) { - if (expo & 1) { + if (expo & 1) result *= base; - } expo >>= 1; base *= base; } @@ -263,76 +278,109 @@ __global__ void getpcuda_Rational(const double *rdists,const int NN, const int M if(rdists[i]<=0.) 
{ res[i]=1.; dfunc[i]=0.0; - }else{ - res[i]=pcuda_Rational(rdists[i],NN,MM,dfunc[i]); - } - //printf("CUDA: %i :: d=%f -> %f, %f\n", i,rdists[i],res[i],dfunc[i]); + }else + res[i]=pcuda_Rational(rdists[i],NN,MM,dfunc[i]); } - -__global__ void getConst() { - //printf("Cuda: cu_epsilon = %f\n", cu_epsilon); -} +// __global__ void getConst() { +// printf("Cuda: cu_epsilon = %f\n", cu_epsilon); +// } CudaCoordination::CudaCoordination(const ActionOptions&ao): PLUMED_COLVAR_INIT(ao) { parseFlag("SERIAL",serial); - std::vector ga_lista,gb_lista; - parseAtomList("GROUPA",ga_lista); - parseAtomList("GROUPB",gb_lista); + std::vector GroupA,GroupB; + parseAtomList("GROUPA",GroupA); + parseAtomList("GROUPB",GroupB); bool nopbc=!pbc; parseFlag("NOPBC",nopbc); pbc=!nopbc; -// pair stuff + // pair stuff bool dopair=false; parseFlag("PAIR",dopair); -// neighbor list stuff + // neighbor list stuff bool doneigh=false; double nl_cut=0.0; int nl_st=0; parseFlag("NLIST",doneigh); if(doneigh) { parse("NL_CUTOFF",nl_cut); - if(nl_cut<=0.0) error("NL_CUTOFF should be explicitly specified and positive"); + if(nl_cut<=0.0) + error("NL_CUTOFF should be explicitly specified and positive"); parse("NL_STRIDE",nl_st); - if(nl_st<=0) error("NL_STRIDE should be explicitly specified and positive"); + if(nl_st<=0) + error("NL_STRIDE should be explicitly specified and positive"); } parse("THREADS",maxNumThreads); - if(maxNumThreads<=0) error("THREADS should be positive"); - addValueWithDerivatives(); setNotPeriodic(); - if(gb_lista.size()>0) { - if(doneigh) - {nl=Tools::make_unique(ga_lista,gb_lista,serial,dopair,pbc,getPbc(),comm,nl_cut,nl_st);} - else - {nl=Tools::make_unique(ga_lista,gb_lista,serial,dopair,pbc,getPbc(),comm);} + if(maxNumThreads<=0) + error("THREADS should be positive"); + addValueWithDerivatives(); + setNotPeriodic(); + if(GroupB.size()>0) { + if(doneigh){ + nl=Tools::make_unique( + GroupA, + GroupB, + serial, + dopair, + pbc, + getPbc(), + comm, + nl_cut, + nl_st); + } else { + nl=Tools::make_unique( + GroupA, + GroupB, + serial, + dopair, + pbc, + getPbc(), + comm);} } else { - if(doneigh) - {nl=Tools::make_unique(ga_lista,serial,pbc,getPbc(),comm,nl_cut,nl_st);} - else - {nl=Tools::make_unique(ga_lista,serial,pbc,getPbc(),comm);} + if(doneigh){ + nl=Tools::make_unique( + GroupA, + serial, + pbc, + getPbc(), + comm, + nl_cut, + nl_st); + }else{ + nl=Tools::make_unique(GroupA,serial,pbc,getPbc(),comm); + } } requestAtoms(nl->getFullAtomList()); - log.printf(" between two groups of %u and %u atoms\n",static_cast(ga_lista.size()),static_cast(gb_lista.size())); + log.printf( + " between two groups of %u and %u atoms\n", + static_cast(GroupA.size()), + static_cast(GroupB.size())); log.printf(" first group:\n"); - for(unsigned int i=0; i=numOfPairs) { - //Safeguard + + //Safeguard + if (i >=numOfPairs) return; - } + const unsigned i0= pairList[i*2]; const unsigned i1= pairList[i*2+1]; - if (i0 == i1) { + if (i0 == i1) return; - } + double d[3]={ coordinates[X(i1)] - coordinates[X(i0)], coordinates[Y(i1)] - coordinates[Y(i0)], @@ -468,131 +515,248 @@ __global__ void getCoord( ddOut[i + 1 * numOfPairs] = d[1]; ddOut[i + 2 * numOfPairs] = d[2]; ddOut[i + 3 * numOfPairs] = dfunc; - ddOut_sparse[0 + sparsePlace] = -d[0] * dfunc; - ddOut_sparse[1 + sparsePlace] = -d[1] * dfunc; - ddOut_sparse[2 + sparsePlace] = -d[2] * dfunc; - ddOut_sparse[3 + sparsePlace] = d[0] * dfunc; - ddOut_sparse[4 + sparsePlace] = d[1] * dfunc; - ddOut_sparse[5 + sparsePlace] = d[2] * dfunc; + ddOut_sparse[0 + sparsePlace] = -d[0];// * 
dfunc; + ddOut_sparse[1 + sparsePlace] = -d[1];// * dfunc; + ddOut_sparse[2 + sparsePlace] = -d[2];// * dfunc; + ddOut_sparse[3 + sparsePlace] = d[0] ;//* dfunc; + ddOut_sparse[4 + sparsePlace] = d[1] ;//* dfunc; + ddOut_sparse[5 + sparsePlace] = d[2] ;//* dfunc; sparseRows[0 + sparsePlace] = 3 * i0 + 0; sparseRows[1 + sparsePlace] = 3 * i0 + 1; sparseRows[2 + sparsePlace] = 3 * i0 + 2; sparseRows[3 + sparsePlace] = 3 * i1 + 0; sparseRows[4 + sparsePlace] = 3 * i1 + 1; sparseRows[5 + sparsePlace] = 3 * i1 + 2; - ones[i] = 1.0; sparseCols[i]=sparsePlace; - // ncoord[i]= 1; - // printf("Cuda: %i,%i %i %i\n", i,threadIdx.x , blockIdx.x, blockDim.x); } -void CudaCoordination::calculate() { +//In ndReduction.cu there are the more "general" reductions + //(vector, tensor and scalar) here we have the adapted one for how the virial + //information are stored + template + __global__ void reductionVirial(T *inputArray, T *outputArray, const unsigned int len) { + //we saved the dd in an array x0 x1 x2..,xn-1,y0 y1 y2..,yn-1,z0 z1 z2..,zn-1 + //virialOut[ii*3+jj]-=d[ii]*d[jj]*dfunc; + auto sdata = CUDAHELPERS::shared_memory_proxy(); + const unsigned int ii = blockIdx.y; + const unsigned int jj = blockIdx.z; + const unsigned int vcoord = ii*3+jj; + const unsigned int place = threadIdx.x; + + // each thread preprocess some element from global to shared memory + const unsigned int diplacementI = ii*len; + const unsigned int diplacementJ = jj*len; + unsigned int i = (numThreads*2)*blockIdx.x + place + diplacementI; + unsigned int j = (numThreads*2)*blockIdx.x + place + diplacementJ; + unsigned int dfunc = (numThreads*2)*blockIdx.x + place + 3*len; + const unsigned int gridSize = (numThreads*2)*gridDim.x; + //the first element is in blockIdx.y*len, the last element to sum in (blockIdx.y+1)*len-1 + const unsigned int trgt=diplacementI + len; + + + sdata[place] = T(0); + while (i+numThreads < trgt) { + sdata[place] -= inputArray[i]*inputArray[j]*inputArray[dfunc] + + inputArray[i+numThreads]*inputArray[j+numThreads]*inputArray[dfunc+numThreads]; + i+=gridSize; + j+=gridSize; + dfunc+=gridSize; + } + while (i < trgt) { + sdata[place] -= inputArray[i]*inputArray[j]*inputArray[dfunc]; + i+=gridSize; + j+=gridSize; + dfunc+=gridSize; + } + + __syncthreads(); + // do the actual reduction, its contained in cudaHelpers.cuh + CUDAHELPERS::reductor( + sdata, + outputArray, + blockIdx.x + vcoord * gridDim.x + ); +} + +template +void doReductionVirial (T *inputArray, T *outputArray, const unsigned int len, + const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ + switch (nthreads) { + case 512: + reductionVirial<512,T><<>>(inputArray,outputArray, len); + break; + case 256: + reductionVirial<256,T><<>>(inputArray,outputArray, len); + break; + case 128: + reductionVirial<128,T><<>>(inputArray,outputArray, len); + break; + case 64: + reductionVirial<64, T><<>>(inputArray,outputArray, len); + break; + case 32: + reductionVirial<32, T><<>>(inputArray,outputArray, len); + break; + default: + plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); + } +} + +void CudaCoordination::calculate() { auto positions = getPositions(); auto nat = positions.size(); - if(nl->getStride()>0 && invalidateList) { + + Tensor virial; + double coordination; + auto deriv = std::vector(nat); + + if(nl->getStride()>0 && invalidateList) nl->update(getPositions()); - } + auto pairList = nl->getClosePairs(); const unsigned nn=nl->size(); - constexpr unsigned nthreads=512; unsigned 
ngroups=ceil(double(nn)/nthreads); - /****************allocating the memory on the GPU****************/ - cudaCoords.copyToCuda(&positions[0][0]); + /**********************allocating the memory on the GPU**********************/ + cudaCoords.copyToCuda(&positions[0][0],streamDerivatives); - ones.resize(nn); cudaCoordination.resize(nn); - cudaDerivatives.resize(nn * 4); - //cudaVirial.resize(nn * 9);//it will be eventually updated in the reduceDVS function - - cudaDerivatives_sparse.resize(nn * 6); - cudaDerivatives_sparserow.resize(nn * 6); + cudaDerivatives.resize(nn *4); + cudaDerivatives_sparse.resize(nn *6); + cudaVirial.resize(nn*9); + cudaDerivatives_sparserows.resize(nn*6); cudaDerivatives_sparsecols.resize(nn); - - /****************starting the calculations****************/ - getCoord<<>> (nn,switchingParameters, + /**************************starting the calculations*************************/ + //this initializes the memory to be accumulated + getCoord<<>> ( + nn, + switchingParameters, cudaCoords.pointer(), cudaPairList.pointer(), cudaCoordination.pointer(), cudaDerivatives.pointer(), cudaDerivatives_sparse.pointer(), - cudaDerivatives_sparserow.pointer(), - cudaDerivatives_sparsecols.pointer(), - ones.pointer() - ); - std::vector deriv(nat); - cusparseSpMatDescr_t spMatDescr; - cusparseDnVecDescr_t dnVecDescr; - cusparseCreateDnVec(&dnVecDescr, - nn, - ones.pointer(), - CUDA_R_64F); + cudaDerivatives_sparserows.pointer(), + cudaDerivatives_sparsecols.pointer() + ); + //since the virial and the coordination reduction are on different streams, + //this barrier makes sure that the memory is ready for the operations - cusparseCreateCsc(&spMatDescr, - getPositions().size() * 3, - nn, - nn*6, - cudaDerivatives_sparsecols.pointer(), - cudaDerivatives_sparserow.pointer(), - cudaDerivatives_sparse.pointer(), - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + /**************************accumulating the results**************************/ + cusparseDnVecDescr_t dfuncDense; + cusparseCreateDnVec( + &dfuncDense, + nn, + //this part of the array contains the nn dfuncs + cudaDerivatives.pointer()+3*nn, + CUDA_R_64F); + cusparseSpMatDescr_t derivativesSparse; + cusparseCreateCsc( + &derivativesSparse, + getPositions().size() * 3,//rows + nn,//columns + nn*6,//non zerp elements + cudaDerivatives_sparsecols.pointer(), + cudaDerivatives_sparserows.pointer(), + cudaDerivatives_sparse.pointer(), + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); double one=1.0; double zero=0.0; size_t bufferSize = 0; - cusparseSpMV_bufferSize( sparseMDevHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - spMatDescr, - dnVecDescr, - &zero, - outVecDescr, - CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT,//may be improved - &bufferSize); -bufferDerivatives.resize(bufferSize); -//the sparseMDevHandle operation happen in their own stream, so reduceDVS will be run in concurrency -cusparseSpMV( sparseMDevHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - spMatDescr, - dnVecDescr, - &zero, - outVecDescr, - CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT,//may be improved - bufferDerivatives.pointer()); - //reductionMemoryDerivatives.copyFromCuda(&deriv[0][0],streamDerivatives); - CUDAHELPERS::VS ret = CUDAHELPERS::reduceVS( - cudaDerivatives, - cudaVirial, - cudaCoordination, - cudaPairList, - reductionMemoryVirial, - reductionMemoryCoord, - streamVirial, - streamCoordination, - nn,nat, - maxNumThreads - ); - reductionMemoryDerivatives.copyFromCuda(&deriv[0][0]); 
- //cusparseDnVecGetValues(outVecDescr,**void); returns reductionMemoryDerivatives.pointer() - + //this computes the buffersize + cusparseSpMV_bufferSize( + sparseMDevHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + derivativesSparse, + dfuncDense, + &zero, + outDevDescr, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT,//may be improved + &bufferSize); + bufferDerivatives.resize(bufferSize); + cudaStreamSynchronize ( streamDerivatives ); + cusparseSpMV( + sparseMDevHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + derivativesSparse, + dfuncDense, + &zero, + outDevDescr, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT,//may be improved + bufferDerivatives.pointer()); - cusparseDestroySpMat(spMatDescr); - cusparseDestroyDnVec(dnVecDescr); + auto N=nn; + bool first=true; + while(N>1){ + size_t runningThreads = CUDAHELPERS::threadsPerBlock(N,maxNumThreads); + unsigned nGroups = CUDAHELPERS::idealGroups(N, runningThreads); + + + reductionMemoryCoord.resize(nGroups); + reductionMemoryVirial.resize(9* nGroups); + + if (first){ + dim3 ngroupsVirial(nGroups,3,3); + doReductionVirial ( + cudaDerivatives.pointer(), + reductionMemoryVirial.pointer(), + N, + ngroupsVirial, + runningThreads, + streamVirial); + //putting this here improves the overlap between mem movements and kernels + resultDerivatives.copyFromCuda(&deriv[0][0],streamDerivatives); + }else{ + dim3 ngroupsVirial(nGroups,9); + CUDAHELPERS::doReductionND ( + cudaVirial.pointer(), + reductionMemoryVirial.pointer(), + N, + ngroupsVirial, + runningThreads, + streamVirial); + } + + CUDAHELPERS::doReduction1D ( + cudaCoordination.pointer(),//reduceScalarIn->pointer(), + reductionMemoryCoord.pointer(),//reduceSOut->pointer(), + N, + nGroups, + runningThreads, + streamCoordination); + if (nGroups==1){ + reductionMemoryVirial.copyFromCuda(&virial[0][0],streamVirial); + //reduceSOut->copyFromCuda(&coordination,streamCoordination); + reductionMemoryCoord.copyFromCuda(&coordination,streamCoordination); + } else { + //std::swap(reduceScalarIn,reduceSOut); + reductionMemoryCoord.swap(cudaCoordination); + reductionMemoryVirial.swap(cudaVirial); + } + first=false; + N=nGroups; + } + //this ensures that the memory is fully in the host ram + cudaDeviceSynchronize (); + cusparseDestroySpMat(derivativesSparse); + cusparseDestroyDnVec(dfuncDense); for(unsigned i=0; i +// __device__ void warpReduce(volatile T* sdata, const unsigned int place){ +// //Instructions are SIMD synchronous within a warp +// //so no need for __syncthreads(), in the last iterations +// if(numThreads >= 64){//compile time +// sdata[place] += sdata[place + 32]; +// } +// if(numThreads >= 32){//compile time +// sdata[place] += sdata[place + 16]; +// } +// if(numThreads >= 16){//compile time +// sdata[place] += sdata[place + 8]; +// } +// if(numThreads >= 8){//compile time +// sdata[place] += sdata[place + 4]; +// } +// if(numThreads >= 4){//compile time +// sdata[place] += sdata[place + 2]; +// } +// if(numThreads >= 2){//compile time +// sdata[place] += sdata[place + 1]; +// } +// } + +template +__device__ void reductor(volatile T* sdata,T *outputArray,const unsigned int where){ + const unsigned int tid=threadIdx.x; + if (numThreads >= 512) {//compile time + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + } + if (numThreads >= 256) {//compile time + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + } + if (numThreads >= 128) {//compile time + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + } + if (tid < mymin(32u,numThreads/2)) { 
+ //warpReduce(sdata, tid); + if(numThreads >= 64){//compile time + sdata[tid] += sdata[tid + 32]; + } + if(numThreads >= 32){//compile time + sdata[tid] += sdata[tid + 16]; + } + if(numThreads >= 16){//compile time + sdata[tid] += sdata[tid + 8]; + } + if(numThreads >= 8){//compile time + sdata[tid] += sdata[tid + 4]; + } + if(numThreads >= 4){//compile time + sdata[tid] += sdata[tid + 2]; + } + if(numThreads >= 2){//compile time + sdata[tid] += sdata[tid + 1]; + } + } + // write result for this block to global memory + if (tid == 0){ + outputArray[where] = sdata[0]; + } +} + } //namespace CUDAHELPERS } //namespace PLMD #endif //__PLUMED_cuda_helpers_cuh diff --git a/src/cudaCoord/ndReduction.cu b/src/cudaCoord/ndReduction.cu index 1ff02e6450..321c854b9d 100644 --- a/src/cudaCoord/ndReduction.cu +++ b/src/cudaCoord/ndReduction.cu @@ -11,6 +11,7 @@ //#define vdbg(...) std::cerr << std::setw(4) << __LINE__ <<":" << std::setw(20)<< #__VA_ARGS__ << " " << (__VA_ARGS__) <<'\n' #define vdbg(...) +// Some help for me: // * Grids map to GPUs // * Blocks map to the MultiProcessors (MP) // * Threads map to Stream Processors (SP) @@ -18,7 +19,8 @@ //There are a LOTS of unrolled loop down here, -//see this https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf to undestand why +//see this https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf +// to undestand why //https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dim3 //7.3.2. dim3 @@ -27,55 +29,7 @@ namespace PLMD { namespace CUDAHELPERS { template -__device__ void warpReduce(volatile T* sdata, const unsigned int place){ - //Instructions are SIMD synchronous within a warp - //so no need for __syncthreads(), in the last iterations - if(numThreads >= 64){//compile time - sdata[place] += sdata[place + 32]; - } - if(numThreads >= 32){//compile time - sdata[place] += sdata[place + 16]; - } - if(numThreads >= 16){//compile time - sdata[place] += sdata[place + 8]; - } - if(numThreads >= 8){//compile time - sdata[place] += sdata[place + 4]; - } - if(numThreads >= 4){//compile time - sdata[place] += sdata[place + 2]; - } - if(numThreads >= 2){//compile time - sdata[place] += sdata[place + 1]; - } -} - -template -__device__ void reductor(volatile T* sdata,T *g_odata,const unsigned int where){ - const unsigned int tid=threadIdx.x; - if (numThreads >= 512) {//compile time - if (tid < 256) { - sdata[tid] += sdata[tid + 256]; } __syncthreads(); - } - if (numThreads >= 256) {//compile time - if (tid < 128) { - sdata[tid] += sdata[tid + 128]; } __syncthreads(); - } - if (numThreads >= 128) {//compile time - if (threadIdx. 
x < 64) { - sdata[tid] += sdata[tid + 64]; } __syncthreads(); - } - if (tid < mymin(32u,numThreads/2)) { - warpReduce(sdata, tid); - } - // write result for this block to global memory - if (tid == 0){ - g_odata[where] = sdata[0]; - } -} - -template -__global__ void reductionND(const T *g_idata, T *g_odata, const unsigned int len) { +__global__ void reductionND(const T *inputArray, T *outputArray, const unsigned int len) { //playing with this //https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf auto sdata = shared_memory_proxy(); @@ -90,94 +44,46 @@ __global__ void reductionND(const T *g_idata, T *g_odata, const unsigned int len sdata[threadIdx.x] = T(0); while (i+numThreads < trgt) { - sdata[threadIdx.x] += g_idata[i] + g_idata[i+numThreads]; + sdata[threadIdx.x] += inputArray[i] + inputArray[i+numThreads]; i+=gridSize; } while (i < trgt) { - sdata[threadIdx.x] += g_idata[i]; + sdata[threadIdx.x] += inputArray[i]; i+=gridSize; } __syncthreads(); // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x+blockIdx.y*gridDim.x); + reductor(sdata,outputArray,blockIdx.x+blockIdx.y*gridDim.x); } template -__global__ void reduction1D(T *g_idata, T *g_odata, const unsigned int len) { +__global__ void reduction1D(T *inputArray, T *outputArray, const unsigned int len) { auto sdata = shared_memory_proxy(); const unsigned int place = threadIdx.x; // each thread sums some elements from global to shared memory unsigned int i = (2*numThreads)*blockIdx.x + place; const unsigned int gridSize = (2*numThreads)*gridDim.x; sdata[place] = T(0); - //I think this may slow down the loop, but this does not force the user to have - //an input that is multiple of the threads, padded with zeros - while (i+numThreads < len) { - sdata[place] += g_idata[i] + g_idata[i+numThreads]; + //The double while is for preventig wrong memory + while ( i + numThreads < len) { + sdata[place] += inputArray[i] + inputArray[i+numThreads]; i+=gridSize; } while (i < len) { - sdata[place] += g_idata[i]; + sdata[place] += inputArray[i]; i+=gridSize; } __syncthreads(); // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x); -} - -template -__global__ void reductionVirial(T *g_idata, T *g_odata, const unsigned int len) { - //we saved the dd in an array x0 x1 x2..,xn-1,y0 y1 y2..,yn-1,z0 z1 z2..,zn-1 - //virialOut[ii*3+jj]-=d[ii]*d[jj]*dfunc; - auto sdata = shared_memory_proxy(); - const unsigned int ii = blockIdx.y; - const unsigned int jj = blockIdx.z; - const unsigned int vcoord = ii*3+jj; - const unsigned int place = threadIdx.x; - // each thread loads one element from global to shared mem - - // each thread loads one element from global to shared memory - const unsigned int diplacementI = ii*len; - const unsigned int diplacementJ = jj*len; - unsigned int i = (numThreads*2)*blockIdx.x + place + diplacementI; - unsigned int j = (numThreads*2)*blockIdx.x + place + diplacementJ; - unsigned int dfunc = (numThreads*2)*blockIdx.x + place + 3*len; - const unsigned int gridSize = (numThreads*2)*gridDim.x; - //the first element is in blockIdx.y*len, the last element to sum in (blockIdx.y+1)*len-1 - const unsigned int trgt=diplacementI + len; - - - sdata[place] = T(0); - //I think this may slow down the loop, but this does not force the user to have - //an input that is multiple of the threads, padded with zeros - while (i+numThreads < trgt) { - sdata[place] -= g_idata[i]*g_idata[j]*g_idata[dfunc] - + g_idata[i+numThreads]*g_idata[j+numThreads]*g_idata[dfunc+numThreads]; - i+=gridSize; - j+=gridSize; - 
dfunc+=gridSize; - } - while (i < trgt) { - sdata[place] -= g_idata[i]*g_idata[j]*g_idata[dfunc]; - i+=gridSize; - j+=gridSize; - dfunc+=gridSize; - } - - __syncthreads(); - // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x - + - vcoord * gridDim.x - ); + reductor(sdata,outputArray,blockIdx.x); } //after c++14 the template activation will be shorter to write: //template, bool> = true> -///finds the nearest upper multiple of the given reference (wit non increments) +///finds the nearest upper multiple of the given reference template::value, bool>::type = true> inline T nearestUpperMultipleTo(T number, T reference){ @@ -185,7 +91,7 @@ typename std::enable_if::value, bool>::type = true> } ///We'll find the ideal number of blocks using the Brent's theorem -size_t getIdealGroups(size_t numberOfElements, size_t runningThreads){ +size_t idealGroups(size_t numberOfElements, size_t runningThreads){ //nearest upper multiple to the numberof threads const size_t nnToGPU=nearestUpperMultipleTo(numberOfElements,runningThreads); ///Brent’s theorem says each thread should sum O(log n) elements @@ -197,7 +103,7 @@ size_t getIdealGroups(size_t numberOfElements, size_t runningThreads){ } -size_t decideThreadsPerBlock(unsigned N, unsigned maxNumThreads=512){ +size_t threadsPerBlock(unsigned N, unsigned maxNumThreads){ //this seeks the minimum number of threads to use a sigle block (and end the recursion) size_t dim=32; for (dim=32;dim<512;dim<<=1){ @@ -213,22 +119,27 @@ size_t decideThreadsPerBlock(unsigned N, unsigned maxNumThreads=512){ } template -void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, const unsigned blocks, const unsigned nthreads){ +void callReduction1D ( + T *inputArray, + T *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads){ switch (nthreads) { case 512: - reduction1D<512,T><<>>(g_idata,g_odata, len); + reduction1D<512,T><<>>(inputArray,outputArray, len); break; case 256: - reduction1D<256,T><<>>(g_idata,g_odata, len); + reduction1D<256,T><<>>(inputArray,outputArray, len); break; case 128: - reduction1D<128,T><<>>(g_idata,g_odata, len); + reduction1D<128,T><<>>(inputArray,outputArray, len); break; case 64: - reduction1D<64, T><<>>(g_idata,g_odata, len); + reduction1D<64, T><<>>(inputArray,outputArray, len); break; case 32: - reduction1D<32, T><<>>(g_idata,g_odata, len); + reduction1D<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -236,22 +147,26 @@ void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, const unsi } template -void callReductionND (T *g_idata, T *g_odata, const unsigned int len, const dim3 blocks, const unsigned nthreads){ +void callReductionND (T *inputArray, + T *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads){ switch (nthreads) { case 512: - reductionND<512,T><<>>(g_idata,g_odata, len); + reductionND<512,T><<>>(inputArray,outputArray, len); break; case 256: - reductionND<256,T><<>>(g_idata,g_odata, len); + reductionND<256,T><<>>(inputArray,outputArray, len); break; case 128: - reductionND<128,T><<>>(g_idata,g_odata, len); + reductionND<128,T><<>>(inputArray,outputArray, len); break; case 64: - reductionND<64, T><<>>(g_idata,g_odata, len); + reductionND<64, T><<>>(inputArray,outputArray, len); break; case 32: - reductionND<32, T><<>>(g_idata,g_odata, len); + reductionND<32, T><<>>(inputArray,outputArray, len); break; default: 
plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -263,10 +178,10 @@ double reduceScalar(double* cudaScalarAddress, unsigned N, unsigned maxNumThread double *reduceOut = cudaScalarAddress; double *reduceIn; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - auto ngroups=getIdealGroups(N, runningThreads); + auto ngroups=idealGroups(N, runningThreads); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups * sizeof(double)); callReduction1D (reduceIn, reduceOut, N, ngroups, runningThreads); @@ -288,9 +203,9 @@ double reduceScalar(memoryHolder& cudaScalarAddress, memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaScalarAddress; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - auto ngroups=getIdealGroups(N, runningThreads); + auto ngroups=idealGroups(N, runningThreads); reduceOut->resize(ngroups); callReduction1D (reduceIn->pointer(), reduceOut->pointer(), N, ngroups, runningThreads); N=ngroups; @@ -304,27 +219,16 @@ std::vector reduceNVectors(double* cudaNVectorAddress, unsigned N, unsig double *reduceOut = cudaNVectorAddress; double *reduceIn; auto dim = nat*3; - vdbg("InNVectors"); while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - vdbg(N); - vdbg(cudaNVectorAddress); - vdbg(reduceIn); - vdbg(reduceOut); - vdbg(runningThreads); - dim3 ngroups(getIdealGroups(N, runningThreads),dim); - vdbg(ngroups.x); - vdbg(ngroups.y); + dim3 ngroups(idealGroups(N, runningThreads),dim); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups.y* ngroups.x * sizeof(double)); - vdbg(reduceOut); - callReductionND (reduceIn, reduceOut, N, ngroups, runningThreads); if (reduceIn != cudaNVectorAddress){ - vdbg("Free reduceIn"); cudaFree(reduceIn); } N=ngroups.x; @@ -332,35 +236,36 @@ std::vector reduceNVectors(double* cudaNVectorAddress, unsigned N, unsig std::vector toret(nat); cudaMemcpy(&toret[0][0], reduceOut, 3*nat*sizeof(double), cudaMemcpyDeviceToHost); cudaFree(reduceOut); - vdbg(toret[0]); return toret; } //THIS DOES NOT KEEP THE DATA SAFE -std::vector reduceNVectors(memoryHolder& cudaNVectorAddress, - memoryHolder& memoryHelper, -unsigned N, unsigned nat, unsigned maxNumThreads){ +std::vector reduceNVectors( + memoryHolder& cudaNVectorAddress, + memoryHolder& memoryHelper, + unsigned N, + unsigned nat, + unsigned maxNumThreads){ memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaNVectorAddress; auto dim = nat*3; - vdbg("InNVectors"); while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - dim3 ngroups(getIdealGroups(N, runningThreads),dim); + dim3 ngroups(idealGroups(N, runningThreads),dim); reduceOut->resize(ngroups.y* ngroups.x); - - - callReductionND (reduceIn->pointer(), reduceOut->pointer(), N, ngroups, runningThreads); - + callReductionND ( + reduceIn->pointer(), + reduceOut->pointer(), + N, + ngroups, + runningThreads); N=ngroups.x; } std::vector toret(nat); reduceOut->copyFromCuda(&toret[0][0]); //cudaMemcpy(&toret[0][0], reduceOut, 3*nat*sizeof(double), cudaMemcpyDeviceToHost); - - vdbg(toret[0]); return toret; } @@ -370,10 +275,10 @@ Vector 
reduceVector(double* cudaVectorAddress, unsigned N, unsigned maxNumThread double *reduceOut = cudaVectorAddress; double *reduceIn; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - dim3 ngroups(getIdealGroups(N, runningThreads),3); + dim3 ngroups(idealGroups(N, runningThreads),3); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups.y* ngroups.x * sizeof(double)); @@ -396,9 +301,9 @@ memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads){ memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaTensorAddress; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - dim3 ngroups(getIdealGroups(N, runningThreads),9); + dim3 ngroups(idealGroups(N, runningThreads),9); reduceOut->resize(ngroups.y* ngroups.x); callReductionND (reduceIn->pointer(), reduceOut->pointer(), @@ -412,23 +317,23 @@ memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads){ } template -void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, +void callReduction1D (T *inputArray, T *outputArray, const unsigned int len, const unsigned blocks, const unsigned nthreads,cudaStream_t stream=0){ switch (nthreads) { case 512: - reduction1D<512,T><<>>(g_idata,g_odata, len); + reduction1D<512,T><<>>(inputArray,outputArray, len); break; case 256: - reduction1D<256,T><<>>(g_idata,g_odata, len); + reduction1D<256,T><<>>(inputArray,outputArray, len); break; case 128: - reduction1D<128,T><<>>(g_idata,g_odata, len); + reduction1D<128,T><<>>(inputArray,outputArray, len); break; case 64: - reduction1D<64, T><<>>(g_idata,g_odata, len); + reduction1D<64, T><<>>(inputArray,outputArray, len); break; case 32: - reduction1D<32, T><<>>(g_idata,g_odata, len); + reduction1D<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -436,111 +341,46 @@ void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, } template -void callReductionND (T *g_idata, T *g_odata, const unsigned int len, +void callReductionND (T *inputArray, T *outputArray, const unsigned int len, const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ switch (nthreads) { case 512: - reductionND<512,T><<>>(g_idata,g_odata, len); - break; - case 256: - reductionND<256,T><<>>(g_idata,g_odata, len); - break; - case 128: - reductionND<128,T><<>>(g_idata,g_odata, len); - break; - case 64: - reductionND<64, T><<>>(g_idata,g_odata, len); - break; - case 32: - reductionND<32, T><<>>(g_idata,g_odata, len); - break; - default: - plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); - } -} - -template -void callReductionVirial (T *g_idata, T *g_odata, const unsigned int len, - const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ - switch (nthreads) { - case 512: - reductionVirial<512,T><<>>(g_idata,g_odata, len); + reductionND<512,T><<>>(inputArray,outputArray, len); break; case 256: - reductionVirial<256,T><<>>(g_idata,g_odata, len); + reductionND<256,T><<>>(inputArray,outputArray, len); break; case 128: - reductionVirial<128,T><<>>(g_idata,g_odata, len); + reductionND<128,T><<>>(inputArray,outputArray, len); break; case 64: - reductionVirial<64, T><<>>(g_idata,g_odata, len); + reductionND<64, T><<>>(inputArray,outputArray, len); break; case 32: - 
reductionVirial<32, T><<>>(g_idata,g_odata, len); + reductionND<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); } } -//#define vdbg(...) std::cerr << std::setw(4) << __LINE__ <<":" << std::setw(20)<< #__VA_ARGS__ << " " << (__VA_ARGS__) <<'\n' -//if this is working I might use something similar to the sharedptr/weakptr -VS reduceVS(memoryHolder& derivativeIn, - memoryHolder& virialIn, - memoryHolder& scalarIn, - memoryHolder& pairListIn, - memoryHolder& memoryHelperV, - memoryHolder& memoryHelperS, - const cudaStream_t streamVirial, - const cudaStream_t streamScalar, - unsigned N, unsigned nat, - unsigned maxNumThreads -){ - //the memory is assigned swapped because it will be swapped at each loop iteration - memoryHolder* reduceScalarIn = &scalarIn; - memoryHolder* reduceSOut = &memoryHelperS; - - memoryHolder* reduceVirialIn = &virialIn; - memoryHolder* reduceVirialOut = &memoryHelperV; - - auto dim = nat*3; - - bool first=true; - VS toret; - while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); - unsigned ngroupsS=getIdealGroups(N, runningThreads); - - reduceVirialOut->resize(9* ngroupsS); - reduceSOut->resize(ngroupsS); - - if (first){ - dim3 ngroupsVirial(ngroupsS,3,3); - callReductionVirial (derivativeIn.pointer(), - reduceVirialOut->pointer(), - N, ngroupsVirial, runningThreads,streamVirial); - }else{ - dim3 ngroupsVirial(ngroupsS,9); - callReductionND (reduceVirialIn->pointer(), reduceVirialOut->pointer(), - N, ngroupsVirial, runningThreads, streamVirial); - } - if (ngroupsS==1) - reduceVirialOut->copyFromCuda(&toret.virial[0][0],streamVirial); - - callReduction1D (reduceScalarIn->pointer(), reduceSOut->pointer(), - N, ngroupsS, runningThreads,streamScalar); - if (ngroupsS==1) - reduceSOut->copyFromCuda(&toret.scalar,streamScalar); - - std::swap(reduceVirialIn,reduceVirialOut); - std::swap(reduceScalarIn,reduceSOut); - first=false; - N=ngroupsS; - } - return toret; -} - - +void doReduction1D (double *inputArray, + double *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads, + cudaStream_t stream){ + callReduction1D (inputArray, outputArray, len, blocks, nthreads, stream); + } + +void doReductionND (double *inputArray, + double *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads, + cudaStream_t stream){ + callReductionND (inputArray, outputArray, len, blocks, nthreads, stream); + } } //namespace CUDAHELPERS } //namespace PLMD diff --git a/src/cudaCoord/ndReduction.h b/src/cudaCoord/ndReduction.h index 6c47d9e3b5..f1920cc161 100644 --- a/src/cudaCoord/ndReduction.h +++ b/src/cudaCoord/ndReduction.h @@ -123,47 +123,22 @@ unsigned N, unsigned maxNumThreads); double reduceScalar(memoryHolder& cudaScalarAddress, memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads=512); - struct VS{ - Tensor virial; - double scalar; -}; +void doReduction1D (double *inputArray, + double *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads, + cudaStream_t stream=0); -/** @brief reduces the coordination, the derivatives and the virial from a coordination calculation - * @param derivativeIn the 3 * N array to be reduced with the compontent sof the derivatives - * @param scalarIn the N array to be reduced with the values of the coordination - * @param memoryHelperD preallocated GPU memory for speed up (for the derivatives) - * @param memoryHelperV preallocated GPU memory for 
speed up (for the virial) - * @param memoryHelperS preallocated GPU memory for speed up (for the scalar/coordination) - * @param streamDerivatives preinitializated stream for concurrency (for the derivatives) - * @param streamVirial preinitializated stream for concurrency (for the virial) - * @param streamScalar preinitializated stream for concurrency (for the scalar/coordination) - * @param cudaScalarAddress the pointer to the memory in cuda - * @param N the number of scalar to reduce - * @param nat the number of atoms - * @param maxNumThreads limits the number of threads per block to be used - * @return the derivative, the virial and the coordination - * - * reduceDVS - * The memory helpers will be resized if needed (the memory occupied may be increased but not reduced) - * - * The algorithm will decide the best number of threads to be used in - * an extra nthreads * sizeof(double) will be allocated on the GPU, - * where nthreads is the total number of threads that will be used - * - * @note cudaScalarAddress is threated as not owned: the user will need to call cudaFree on it!!! - */ -VS reduceVS( - memoryHolder& derivativeIn, - memoryHolder& virialIn, - memoryHolder& scalarIn, - memoryHolder& pairListIn, - memoryHolder& memoryHelperV, - memoryHolder& memoryHelperS, - cudaStream_t streamVirial, - cudaStream_t streamScalar, - unsigned N, unsigned nat, - unsigned maxNumThreads=512); +void doReductionND (double *inputArray, + double *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads, + cudaStream_t stream=0); +size_t idealGroups(size_t numberOfElements, size_t runningThreads); +size_t threadsPerBlock(unsigned N, unsigned maxNumThreads=512); } //namespace CUDAHELPERS } //namespace PLMD -#endif //__PLUMED_cuda_ndReduction_h \ No newline at end of file +#endif //__PLUMED_cuda_ndReduction_h
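Note on the derivative accumulation introduced in CudaCoordination::calculate() (src/cudaCoord/Coordination.cu, above): the per-pair geometric factors (-d and +d for the two atoms of each pair) are stored as the values of a (3*nat x npairs) CSC matrix with six non-zeros per column, and the per-atom derivatives then come out of a single cusparseSpMV of that matrix with the dense vector of per-pair dfunc values. The following standalone sketch shows the same pattern on a toy system; it is not the patch code — the sizes, toy values, and variable names are invented for illustration, error/status checking is omitted, and it assumes a CUDA toolkit recent enough to provide cusparseCreateCsc and CUSPARSE_SPMV_ALG_DEFAULT (which the patch itself already requires).

    // Sketch: accumulate per-atom derivatives as y = S * dfunc with cuSPARSE.
    // S is (3*nat x npairs) in CSC form, 6 non-zeros per pair (3 per atom).
    #include <cstdio>
    #include <cstdint>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cusparse.h>

    int main() {
      const int64_t nat = 3, npairs = 2, rows = 3 * nat, nnz = 6 * npairs;
      // pair 0 couples atoms 0-1, pair 1 couples atoms 1-2; d vectors are dummies
      std::vector<double>  vals   = {-1,-1,-1, 1,1,1,   -2,-2,-2, 2,2,2};
      std::vector<int64_t> rowInd = { 0, 1, 2, 3,4,5,    3, 4, 5, 6,7,8};
      std::vector<int64_t> colOff = {0, 6, 12};       // npairs+1 column offsets
      std::vector<double>  dfunc  = {0.5, 0.25};      // dense per-pair factors

      double *dVals, *dDfunc, *dOut; int64_t *dRow, *dCol;
      cudaMalloc(&dVals, nnz * sizeof(double));
      cudaMalloc(&dRow,  nnz * sizeof(int64_t));
      cudaMalloc(&dCol,  (npairs + 1) * sizeof(int64_t));
      cudaMalloc(&dDfunc, npairs * sizeof(double));
      cudaMalloc(&dOut,   rows * sizeof(double));
      cudaMemcpy(dVals, vals.data(),   nnz * sizeof(double),  cudaMemcpyHostToDevice);
      cudaMemcpy(dRow,  rowInd.data(), nnz * sizeof(int64_t), cudaMemcpyHostToDevice);
      cudaMemcpy(dCol,  colOff.data(), (npairs + 1) * sizeof(int64_t), cudaMemcpyHostToDevice);
      cudaMemcpy(dDfunc, dfunc.data(), npairs * sizeof(double), cudaMemcpyHostToDevice);

      cusparseHandle_t handle; cusparseCreate(&handle);
      cusparseSpMatDescr_t mat;
      cusparseCreateCsc(&mat, rows, npairs, nnz, dCol, dRow, dVals,
                        CUSPARSE_INDEX_64I, CUSPARSE_INDEX_64I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
      cusparseDnVecDescr_t x, y;
      cusparseCreateDnVec(&x, npairs, dDfunc, CUDA_R_64F);  // per-pair dfunc
      cusparseCreateDnVec(&y, rows,   dOut,   CUDA_R_64F);  // per-atom derivatives

      double one = 1.0, zero = 0.0;
      size_t bufSize = 0; void *buf = nullptr;
      cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat,
                              x, &zero, y, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
      cudaMalloc(&buf, bufSize);
      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat,
                   x, &zero, y, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buf);

      std::vector<double> deriv(rows);
      cudaMemcpy(deriv.data(), dOut, rows * sizeof(double), cudaMemcpyDeviceToHost);
      for (int64_t i = 0; i < rows; ++i)
        std::printf("deriv[%lld] = %g\n", (long long)i, deriv[i]);

      cusparseDestroyDnVec(x); cusparseDestroyDnVec(y); cusparseDestroySpMat(mat);
      cusparseDestroy(handle);
      cudaFree(dVals); cudaFree(dRow); cudaFree(dCol);
      cudaFree(dDfunc); cudaFree(dOut); cudaFree(buf);
      return 0;
    }

Design choice behind the CSC layout: every pair contributes exactly six entries (x, y, z for each of its two atoms), so the column offsets are trivially 6*p and the scatter-add over atoms, which would otherwise need atomics, is delegated entirely to the SpMV.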
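The reduction helpers (reduction1D/reductionND in src/cudaCoord/ndReduction.cu, the reductor device function in cudaHelpers.cuh, and the while(N>1) host loop in calculate()) all follow the same multi-pass scheme: each launch collapses the array to one partial sum per block, and the host relaunches on the partials until a single value is left. Below is a minimal self-contained sketch of that scheme, with an invented kernel name (blockReduceSum) and a made-up problem size, using a plain shared-memory tree reduction instead of the unrolled warp stage the patch uses.

    // Sketch: multi-pass block reduction, one partial sum per block per launch.
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    template <unsigned numThreads>
    __global__ void blockReduceSum(const double *in, double *out, unsigned len) {
      __shared__ double sdata[numThreads];
      const unsigned tid = threadIdx.x;
      // grid-strided accumulation: each thread folds several input elements
      unsigned i = blockIdx.x * (2 * numThreads) + tid;
      const unsigned gridSize = 2 * numThreads * gridDim.x;
      double sum = 0.0;
      while (i + numThreads < len) {
        sum += in[i] + in[i + numThreads];
        i += gridSize;
      }
      while (i < len) {        // tail pass, so len need not be a multiple of the block
        sum += in[i];
        i += gridSize;
      }
      sdata[tid] = sum;
      __syncthreads();
      // plain shared-memory tree reduction (the patch unrolls the last warp instead)
      for (unsigned s = numThreads / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
      }
      if (tid == 0) out[blockIdx.x] = sdata[0];
    }

    int main() {
      const unsigned N = 1u << 20;     // made-up problem size
      const unsigned threads = 256;    // power of two, as the dispatch switch requires
      std::vector<double> h(N, 1.0);   // the sum should come out as N
      double *din = nullptr, *dout = nullptr;
      cudaMalloc(&din, N * sizeof(double));
      cudaMemcpy(din, h.data(), N * sizeof(double), cudaMemcpyHostToDevice);
      unsigned n = N;
      while (n > 1) {                  // same host-side loop shape as in calculate()
        const unsigned blocks = (n + 2 * threads - 1) / (2 * threads);
        cudaMalloc(&dout, blocks * sizeof(double));
        blockReduceSum<threads><<<blocks, threads>>>(din, dout, n);
        cudaFree(din);
        din = dout;
        n = blocks;
      }
      double result = 0.0;
      cudaMemcpy(&result, din, sizeof(double), cudaMemcpyDeviceToHost);
      std::printf("sum = %f (expected %u)\n", result, N);
      cudaFree(din);
      return 0;
    }

Keeping the per-block width a compile-time template parameter, as the patch does with numThreads, lets the compiler unroll the in-block loop; that is why doReduction1D/doReductionND and doReductionVirial dispatch through a switch over 512/256/128/64/32 threads rather than taking the block size at run time.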