Improve BlockInterGPU reduction and MolExchangeGPU optimizations
LSchwiebert committed Jul 1, 2024
1 parent ac6ba7b commit a709ea5
Showing 4 changed files with 93 additions and 169 deletions.
src/Ewald.cpp (36 additions, 41 deletions)
@@ -758,76 +758,71 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,

 #ifdef GOMC_CUDA
   //Build a vector of only the charged particles in the new and old molecules
-  std::vector<double> chargeBoxNew;
+  std::vector<double> chargeBox;
   MoleculeKind const& thisKindNew = newMol[0].GetKind();
   uint lengthNew = thisKindNew.NumAtoms() * newMol.size();
-  XYZArray newMolCoords = XYZArray(lengthNew);
+  MoleculeKind const& thisKindOld = oldMol[0].GetKind();
+  uint lengthOld = thisKindOld.NumAtoms() * oldMol.size();
+  // The maximum size of this array is all charged particles
+  XYZArray molCoords = XYZArray(lengthNew + lengthOld);

-  int newChargedParticles = 0;
+  int numChargedParticles = 0;
   for (uint m = 0; m < newMol.size(); ++m) {
-    uint newMoleculeIndex = molIndexNew[m];
-    double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box);
+    uint moleculeIndex = molIndexNew[m];
+    double lambdaCoef = GetLambdaCoef(moleculeIndex, box);

-    XYZArray currNewMolCoords = newMol[m].GetCoords();
+    XYZArray currMolCoords = newMol[m].GetCoords();
     for (uint p = 0; p < lengthNew; ++p) {
-      unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p;
-      if(!particleHasNoCharge[currentAtom]) {
-        newMolCoords.Set(newChargedParticles,
-                         currNewMolCoords.x[p],
-                         currNewMolCoords.y[p],
-                         currNewMolCoords.z[p]);
-        newChargedParticles++;
-        chargeBoxNew.push_back(thisKindNew.AtomCharge(p) * lambdaCoef);
+      unsigned long currentAtom = mols.MolStart(moleculeIndex) + p;
+      if (!particleHasNoCharge[currentAtom]) {
+        molCoords.Set(numChargedParticles,
+                      currMolCoords.x[p],
+                      currMolCoords.y[p],
+                      currMolCoords.z[p]);
+        numChargedParticles++;
+        chargeBox.push_back(thisKindNew.AtomCharge(p) * lambdaCoef);
       }
     }
   }
-  lengthNew = newChargedParticles;

-  std::vector<double> chargeBoxOld;
-  MoleculeKind const& thisKindOld = oldMol[0].GetKind();
-  uint lengthOld = thisKindOld.NumAtoms() * oldMol.size();
-  XYZArray oldMolCoords = XYZArray(lengthOld);
-
-  int oldChargedParticles = 0;
   for (uint m = 0; m < oldMol.size(); ++m) {
-    uint oldMoleculeIndex = molIndexOld[m];
-    double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box);
-    XYZArray currOldMolCoords = oldMol[m].GetCoords();
+    uint moleculeIndex = molIndexOld[m];
+    double lambdaCoef = GetLambdaCoef(moleculeIndex, box);
+    XYZArray currMolCoords = oldMol[m].GetCoords();
     for (uint p = 0; p < lengthOld; ++p) {
-      unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p;
-      if(!particleHasNoCharge[currentAtom]) {
-        oldMolCoords.Set(oldChargedParticles,
-                         currOldMolCoords.x[p],
-                         currOldMolCoords.y[p],
-                         currOldMolCoords.z[p]);
-        oldChargedParticles++;
-        chargeBoxOld.push_back(thisKindOld.AtomCharge(p) * lambdaCoef);
+      unsigned long currentAtom = mols.MolStart(moleculeIndex) + p;
+      if (!particleHasNoCharge[currentAtom]) {
+        molCoords.Set(numChargedParticles,
+                      currMolCoords.x[p],
+                      currMolCoords.y[p],
+                      currMolCoords.z[p]);
+        numChargedParticles++;
+        // Invert these charges since we subtract them in the energy calc
+        chargeBox.push_back(thisKindOld.AtomCharge(p) * -lambdaCoef);
      }
    }
  }
-  lengthOld = oldChargedParticles;

   // Depending on the move, we could call this function twice. If so, we don't want to
   // double count the existing (reference) sums, so we copy them only for the first call
   // and then add to them inside the function based on the delta values for the move.
-  if(first_call) {
+  if (first_call) {
     CopyRefToNewCUDA(ff.particles->getCUDAVars(), box, imageSizeRef[box]);
   }

   // If there are no charged particles, the energy doesn't change, so no need to call
   // the function
-  if (lengthNew + lengthOld == 0) {
+  if (numChargedParticles == 0) {
     energyRecipNew = sysPotRef.boxEnergy[box].recip;
   }
   else {
-    CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(),
+    CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(),
                                  imageSizeRef[box],
-                                 sumRnew[box], sumInew[box], box,
-                                 chargeBoxNew, chargeBoxOld,
-                                 lengthNew, lengthOld,
+                                 sumRnew[box], sumInew[box], box,
+                                 chargeBox,
+                                 numChargedParticles,
                                  energyRecipNew,
-                                 newMolCoords,
-                                 oldMolCoords);
+                                 molCoords);
   }
 #else
   MoleculeKind const& thisKindNew = newMol[0].GetKind();
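The host-side change above folds the inserted and removed molecules into one charge list and one coordinate array, negating the charges of the removed (old) particles so a single pass adds the net change to the reciprocal sums. A minimal standalone sketch of that pattern follows; Particle, appendCharged, and addDelta are illustrative names, not GOMC APIs.

#include <cmath>
#include <vector>

struct Particle { double x, y, z, charge; };

// Append only charged sites, flipping the sign for particles being removed so
// their contribution is subtracted by the same accumulation loop.
void appendCharged(std::vector<Particle> &merged,
                   const std::vector<Particle> &mol, double sign) {
  for (const Particle &p : mol) {
    if (p.charge != 0.0)
      merged.push_back({p.x, p.y, p.z, sign * p.charge});
  }
}

// Accumulate the delta to the reciprocal-space sums for one k-vector.
void addDelta(const std::vector<Particle> &merged,
              double kx, double ky, double kz,
              double &sumR, double &sumI) {
  for (const Particle &p : merged) {
    double dot = kx * p.x + ky * p.y + kz * p.z;
    sumR += p.charge * std::cos(dot);  // negated charges remove the old terms
    sumI += p.charge * std::sin(dot);
  }
}

With the signs baked into the merged list, the GPU side no longer needs a new/old branch or two sets of device buffers.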
src/GPU/CalculateEnergyCUDAKernel.cu (6 additions, 6 deletions)
@@ -203,7 +203,8 @@ BoxInterGPU(int *gpu_cellStartIndex, int *gpu_cellVector, int *gpu_neighborList,
   using BlockReduce = cub::BlockReduce<double, 128>;

   // Allocate shared memory for BlockReduce
-  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ typename BlockReduce::TempStorage LJEn_temp_storage;
+  __shared__ typename BlockReduce::TempStorage REn_temp_storage;

   double LJEn = 0.0, REn = 0.0;

@@ -258,18 +259,17 @@ BoxInterGPU(int *gpu_cellStartIndex, int *gpu_cellVector, int *gpu_neighborList,
   __syncthreads();

   // Compute the block-wide sum for thread 0
-  double aggregate = BlockReduce(temp_storage).Sum(LJEn);
+  double aggregate = BlockReduce(LJEn_temp_storage).Sum(LJEn);

   if (threadIdx.x == 0) {
     gpu_LJEn[blockIdx.x] = aggregate;
   }

   if (electrostatic) {
-    //Need to sync the threads before reusing temp_storage
-    //OK inside the if since it's a global value for all threads
-    __syncthreads();
+    // Need to sync the threads before reusing temp_storage
+    // so using different variables
     // Compute the block-wide sum for thread 0
-    aggregate = BlockReduce(temp_storage).Sum(REn);
+    aggregate = BlockReduce(REn_temp_storage).Sum(REn);
     if (threadIdx.x == 0)
       gpu_REn[blockIdx.x] = aggregate;
   }
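The BoxInterGPU change gives each block-wide sum its own cub::BlockReduce temporary storage instead of reusing one buffer, which lets the electrostatic reduction skip the extra __syncthreads(). A minimal CUDA sketch of the same idea, assuming CUB is available and the kernel is launched with 128-thread blocks; TwoSumsKernel and its parameters are illustrative, not GOMC code.

#include <cub/cub.cuh>

// Two block-wide sums in one kernel. Separate TempStorage buffers mean the
// second reduction does not have to wait on the __syncthreads() barrier that
// would otherwise be required before reusing the first buffer.
__global__ void TwoSumsKernel(const double *a, const double *b,
                              double *blockSumA, double *blockSumB, int n) {
  using BlockReduce = cub::BlockReduce<double, 128>;
  __shared__ typename BlockReduce::TempStorage tempA;  // storage for sum of a
  __shared__ typename BlockReduce::TempStorage tempB;  // storage for sum of b

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  double va = (i < n) ? a[i] : 0.0;
  double vb = (i < n) ? b[i] : 0.0;

  double sumA = BlockReduce(tempA).Sum(va);  // result valid in thread 0 only
  double sumB = BlockReduce(tempB).Sum(vb);  // no barrier needed in between

  if (threadIdx.x == 0) {
    blockSumA[blockIdx.x] = sumA;
    blockSumB[blockIdx.x] = sumB;
  }
}

The trade-off is one extra TempStorage of shared memory per block in exchange for one fewer barrier on the electrostatic path.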
src/GPU/CalculateEwaldCUDAKernel.cu (43 additions, 106 deletions)
@@ -150,11 +150,6 @@ void CallBoxReciprocalSumsGPU(VariablesCUDA *vars, XYZArray const &coords,
   checkLastErrorCUDA(__FILE__, __LINE__);
 #endif

-  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-  //            cudaMemcpyDeviceToHost);
-  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-  //            cudaMemcpyDeviceToHost);
-
   // ReduceSum
   DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies,
                     vars->gpu_finalVal, imageSize);
@@ -382,82 +377,45 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   double *sumRnew,
                                   double *sumInew,
                                   uint box,
-                                  std::vector<double> chargeBoxNew,
-                                  std::vector<double> chargeBoxOld,
-                                  uint lengthNew, uint lengthOld,
+                                  std::vector<double> chargeBox,
+                                  int numChargedParticles,
                                   double &energyRecipNew,
-                                  XYZArray newMolCoords,
-                                  XYZArray oldMolCoords)
+                                  XYZArray molCoords)
 {
-  uint atomsPerImage = lengthNew +lengthOld;
-  uint totalAtoms = atomsPerImage * imageSize;
-
-  double *gpu_chargeBoxNew;
-  double *gpu_chargeBoxOld;
-  double *gpu_newMolX;
-  double *gpu_newMolY;
-  double *gpu_newMolZ;
-  double *gpu_oldMolX;
-  double *gpu_oldMolY;
-  double *gpu_oldMolZ;
-
-  if (lengthNew > 0) {
-    CUMALLOC((void**) &gpu_chargeBoxNew, chargeBoxNew.size() * sizeof(double));
-    cudaMemcpy(gpu_chargeBoxNew, &chargeBoxNew[0], chargeBoxNew.size() * sizeof(double),
-               cudaMemcpyHostToDevice);
-  }
+  double *gpu_chargeBox;
+  double *gpu_MolX;
+  double *gpu_MolY;
+  double *gpu_MolZ;

-  if (lengthOld > 0) {
-    CUMALLOC((void**) &gpu_chargeBoxOld, chargeBoxOld.size() * sizeof(double));
-    cudaMemcpy(gpu_chargeBoxOld, &chargeBoxOld[0], chargeBoxOld.size() * sizeof(double),
-               cudaMemcpyHostToDevice);
-  }
+  CUMALLOC((void**) &gpu_chargeBox, chargeBox.size() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolX, molCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolY, molCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolZ, molCoords.Count() * sizeof(double));

-  CUMALLOC((void**) &gpu_newMolX, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_newMolY, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_newMolZ, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolX, oldMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolY, oldMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolZ, oldMolCoords.Count() * sizeof(double));
-  cudaMemcpy(gpu_newMolX, &newMolCoords.x[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_newMolY, &newMolCoords.y[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_newMolZ, &newMolCoords.z[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolX, &oldMolCoords.x[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolY, &oldMolCoords.y[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolZ, &oldMolCoords.z[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_chargeBox, &chargeBox[0], chargeBox.size() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolX, &molCoords.x[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolY, &molCoords.y[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolZ, &molCoords.z[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);

   int threadsPerBlock = 128;
-  int blocksPerGrid = (totalAtoms + threadsPerBlock - 1)/threadsPerBlock;
-  int dynamicSharedMemorySize = 4 * sizeof(double) * atomsPerImage;
+  int blocksPerGrid = (numChargedParticles * imageSize + threadsPerBlock - 1)/threadsPerBlock;
+  int dynamicSharedMemorySize = 4 * sizeof(double) * numChargedParticles;
   MolExchangeReciprocalGPU<<< blocksPerGrid, threadsPerBlock, dynamicSharedMemorySize>>>(
     imageSize,
     vars->gpu_kxRef[box],
     vars->gpu_kyRef[box],
     vars->gpu_kzRef[box],
     vars->gpu_sumRnew[box],
     vars->gpu_sumInew[box],
-    gpu_chargeBoxNew,
-    gpu_chargeBoxOld,
-    lengthNew,
-    lengthOld,
-    gpu_newMolX,
-    gpu_newMolY,
-    gpu_newMolZ,
-    gpu_oldMolX,
-    gpu_oldMolY,
-    gpu_oldMolZ);
+    gpu_chargeBox,
+    numChargedParticles,
+    gpu_MolX,
+    gpu_MolY,
+    gpu_MolZ);
 #ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
@@ -470,7 +428,7 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
     vars->gpu_sumInew[box],
     vars->gpu_recipEnergies,
     imageSize);
-#ifndef NDEBUG
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
 #endif
Expand All @@ -480,18 +438,10 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);

if(lengthNew > 0){
CUFREE(gpu_chargeBoxNew);
}
if(lengthOld > 0){
CUFREE(gpu_chargeBoxOld);
}
CUFREE(gpu_newMolX);
CUFREE(gpu_newMolY);
CUFREE(gpu_newMolZ);
CUFREE(gpu_oldMolX);
CUFREE(gpu_oldMolY);
CUFREE(gpu_oldMolZ);
CUFREE(gpu_chargeBox);
CUFREE(gpu_MolX);
CUFREE(gpu_MolY);
CUFREE(gpu_MolZ);
}


@@ -789,41 +739,28 @@ __global__ void MolExchangeReciprocalGPU(
     double *gpu_kz,
     double *gpu_sumRnew,
     double *gpu_sumInew,
-    double *gpu_chargeBoxNew,
-    double *gpu_chargeBoxOld,
-    uint lengthNew,
-    uint lengthOld,
-    double *gpu_newMolX,
-    double *gpu_newMolY,
-    double *gpu_newMolZ,
-    double *gpu_oldMolX,
-    double *gpu_oldMolY,
-    double *gpu_oldMolZ)
+    double *gpu_chargeBox,
+    int numChargedParticles,
+    double *gpu_MolX,
+    double *gpu_MolY,
+    double *gpu_MolZ)
 {
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
-  int chargeBoxLength = lengthNew + lengthOld;

   extern __shared__ double shared_arr[];
   double* shared_chargeBox = (double *) shared_arr;
-  double* shared_Mol = (double *) &shared_chargeBox[chargeBoxLength];
+  double* shared_Mol = (double *) &shared_chargeBox[numChargedParticles];

-  if(threadIdx.x < lengthNew) {
-    shared_Mol[threadIdx.x * 3] = gpu_newMolX[threadIdx.x];
-    shared_Mol[threadIdx.x * 3 + 1] = gpu_newMolY[threadIdx.x];
-    shared_Mol[threadIdx.x * 3 + 2] = gpu_newMolZ[threadIdx.x];
-    shared_chargeBox[threadIdx.x] = gpu_chargeBoxNew[threadIdx.x];
-  }
-  else if (threadIdx.x < chargeBoxLength) {
-    int gpu_oldMolIndex = threadIdx.x - lengthNew;
-    shared_Mol[threadIdx.x * 3] = gpu_oldMolX[gpu_oldMolIndex];
-    shared_Mol[threadIdx.x * 3 + 1] = gpu_oldMolY[gpu_oldMolIndex];
-    shared_Mol[threadIdx.x * 3 + 2] = gpu_oldMolZ[gpu_oldMolIndex];
-    shared_chargeBox[threadIdx.x] = -gpu_chargeBoxOld[gpu_oldMolIndex];
+  if(threadIdx.x < numChargedParticles) {
+    shared_Mol[threadIdx.x * 3] = gpu_MolX[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 1] = gpu_MolY[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 2] = gpu_MolZ[threadIdx.x];
+    shared_chargeBox[threadIdx.x] = gpu_chargeBox[threadIdx.x];
   }
   __syncthreads();

   //wait until the shared array is loaded before deciding that we don't need these threads
-  if (threadID >= imageSize * chargeBoxLength)
+  if (threadID >= imageSize * numChargedParticles)
     return;

   // for each new & old atom index, loop thru each image
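The rewritten CallMolExchangeReciprocalGPU launches one thread per (charged particle, k-space image) pair and reserves four doubles of dynamic shared memory per charged particle: x, y, z, and the signed charge. The sketch below shows that staging and one plausible thread-to-pair mapping; the kernel name and exact index decomposition are illustrative, and it assumes numChargedParticles does not exceed the block size, since the staging step loads one entry per thread.

// Host-side launch geometry (mirrors the values in the diff above):
//   threadsPerBlock = 128;
//   blocksPerGrid   = (numChargedParticles * imageSize + threadsPerBlock - 1) / threadsPerBlock;
//   sharedBytes     = 4 * sizeof(double) * numChargedParticles;
//   PairSketchGPU<<<blocksPerGrid, threadsPerBlock, sharedBytes>>>(...);
__global__ void PairSketchGPU(int imageSize, int numChargedParticles,
                              const double *molX, const double *molY,
                              const double *molZ, const double *chargeBox) {
  // Dynamic shared memory: charges first, then x/y/z triplets.
  extern __shared__ double s_buf[];
  double *s_charge = s_buf;                          // numChargedParticles doubles
  double *s_coord = &s_charge[numChargedParticles];  // 3 * numChargedParticles doubles

  if (threadIdx.x < numChargedParticles) {           // assumes count <= blockDim.x
    s_coord[threadIdx.x * 3]     = molX[threadIdx.x];
    s_coord[threadIdx.x * 3 + 1] = molY[threadIdx.x];
    s_coord[threadIdx.x * 3 + 2] = molZ[threadIdx.x];
    s_charge[threadIdx.x]        = chargeBox[threadIdx.x];
  }
  __syncthreads();  // wait for the staging loads before any thread returns

  int threadID = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadID >= imageSize * numChargedParticles)
    return;

  int imageID    = threadID / numChargedParticles;   // which k-vector image
  int particleID = threadID % numChargedParticles;   // which charged particle
  // ... accumulate s_charge[particleID] * cos/sin(k[imageID] . r[particleID])
  //     into the sums for imageID (omitted in this sketch) ...
  (void)imageID;
  (void)particleID;
}

Because the old charges arrive already negated, a single accumulation covers both the inserted and the removed particles in one pass per image.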
src/GPU/CalculateEwaldCUDAKernel.cuh (8 additions, 16 deletions)
@@ -92,13 +92,10 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   double *sumRnew,
                                   double *sumInew,
                                   uint box,
-                                  std::vector<double> chargeBoxNew,
-                                  std::vector<double> chargeBoxOld,
-                                  uint lengthNew, uint lengthOld,
+                                  std::vector<double> chargeBox,
+                                  int numChargedParticles,
                                   double &energyRecipNew,
-                                  XYZArray newMolCoords,
-                                  XYZArray oldMolCoords);
-
+                                  XYZArray molCoords);

 __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_aForceRecy,
@@ -199,16 +196,11 @@ __global__ void MolExchangeReciprocalGPU(
     double *gpu_kz,
     double *gpu_sumRnew,
     double *gpu_sumInew,
-    double *gpu_chargeBoxNew,
-    double *gpu_chargeBoxOld,
-    uint lengthNew,
-    uint lengthOld,
-    double *gpu_newMolX,
-    double *gpu_newMolY,
-    double *gpu_newMolZ,
-    double *gpu_oldMolX,
-    double *gpu_oldMolY,
-    double *gpu_oldMolZ);
+    double *gpu_chargeBox,
+    int numChargedParticles,
+    double *gpu_MolX,
+    double *gpu_MolY,
+    double *gpu_MolZ);

 __global__ void NewSwapReciprocalGPU(VariablesCUDA *vars,
                                      int atomNumber,
