From bc3e12d7d41f2c86123e370bc37c9bc5d1346193 Mon Sep 17 00:00:00 2001
From: Loren Schwiebert <loren@wayne.edu>
Date: Wed, 19 Jun 2024 13:16:48 -0400
Subject: [PATCH] Optimizations for BoxInterForceGPU

---
 src/GPU/CalculateForceCUDAKernel.cu | 64 ++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/src/GPU/CalculateForceCUDAKernel.cu b/src/GPU/CalculateForceCUDAKernel.cu
index 155855107..35e40de25 100644
--- a/src/GPU/CalculateForceCUDAKernel.cu
+++ b/src/GPU/CalculateForceCUDAKernel.cu
@@ -471,6 +471,10 @@ __global__ void BoxInterForceGPU(
     double sc_alpha, uint sc_power, double *gpu_rMin, double *gpu_rMaxSq,
     double *gpu_expConst, int *gpu_molIndex, double *gpu_lambdaVDW,
     double *gpu_lambdaCoulomb, bool *gpu_isFraction, int box) {
+  __shared__ double shr_cutoff;
+  __shared__ int shr_particlesInsideCurrentCell, shr_numberOfPairs;
+  __shared__ int shr_currentCellStartIndex, shr_neighborCellStartIndex;
+  __shared__ bool shr_sameCell;
   double distSq;
   double3 virComponents;
 
@@ -491,48 +495,58 @@ __global__ void BoxInterForceGPU(
     gpu_rT23[threadID] = 0.0;
   }
 
-  double3 diff_com;
-  diff_com = make_double3(0.0, 0.0, 0.0);
-
-  double cutoff = fmax(gpu_rCut[0], gpu_rCutCoulomb[box]);
-
   int currentCell = blockIdx.x / NUMBER_OF_NEIGHBOR_CELLS;
   int nCellIndex = blockIdx.x;
   int neighborCell = gpu_neighborList[nCellIndex];
 
-  // calculate number of particles inside neighbor Cell
-  int particlesInsideCurrentCell, particlesInsideNeighboringCell;
-  int endIndex = gpu_cellStartIndex[neighborCell + 1];
-  particlesInsideNeighboringCell = endIndex - gpu_cellStartIndex[neighborCell];
+  //Skip some block pairs so we don't double count particle pairs
+  if (currentCell > neighborCell) return;
 
-  // calculate number of particles inside current Cell
-  endIndex = gpu_cellStartIndex[currentCell + 1];
-  particlesInsideCurrentCell = endIndex - gpu_cellStartIndex[currentCell];
+  if (threadIdx.x == 0) {
+    // Calculate number of particles inside current Cell
+    shr_currentCellStartIndex = gpu_cellStartIndex[currentCell];
+    shr_particlesInsideCurrentCell =
+        gpu_cellStartIndex[currentCell + 1] - shr_currentCellStartIndex;
 
-  // total number of pairs
-  int numberOfPairs =
-      particlesInsideCurrentCell * particlesInsideNeighboringCell;
+    // Calculate number of particles inside neighbor Cell
+    shr_neighborCellStartIndex = gpu_cellStartIndex[neighborCell];
+    int particlesInsideNeighboringCell =
+        gpu_cellStartIndex[neighborCell + 1] - shr_neighborCellStartIndex;
 
-  for (int pairIndex = threadIdx.x; pairIndex < numberOfPairs;
+    shr_sameCell = currentCell == neighborCell;
+    // Total number of pairs
+    shr_numberOfPairs =
+        shr_particlesInsideCurrentCell * particlesInsideNeighboringCell;
+    shr_cutoff = fmax(gpu_rCut[0], gpu_rCutCoulomb[box]);
+  }
+  __syncthreads();
+
+  double3 diff_com;
+
+  for (int pairIndex = threadIdx.x; pairIndex < shr_numberOfPairs;
        pairIndex += blockDim.x) {
-    int neighborParticleIndex = pairIndex / particlesInsideCurrentCell;
-    int currentParticleIndex = pairIndex % particlesInsideCurrentCell;
+    int neighborParticleIndex = pairIndex / shr_particlesInsideCurrentCell;
+    int currentParticleIndex = pairIndex % shr_particlesInsideCurrentCell;
 
     int currentParticle =
-        gpu_cellVector[gpu_cellStartIndex[currentCell] + currentParticleIndex];
-    int neighborParticle = gpu_cellVector[gpu_cellStartIndex[neighborCell] +
+        gpu_cellVector[shr_currentCellStartIndex + currentParticleIndex];
+    int neighborParticle = gpu_cellVector[shr_neighborCellStartIndex +
                                           neighborParticleIndex];
 
-    if (currentParticle < neighborParticle &&
-        gpu_particleMol[currentParticle] != gpu_particleMol[neighborParticle]) {
+    // We don't process the same pair of cells twice, so we just need to check
+    // to be sure we have different molecules. The exception is when the two
+    // cells are the same, then we need to skip some pairs of molecules so we
+    // don't double count any pairs.
+    int mA = gpu_particleMol[currentParticle];
+    int mB = gpu_particleMol[neighborParticle];
+    bool skip = mA == mB || (shr_sameCell && mA > mB);
+    if (!skip) {
       if (InRcutGPU(distSq, virComponents, gpu_x, gpu_y, gpu_z, currentParticle,
-                    neighborParticle, axis, halfAx, cutoff, gpu_nonOrth[0],
+                    neighborParticle, axis, halfAx, shr_cutoff, gpu_nonOrth[0],
                     gpu_cell_x, gpu_cell_y, gpu_cell_z, gpu_Invcell_x,
                     gpu_Invcell_y, gpu_Invcell_z)) {
         int kA = gpu_particleKind[currentParticle];
         int kB = gpu_particleKind[neighborParticle];
-        int mA = gpu_particleMol[currentParticle];
-        int mB = gpu_particleMol[neighborParticle];
 
         double lambdaVDW = DeviceGetLambdaVDW(mA, mB, box, gpu_isFraction,
                                               gpu_molIndex, gpu_lambdaVDW);