Improve BlockInterGPU reduction and MolExchangeGPU optimizations
LSchwiebert committed Jul 1, 2024
1 parent ac6ba7b commit a709ea5
Showing 4 changed files with 93 additions and 169 deletions.
src/Ewald.cpp (36 additions, 41 deletions)
@@ -758,76 +758,71 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,

 #ifdef GOMC_CUDA
   //Build a vector of only the charged particles in the new and old molecules
-  std::vector<double> chargeBoxNew;
+  std::vector<double> chargeBox;
   MoleculeKind const& thisKindNew = newMol[0].GetKind();
   uint lengthNew = thisKindNew.NumAtoms() * newMol.size();
-  XYZArray newMolCoords = XYZArray(lengthNew);
+  MoleculeKind const& thisKindOld = oldMol[0].GetKind();
+  uint lengthOld = thisKindOld.NumAtoms() * oldMol.size();
+  // The maximum size of this array is all charged particles
+  XYZArray molCoords = XYZArray(lengthNew + lengthOld);

-  int newChargedParticles = 0;
+  int numChargedParticles = 0;
   for (uint m = 0; m < newMol.size(); ++m) {
-    uint newMoleculeIndex = molIndexNew[m];
-    double lambdaCoef = GetLambdaCoef(newMoleculeIndex, box);
+    uint moleculeIndex = molIndexNew[m];
+    double lambdaCoef = GetLambdaCoef(moleculeIndex, box);

-    XYZArray currNewMolCoords = newMol[m].GetCoords();
+    XYZArray currMolCoords = newMol[m].GetCoords();
     for (uint p = 0; p < lengthNew; ++p) {
-      unsigned long currentAtom = mols.MolStart(newMoleculeIndex) + p;
-      if(!particleHasNoCharge[currentAtom]) {
-        newMolCoords.Set(newChargedParticles,
-                         currNewMolCoords.x[p],
-                         currNewMolCoords.y[p],
-                         currNewMolCoords.z[p]);
-        newChargedParticles++;
-        chargeBoxNew.push_back(thisKindNew.AtomCharge(p) * lambdaCoef);
+      unsigned long currentAtom = mols.MolStart(moleculeIndex) + p;
+      if (!particleHasNoCharge[currentAtom]) {
+        molCoords.Set(numChargedParticles,
+                      currMolCoords.x[p],
+                      currMolCoords.y[p],
+                      currMolCoords.z[p]);
+        numChargedParticles++;
+        chargeBox.push_back(thisKindNew.AtomCharge(p) * lambdaCoef);
       }
     }
   }
-  lengthNew = newChargedParticles;

-  std::vector<double> chargeBoxOld;
-  MoleculeKind const& thisKindOld = oldMol[0].GetKind();
-  uint lengthOld = thisKindOld.NumAtoms() * oldMol.size();
-  XYZArray oldMolCoords = XYZArray(lengthOld);
-
-  int oldChargedParticles = 0;
   for (uint m = 0; m < oldMol.size(); ++m) {
-    uint oldMoleculeIndex = molIndexOld[m];
-    double lambdaCoef = GetLambdaCoef(oldMoleculeIndex, box);
-    XYZArray currOldMolCoords = oldMol[m].GetCoords();
+    uint moleculeIndex = molIndexOld[m];
+    double lambdaCoef = GetLambdaCoef(moleculeIndex, box);
+    XYZArray currMolCoords = oldMol[m].GetCoords();
     for (uint p = 0; p < lengthOld; ++p) {
-      unsigned long currentAtom = mols.MolStart(oldMoleculeIndex) + p;
-      if(!particleHasNoCharge[currentAtom]) {
-        oldMolCoords.Set(oldChargedParticles,
-                         currOldMolCoords.x[p],
-                         currOldMolCoords.y[p],
-                         currOldMolCoords.z[p]);
-        oldChargedParticles++;
-        chargeBoxOld.push_back(thisKindOld.AtomCharge(p) * lambdaCoef);
+      unsigned long currentAtom = mols.MolStart(moleculeIndex) + p;
+      if (!particleHasNoCharge[currentAtom]) {
+        molCoords.Set(numChargedParticles,
+                      currMolCoords.x[p],
+                      currMolCoords.y[p],
+                      currMolCoords.z[p]);
+        numChargedParticles++;
+        // Invert these charges since we subtract them in the energy calc
+        chargeBox.push_back(thisKindOld.AtomCharge(p) * -lambdaCoef);
      }
    }
  }
-  lengthOld = oldChargedParticles;

   // Depending on the move, we could call this function twice. If so, we don't want to
   // double count the existing (reference) sums, so we copy them only for the first call
   // and then add to them inside the function based on the delta values for the move.
-  if(first_call) {
+  if (first_call) {
     CopyRefToNewCUDA(ff.particles->getCUDAVars(), box, imageSizeRef[box]);
   }

   // If there are no charged particles, the energy doesn't change, so no need to call
   // the function
-  if (lengthNew + lengthOld == 0) {
+  if (numChargedParticles == 0) {
     energyRecipNew = sysPotRef.boxEnergy[box].recip;
   }
   else {
-    CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(),
+    CallMolExchangeReciprocalGPU(ff.particles->getCUDAVars(),
                                  imageSizeRef[box],
-                                 sumRnew[box], sumInew[box], box,
-                                 chargeBoxNew, chargeBoxOld,
-                                 lengthNew, lengthOld,
+                                 sumRnew[box], sumInew[box], box,
+                                 chargeBox,
+                                 numChargedParticles,
                                  energyRecipNew,
-                                 newMolCoords,
-                                 oldMolCoords);
+                                 molCoords);
   }
 #else
   MoleculeKind const& thisKindNew = newMol[0].GetKind();
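The host-side change above folds the inserted and removed molecules into one charge list and one coordinate array, negating the charges of the removed (old) particles so a single pass adds the net change to the reciprocal sums. A minimal standalone sketch of that pattern follows; Particle, appendCharged, and addDelta are illustrative names, not GOMC APIs.

#include <cmath>
#include <vector>

struct Particle { double x, y, z, charge; };

// Append only charged sites, flipping the sign for particles being removed so
// their contribution is subtracted by the same accumulation loop.
void appendCharged(std::vector<Particle> &merged,
                   const std::vector<Particle> &mol, double sign) {
  for (const Particle &p : mol) {
    if (p.charge != 0.0)
      merged.push_back({p.x, p.y, p.z, sign * p.charge});
  }
}

// Accumulate the delta to the reciprocal-space sums for one k-vector.
void addDelta(const std::vector<Particle> &merged,
              double kx, double ky, double kz,
              double &sumR, double &sumI) {
  for (const Particle &p : merged) {
    double dot = kx * p.x + ky * p.y + kz * p.z;
    sumR += p.charge * std::cos(dot);  // negated charges remove the old terms
    sumI += p.charge * std::sin(dot);
  }
}

With the signs baked into the merged list, the GPU side no longer needs a new/old branch or two sets of device buffers.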
src/GPU/CalculateEnergyCUDAKernel.cu (6 additions, 6 deletions)
@@ -203,7 +203,8 @@ BoxInterGPU(int *gpu_cellStartIndex, int *gpu_cellVector, int *gpu_neighborList,
   using BlockReduce = cub::BlockReduce<double, 128>;

   // Allocate shared memory for BlockReduce
-  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ typename BlockReduce::TempStorage LJEn_temp_storage;
+  __shared__ typename BlockReduce::TempStorage REn_temp_storage;

   double LJEn = 0.0, REn = 0.0;

@@ -258,18 +259,17 @@ BoxInterGPU(int *gpu_cellStartIndex, int *gpu_cellVector, int *gpu_neighborList,
   __syncthreads();

   // Compute the block-wide sum for thread 0
-  double aggregate = BlockReduce(temp_storage).Sum(LJEn);
+  double aggregate = BlockReduce(LJEn_temp_storage).Sum(LJEn);

   if (threadIdx.x == 0) {
     gpu_LJEn[blockIdx.x] = aggregate;
   }

   if (electrostatic) {
-    //Need to sync the threads before reusing temp_storage
-    //OK inside the if since it's a global value for all threads
-    __syncthreads();
+    // Need to sync the threads before reusing temp_storage
+    // so using different variables
     // Compute the block-wide sum for thread 0
-    aggregate = BlockReduce(temp_storage).Sum(REn);
+    aggregate = BlockReduce(REn_temp_storage).Sum(REn);
     if (threadIdx.x == 0)
       gpu_REn[blockIdx.x] = aggregate;
   }
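The BoxInterGPU change gives each block-wide sum its own cub::BlockReduce temporary storage instead of reusing one buffer, which lets the electrostatic reduction skip the extra __syncthreads(). A minimal CUDA sketch of the same idea, assuming CUB is available and the kernel is launched with 128-thread blocks; TwoSumsKernel and its parameters are illustrative, not GOMC code.

#include <cub/cub.cuh>

// Two block-wide sums in one kernel. Separate TempStorage buffers mean the
// second reduction does not have to wait on the __syncthreads() barrier that
// would otherwise be required before reusing the first buffer.
__global__ void TwoSumsKernel(const double *a, const double *b,
                              double *blockSumA, double *blockSumB, int n) {
  using BlockReduce = cub::BlockReduce<double, 128>;
  __shared__ typename BlockReduce::TempStorage tempA;  // storage for sum of a
  __shared__ typename BlockReduce::TempStorage tempB;  // storage for sum of b

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  double va = (i < n) ? a[i] : 0.0;
  double vb = (i < n) ? b[i] : 0.0;

  double sumA = BlockReduce(tempA).Sum(va);  // result valid in thread 0 only
  double sumB = BlockReduce(tempB).Sum(vb);  // no barrier needed in between

  if (threadIdx.x == 0) {
    blockSumA[blockIdx.x] = sumA;
    blockSumB[blockIdx.x] = sumB;
  }
}

The trade-off is one extra TempStorage of shared memory per block in exchange for one fewer barrier on the electrostatic path.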
src/GPU/CalculateEwaldCUDAKernel.cu (43 additions, 106 deletions)
@@ -150,11 +150,6 @@ void CallBoxReciprocalSumsGPU(VariablesCUDA *vars, XYZArray const &coords,
   checkLastErrorCUDA(__FILE__, __LINE__);
 #endif

-  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-  //            cudaMemcpyDeviceToHost);
-  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-  //            cudaMemcpyDeviceToHost);
-
   // ReduceSum
   DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size, vars->gpu_recipEnergies,
                     vars->gpu_finalVal, imageSize);
@@ -382,82 +377,45 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   double *sumRnew,
                                   double *sumInew,
                                   uint box,
-                                  std::vector<double> chargeBoxNew,
-                                  std::vector<double> chargeBoxOld,
-                                  uint lengthNew, uint lengthOld,
+                                  std::vector<double> chargeBox,
+                                  int numChargedParticles,
                                   double &energyRecipNew,
-                                  XYZArray newMolCoords,
-                                  XYZArray oldMolCoords)
+                                  XYZArray molCoords)
 {
-  uint atomsPerImage = lengthNew +lengthOld;
-  uint totalAtoms = atomsPerImage * imageSize;
-
-  double *gpu_chargeBoxNew;
-  double *gpu_chargeBoxOld;
-  double *gpu_newMolX;
-  double *gpu_newMolY;
-  double *gpu_newMolZ;
-  double *gpu_oldMolX;
-  double *gpu_oldMolY;
-  double *gpu_oldMolZ;
-
-  if (lengthNew > 0) {
-    CUMALLOC((void**) &gpu_chargeBoxNew, chargeBoxNew.size() * sizeof(double));
-    cudaMemcpy(gpu_chargeBoxNew, &chargeBoxNew[0], chargeBoxNew.size() * sizeof(double),
-               cudaMemcpyHostToDevice);
-  }
+  double *gpu_chargeBox;
+  double *gpu_MolX;
+  double *gpu_MolY;
+  double *gpu_MolZ;

-  if (lengthOld > 0) {
-    CUMALLOC((void**) &gpu_chargeBoxOld, chargeBoxOld.size() * sizeof(double));
-    cudaMemcpy(gpu_chargeBoxOld, &chargeBoxOld[0], chargeBoxOld.size() * sizeof(double),
-               cudaMemcpyHostToDevice);
-  }
+  CUMALLOC((void**) &gpu_chargeBox, chargeBox.size() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolX, molCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolY, molCoords.Count() * sizeof(double));
+  CUMALLOC((void**) &gpu_MolZ, molCoords.Count() * sizeof(double));

-  CUMALLOC((void**) &gpu_newMolX, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_newMolY, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_newMolZ, newMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolX, oldMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolY, oldMolCoords.Count() * sizeof(double));
-  CUMALLOC((void**) &gpu_oldMolZ, oldMolCoords.Count() * sizeof(double));
-  cudaMemcpy(gpu_newMolX, &newMolCoords.x[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_newMolY, &newMolCoords.y[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_newMolZ, &newMolCoords.z[0],
-             newMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolX, &oldMolCoords.x[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolY, &oldMolCoords.y[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_oldMolZ, &oldMolCoords.z[0],
-             oldMolCoords.Count() * sizeof(double),
-             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_chargeBox, &chargeBox[0], chargeBox.size() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolX, &molCoords.x[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolY, &molCoords.y[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_MolZ, &molCoords.z[0], molCoords.Count() * sizeof(double),
+             cudaMemcpyHostToDevice);

   int threadsPerBlock = 128;
-  int blocksPerGrid = (totalAtoms + threadsPerBlock - 1)/threadsPerBlock;
-  int dynamicSharedMemorySize = 4 * sizeof(double) * atomsPerImage;
+  int blocksPerGrid = (numChargedParticles * imageSize + threadsPerBlock - 1)/threadsPerBlock;
+  int dynamicSharedMemorySize = 4 * sizeof(double) * numChargedParticles;
   MolExchangeReciprocalGPU<<< blocksPerGrid, threadsPerBlock, dynamicSharedMemorySize>>>(
     imageSize,
     vars->gpu_kxRef[box],
     vars->gpu_kyRef[box],
     vars->gpu_kzRef[box],
     vars->gpu_sumRnew[box],
     vars->gpu_sumInew[box],
-    gpu_chargeBoxNew,
-    gpu_chargeBoxOld,
-    lengthNew,
-    lengthOld,
-    gpu_newMolX,
-    gpu_newMolY,
-    gpu_newMolZ,
-    gpu_oldMolX,
-    gpu_oldMolY,
-    gpu_oldMolZ);
+    gpu_chargeBox,
+    numChargedParticles,
+    gpu_MolX,
+    gpu_MolY,
+    gpu_MolZ);
 #ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
@@ -470,7 +428,7 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
     vars->gpu_sumInew[box],
     vars->gpu_recipEnergies,
     imageSize);
-#ifndef NDEBUG
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
 #endif
Expand All @@ -480,18 +438,10 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);

if(lengthNew > 0){
CUFREE(gpu_chargeBoxNew);
}
if(lengthOld > 0){
CUFREE(gpu_chargeBoxOld);
}
CUFREE(gpu_newMolX);
CUFREE(gpu_newMolY);
CUFREE(gpu_newMolZ);
CUFREE(gpu_oldMolX);
CUFREE(gpu_oldMolY);
CUFREE(gpu_oldMolZ);
CUFREE(gpu_chargeBox);
CUFREE(gpu_MolX);
CUFREE(gpu_MolY);
CUFREE(gpu_MolZ);
}


@@ -789,41 +739,28 @@ __global__ void MolExchangeReciprocalGPU(
     double *gpu_kz,
     double *gpu_sumRnew,
     double *gpu_sumInew,
-    double *gpu_chargeBoxNew,
-    double *gpu_chargeBoxOld,
-    uint lengthNew,
-    uint lengthOld,
-    double *gpu_newMolX,
-    double *gpu_newMolY,
-    double *gpu_newMolZ,
-    double *gpu_oldMolX,
-    double *gpu_oldMolY,
-    double *gpu_oldMolZ)
+    double *gpu_chargeBox,
+    int numChargedParticles,
+    double *gpu_MolX,
+    double *gpu_MolY,
+    double *gpu_MolZ)
 {
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
-  int chargeBoxLength = lengthNew + lengthOld;

   extern __shared__ double shared_arr[];
   double* shared_chargeBox = (double *) shared_arr;
-  double* shared_Mol = (double *) &shared_chargeBox[chargeBoxLength];
+  double* shared_Mol = (double *) &shared_chargeBox[numChargedParticles];

-  if(threadIdx.x < lengthNew) {
-    shared_Mol[threadIdx.x * 3] = gpu_newMolX[threadIdx.x];
-    shared_Mol[threadIdx.x * 3 + 1] = gpu_newMolY[threadIdx.x];
-    shared_Mol[threadIdx.x * 3 + 2] = gpu_newMolZ[threadIdx.x];
-    shared_chargeBox[threadIdx.x] = gpu_chargeBoxNew[threadIdx.x];
-  }
-  else if (threadIdx.x < chargeBoxLength) {
-    int gpu_oldMolIndex = threadIdx.x - lengthNew;
-    shared_Mol[threadIdx.x * 3] = gpu_oldMolX[gpu_oldMolIndex];
-    shared_Mol[threadIdx.x * 3 + 1] = gpu_oldMolY[gpu_oldMolIndex];
-    shared_Mol[threadIdx.x * 3 + 2] = gpu_oldMolZ[gpu_oldMolIndex];
-    shared_chargeBox[threadIdx.x] = -gpu_chargeBoxOld[gpu_oldMolIndex];
+  if(threadIdx.x < numChargedParticles) {
+    shared_Mol[threadIdx.x * 3] = gpu_MolX[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 1] = gpu_MolY[threadIdx.x];
+    shared_Mol[threadIdx.x * 3 + 2] = gpu_MolZ[threadIdx.x];
+    shared_chargeBox[threadIdx.x] = gpu_chargeBox[threadIdx.x];
   }
   __syncthreads();

   //wait until the shared array is loaded before deciding that we don't need these threads
-  if (threadID >= imageSize * chargeBoxLength)
+  if (threadID >= imageSize * numChargedParticles)
     return;

   // for each new & old atom index, loop thru each image
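The rewritten CallMolExchangeReciprocalGPU launches one thread per (charged particle, k-space image) pair and reserves four doubles of dynamic shared memory per charged particle: x, y, z, and the signed charge. The sketch below shows that staging and one plausible thread-to-pair mapping; the kernel name and exact index decomposition are illustrative, and it assumes numChargedParticles does not exceed the block size, since the staging step loads one entry per thread.

// Host-side launch geometry (mirrors the values in the diff above):
//   threadsPerBlock = 128;
//   blocksPerGrid   = (numChargedParticles * imageSize + threadsPerBlock - 1) / threadsPerBlock;
//   sharedBytes     = 4 * sizeof(double) * numChargedParticles;
//   PairSketchGPU<<<blocksPerGrid, threadsPerBlock, sharedBytes>>>(...);
__global__ void PairSketchGPU(int imageSize, int numChargedParticles,
                              const double *molX, const double *molY,
                              const double *molZ, const double *chargeBox) {
  // Dynamic shared memory: charges first, then x/y/z triplets.
  extern __shared__ double s_buf[];
  double *s_charge = s_buf;                          // numChargedParticles doubles
  double *s_coord = &s_charge[numChargedParticles];  // 3 * numChargedParticles doubles

  if (threadIdx.x < numChargedParticles) {           // assumes count <= blockDim.x
    s_coord[threadIdx.x * 3]     = molX[threadIdx.x];
    s_coord[threadIdx.x * 3 + 1] = molY[threadIdx.x];
    s_coord[threadIdx.x * 3 + 2] = molZ[threadIdx.x];
    s_charge[threadIdx.x]        = chargeBox[threadIdx.x];
  }
  __syncthreads();  // wait for the staging loads before any thread returns

  int threadID = blockIdx.x * blockDim.x + threadIdx.x;
  if (threadID >= imageSize * numChargedParticles)
    return;

  int imageID    = threadID / numChargedParticles;   // which k-vector image
  int particleID = threadID % numChargedParticles;   // which charged particle
  // ... accumulate s_charge[particleID] * cos/sin(k[imageID] . r[particleID])
  //     into the sums for imageID (omitted in this sketch) ...
  (void)imageID;
  (void)particleID;
}

Because the old charges arrive already negated, a single accumulation covers both the inserted and the removed particles in one pass per image.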
src/GPU/CalculateEwaldCUDAKernel.cuh (8 additions, 16 deletions)
@@ -92,13 +92,10 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   double *sumRnew,
                                   double *sumInew,
                                   uint box,
-                                  std::vector<double> chargeBoxNew,
-                                  std::vector<double> chargeBoxOld,
-                                  uint lengthNew, uint lengthOld,
+                                  std::vector<double> chargeBox,
+                                  int numChargedParticles,
                                   double &energyRecipNew,
-                                  XYZArray newMolCoords,
-                                  XYZArray oldMolCoords);
-
+                                  XYZArray molCoords);

 __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_aForceRecy,
@@ -199,16 +196,11 @@ __global__ void MolExchangeReciprocalGPU(
     double *gpu_kz,
     double *gpu_sumRnew,
     double *gpu_sumInew,
-    double *gpu_chargeBoxNew,
-    double *gpu_chargeBoxOld,
-    uint lengthNew,
-    uint lengthOld,
-    double *gpu_newMolX,
-    double *gpu_newMolY,
-    double *gpu_newMolZ,
-    double *gpu_oldMolX,
-    double *gpu_oldMolY,
-    double *gpu_oldMolZ);
+    double *gpu_chargeBox,
+    int numChargedParticles,
+    double *gpu_MolX,
+    double *gpu_MolY,
+    double *gpu_MolZ);

 __global__ void NewSwapReciprocalGPU(VariablesCUDA *vars,
                                      int atomNumber,
