Skip to content

Commit

Permalink
Initial performance optimizations that reduce memcpy and mallocs
Browse files Browse the repository at this point in the history
  • Loading branch information
LSchwiebert committed Jun 19, 2024
1 parent ff3f67a commit ba4ecb4
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 275 deletions.
61 changes: 48 additions & 13 deletions src/Ewald.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,13 +489,27 @@ double Ewald::SwapDestRecip(const cbmc::TrialMol &newMol, const uint box,
uint length = thisKind.NumAtoms();
#ifdef GOMC_CUDA
bool insert = true;
std::vector<double> MolCharge;
for (uint p = 0; p < length; p++) {
MolCharge.push_back(thisKind.AtomCharge(p));
std::vector<double> molCharges;
int charges = 0;
for (uint p = 0; p < length; ++p) {
if (thisKind.AtomCharge(p) != 0.0) {
molCharges.push_back(thisKind.AtomCharge(p));
if (p > charges) {
molCoords.Set(charges, molCoords[p]);
}
charges++;
}
}

CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
imageSizeRef[box], sumRnew[box], sumInew[box],
sumRref[box], sumIref[box], insert, energyRecipNew, box);
//If there are no charged particles, the energy doesn't change, but we need
//to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
//have correct values
if (charges == 0) {
energyRecipNew = sysPotRef.boxEnergy[box].recip;
}
CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
imageSizeRef[box], sumRnew[box], sumInew[box], insert,
energyRecipNew, box);
#else
uint startAtom = mols.MolStart(molIndex);
#ifdef _OPENMP
Expand Down Expand Up @@ -668,14 +682,26 @@ double Ewald::SwapSourceRecip(const cbmc::TrialMol &oldMol, const uint box,
uint length = thisKind.NumAtoms();
#ifdef GOMC_CUDA
bool insert = false;
std::vector<double> MolCharge;
for (uint p = 0; p < length; p++) {
MolCharge.push_back(thisKind.AtomCharge(p));
std::vector<double> molCharges;
int charges = 0;
for (uint p = 0; p < length; ++p) {
if (thisKind.AtomCharge(p) != 0.0) {
molCharges.push_back(thisKind.AtomCharge(p));
if (p > charges) {
molCoords.Set(charges, molCoords[p]);
}
charges++;
}
}
CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
imageSizeRef[box], sumRnew[box], sumInew[box],
sumRref[box], sumIref[box], insert, energyRecipNew, box);
//If there are no charged particles, the energy doesn't change, but we need
//to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
//have correct values
if (charges == 0) {
energyRecipNew = sysPotRef.boxEnergy[box].recip;
}
CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
imageSizeRef[box], sumRnew[box], sumInew[box], insert,
energyRecipNew, box);

#else
uint startAtom = mols.MolStart(molIndex);
#ifdef _OPENMP
Expand Down Expand Up @@ -725,6 +751,15 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,

if (box < BOXES_WITH_U_NB) {
GOMC_EVENT_START(1, GomcProfileEvent::RECIP_MEMC_ENERGY);
// Because MolExchangeReciprocal does not have a matching GPU function, this is
// a stub function to copy the GPU sumRref and sumIref vectors to the CPU in
// order to calcuate the new sums. If this function is ported to the GPU, this
// call should be removed.
#ifdef GOMC_CUDA
CallMolExchangeReciprocalStartGPU(ff.particles->getCUDAVars(), imageSizeRef[box],
sumRref[box], sumIref[box], box);
#endif

uint lengthNew, lengthOld;
MoleculeKind const &thisKindNew = newMol[0].GetKind();
MoleculeKind const &thisKindOld = oldMol[0].GetKind();
Expand Down
19 changes: 8 additions & 11 deletions src/GPU/CalculateEnergyCUDAKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,9 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
int energyVectorLen;
double *gpu_particleCharge;
double *gpu_REn, *gpu_LJEn;
double *gpu_final_REn, *gpu_final_LJEn;

// Run the kernel
threadsPerBlock = 256;
threadsPerBlock = 128;
blocksPerGrid = numberOfCells * NUMBER_OF_NEIGHBOR_CELL;
energyVectorLen = blocksPerGrid * threadsPerBlock;

Expand All @@ -62,10 +61,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
CUMALLOC((void **)&gpu_particleKind, particleKind.size() * sizeof(int));
CUMALLOC((void **)&gpu_particleMol, particleMol.size() * sizeof(int));
CUMALLOC((void **)&gpu_LJEn, energyVectorLen * sizeof(double));
CUMALLOC((void **)&gpu_final_LJEn, sizeof(double));
if (electrostatic) {
CUMALLOC((void **)&gpu_REn, energyVectorLen * sizeof(double));
CUMALLOC((void **)&gpu_final_REn, sizeof(double));
}

// Copy necessary data to GPU
Expand Down Expand Up @@ -108,27 +105,29 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
vars->gpu_Invcell_z[box], sc_coul, sc_sigma_6, sc_alpha, sc_power,
vars->gpu_rMin, vars->gpu_rMaxSq, vars->gpu_expConst, vars->gpu_molIndex,
vars->gpu_lambdaVDW, vars->gpu_lambdaCoulomb, vars->gpu_isFraction, box);
#ifndef NDEBUG
cudaDeviceSynchronize();
checkLastErrorCUDA(__FILE__, __LINE__);
#endif

// ReduceSum
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
// LJ ReduceSum
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
gpu_final_LJEn, energyVectorLen);
vars->gpu_finalVal, energyVectorLen);
CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
gpu_final_LJEn, energyVectorLen);
vars->gpu_finalVal, energyVectorLen);
// Copy back the result to CPU ! :)
CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
CubDebugExit(cudaMemcpy(&LJEn, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost));
if (electrostatic) {
// Real Term ReduceSum
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
gpu_final_REn, energyVectorLen);
vars->gpu_finalVal, energyVectorLen);
// Copy back the result to CPU ! :)
CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
CubDebugExit(cudaMemcpy(&REn, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost));
} else {
REn = 0.0;
Expand All @@ -139,10 +138,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
CUFREE(gpu_particleKind);
CUFREE(gpu_particleMol);
CUFREE(gpu_LJEn);
CUFREE(gpu_final_LJEn);
if (electrostatic) {
CUFREE(gpu_REn);
CUFREE(gpu_final_REn);
}
CUFREE(gpu_neighborList);
CUFREE(gpu_cellStartIndex);
Expand Down
Loading

0 comments on commit ba4ecb4

Please sign in to comment.