diff --git a/regtest/basic/rt42c-cuda/plumed.dat b/regtest/basic/rt42c-cuda/plumed.dat deleted file mode 100644 index eff3b9d96e..0000000000 --- a/regtest/basic/rt42c-cuda/plumed.dat +++ /dev/null @@ -1,10 +0,0 @@ -LOAD FILE=../../../../src/cudaCoord/Coordination.so -gpu: CUDACOORDINATION GROUPA=1-108 R_0=1 -cpu: COORDINATION GROUPA=1-108 R_0=1 NOPBC - -diff: CUSTOM ARG=gpu,cpu FUNC=y-x PERIODIC=NO - -PRINT ARG=diff FILE=colvar FMT=%8.4f STRIDE=1 - -DUMPDERIVATIVES ARG=gpu,cpu FILE=deriv FMT=%8.4f -FLUSH STRIDE=1 diff --git a/regtest/basic/rt42c-cuda/Makefile b/regtest/basic/rtCUDACOORDINATION-nopbc/Makefile similarity index 100% rename from regtest/basic/rt42c-cuda/Makefile rename to regtest/basic/rtCUDACOORDINATION-nopbc/Makefile diff --git a/regtest/basic/rt42c-cuda/colvar.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/colvar.reference similarity index 100% rename from regtest/basic/rt42c-cuda/colvar.reference rename to regtest/basic/rtCUDACOORDINATION-nopbc/colvar.reference diff --git a/regtest/basic/rt42c-cuda/config b/regtest/basic/rtCUDACOORDINATION-nopbc/config similarity index 100% rename from regtest/basic/rt42c-cuda/config rename to regtest/basic/rtCUDACOORDINATION-nopbc/config diff --git a/regtest/basic/rt42c-cuda/deriv.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/deriv.reference similarity index 100% rename from regtest/basic/rt42c-cuda/deriv.reference rename to regtest/basic/rtCUDACOORDINATION-nopbc/deriv.reference diff --git a/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat b/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat new file mode 100644 index 0000000000..f69a00cbf3 --- /dev/null +++ b/regtest/basic/rtCUDACOORDINATION-nopbc/plumed.dat @@ -0,0 +1,19 @@ +LOAD FILE=../../../../src/cudaCoord/Coordination.so +gpu: CUDACOORDINATION GROUPA=1-108 R_0=1 NOPBC +gpu512: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=512 NOPBC +gpu256: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=256 NOPBC +gpu128: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=128 NOPBC +gpu64: CUDACOORDINATION GROUPA=1-108 R_0=1 THREADS=64 NOPBC +cpu: COORDINATION GROUPA=1-108 R_0=1 NOPBC + +diff: CUSTOM ARG=gpu,cpu FUNC=y-x PERIODIC=NO +diff512: CUSTOM ARG=gpu512,cpu FUNC=y-x PERIODIC=NO +diff256: CUSTOM ARG=gpu256,cpu FUNC=y-x PERIODIC=NO +diff128: CUSTOM ARG=gpu128,cpu FUNC=y-x PERIODIC=NO +diff64: CUSTOM ARG=gpu64,cpu FUNC=y-x PERIODIC=NO + +PRINT ARG=diff FILE=colvar FMT=%8.4f STRIDE=1 +PRINT ARG=diff512,diff256,diff128,diff64 FILE=threadsDifferences FMT=%8.4f STRIDE=1 + +DUMPDERIVATIVES ARG=gpu,cpu FILE=deriv FMT=%8.4f +FLUSH STRIDE=1 diff --git a/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference b/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference new file mode 100644 index 0000000000..b533fd5def --- /dev/null +++ b/regtest/basic/rtCUDACOORDINATION-nopbc/threadsDifferences.reference @@ -0,0 +1,109 @@ +#! 
FIELDS time diff512 diff256 diff128 diff64 + 0.000000 0.0000 0.0000 0.0000 0.0000 + 1.000000 0.0000 0.0000 0.0000 0.0000 + 2.000000 0.0000 0.0000 0.0000 0.0000 + 3.000000 0.0000 0.0000 0.0000 0.0000 + 4.000000 0.0000 0.0000 0.0000 0.0000 + 5.000000 0.0000 0.0000 0.0000 0.0000 + 6.000000 0.0000 0.0000 0.0000 0.0000 + 7.000000 0.0000 0.0000 0.0000 0.0000 + 8.000000 0.0000 0.0000 0.0000 0.0000 + 9.000000 0.0000 0.0000 0.0000 0.0000 + 10.000000 0.0000 0.0000 0.0000 0.0000 + 11.000000 0.0000 0.0000 0.0000 0.0000 + 12.000000 0.0000 0.0000 0.0000 0.0000 + 13.000000 0.0000 0.0000 0.0000 0.0000 + 14.000000 0.0000 0.0000 0.0000 0.0000 + 15.000000 0.0000 0.0000 0.0000 0.0000 + 16.000000 0.0000 0.0000 0.0000 0.0000 + 17.000000 0.0000 0.0000 0.0000 0.0000 + 18.000000 0.0000 0.0000 0.0000 0.0000 + 19.000000 0.0000 0.0000 0.0000 0.0000 + 20.000000 0.0000 0.0000 0.0000 0.0000 + 21.000000 0.0000 0.0000 0.0000 0.0000 + 22.000000 0.0000 0.0000 0.0000 0.0000 + 23.000000 0.0000 0.0000 0.0000 0.0000 + 24.000000 0.0000 0.0000 0.0000 0.0000 + 25.000000 0.0000 0.0000 0.0000 0.0000 + 26.000000 0.0000 0.0000 0.0000 0.0000 + 27.000000 0.0000 0.0000 0.0000 0.0000 + 28.000000 0.0000 0.0000 0.0000 0.0000 + 29.000000 0.0000 0.0000 0.0000 0.0000 + 30.000000 0.0000 0.0000 0.0000 0.0000 + 31.000000 0.0000 0.0000 0.0000 0.0000 + 32.000000 0.0000 0.0000 0.0000 0.0000 + 33.000000 0.0000 0.0000 0.0000 0.0000 + 34.000000 0.0000 0.0000 0.0000 0.0000 + 35.000000 0.0000 0.0000 0.0000 0.0000 + 36.000000 0.0000 0.0000 0.0000 0.0000 + 37.000000 0.0000 0.0000 0.0000 0.0000 + 38.000000 0.0000 0.0000 0.0000 0.0000 + 39.000000 0.0000 0.0000 0.0000 0.0000 + 40.000000 0.0000 0.0000 0.0000 0.0000 + 41.000000 0.0000 0.0000 0.0000 0.0000 + 42.000000 0.0000 0.0000 0.0000 0.0000 + 43.000000 0.0000 0.0000 0.0000 0.0000 + 44.000000 0.0000 0.0000 0.0000 0.0000 + 45.000000 0.0000 0.0000 0.0000 0.0000 + 46.000000 0.0000 0.0000 0.0000 0.0000 + 47.000000 0.0000 0.0000 0.0000 0.0000 + 48.000000 0.0000 0.0000 0.0000 0.0000 + 49.000000 0.0000 0.0000 0.0000 0.0000 + 50.000000 0.0000 0.0000 0.0000 0.0000 + 51.000000 0.0000 0.0000 0.0000 0.0000 + 52.000000 0.0000 0.0000 0.0000 0.0000 + 53.000000 0.0000 0.0000 0.0000 0.0000 + 54.000000 0.0000 0.0000 0.0000 0.0000 + 55.000000 0.0000 0.0000 0.0000 0.0000 + 56.000000 0.0000 0.0000 0.0000 0.0000 + 57.000000 0.0000 0.0000 0.0000 0.0000 + 58.000000 0.0000 0.0000 0.0000 0.0000 + 59.000000 0.0000 0.0000 0.0000 0.0000 + 60.000000 0.0000 0.0000 0.0000 0.0000 + 61.000000 0.0000 0.0000 0.0000 0.0000 + 62.000000 0.0000 0.0000 0.0000 0.0000 + 63.000000 0.0000 0.0000 0.0000 0.0000 + 64.000000 0.0000 0.0000 0.0000 0.0000 + 65.000000 0.0000 0.0000 0.0000 0.0000 + 66.000000 0.0000 0.0000 0.0000 0.0000 + 67.000000 0.0000 0.0000 0.0000 0.0000 + 68.000000 0.0000 0.0000 0.0000 0.0000 + 69.000000 0.0000 0.0000 0.0000 0.0000 + 70.000000 0.0000 0.0000 0.0000 0.0000 + 71.000000 0.0000 0.0000 0.0000 0.0000 + 72.000000 0.0000 0.0000 0.0000 0.0000 + 73.000000 0.0000 0.0000 0.0000 0.0000 + 74.000000 0.0000 0.0000 0.0000 0.0000 + 75.000000 0.0000 0.0000 0.0000 0.0000 + 76.000000 0.0000 0.0000 0.0000 0.0000 + 77.000000 0.0000 0.0000 0.0000 0.0000 + 78.000000 0.0000 0.0000 0.0000 0.0000 + 79.000000 0.0000 0.0000 0.0000 0.0000 + 80.000000 0.0000 0.0000 0.0000 0.0000 + 81.000000 0.0000 0.0000 0.0000 0.0000 + 82.000000 0.0000 0.0000 0.0000 0.0000 + 83.000000 0.0000 0.0000 0.0000 0.0000 + 84.000000 0.0000 0.0000 0.0000 0.0000 + 85.000000 0.0000 0.0000 0.0000 0.0000 + 86.000000 0.0000 0.0000 0.0000 0.0000 + 87.000000 0.0000 0.0000 0.0000 0.0000 + 
88.000000 0.0000 0.0000 0.0000 0.0000 + 89.000000 0.0000 0.0000 0.0000 0.0000 + 90.000000 0.0000 0.0000 0.0000 0.0000 + 91.000000 0.0000 0.0000 0.0000 0.0000 + 92.000000 0.0000 0.0000 0.0000 0.0000 + 93.000000 0.0000 0.0000 0.0000 0.0000 + 94.000000 0.0000 0.0000 0.0000 0.0000 + 95.000000 0.0000 0.0000 0.0000 0.0000 + 96.000000 0.0000 0.0000 0.0000 0.0000 + 97.000000 0.0000 0.0000 0.0000 0.0000 + 98.000000 0.0000 0.0000 0.0000 0.0000 + 99.000000 0.0000 0.0000 0.0000 0.0000 + 100.000000 0.0000 0.0000 0.0000 0.0000 + 101.000000 0.0000 0.0000 0.0000 0.0000 + 102.000000 0.0000 0.0000 0.0000 0.0000 + 103.000000 0.0000 0.0000 0.0000 0.0000 + 104.000000 0.0000 0.0000 0.0000 0.0000 + 105.000000 0.0000 0.0000 0.0000 0.0000 + 106.000000 0.0000 0.0000 0.0000 0.0000 + 107.000000 0.0000 0.0000 0.0000 0.0000 diff --git a/regtest/basic/rt42c-cuda/trajectory.xyz b/regtest/basic/rtCUDACOORDINATION-nopbc/trajectory.xyz similarity index 100% rename from regtest/basic/rt42c-cuda/trajectory.xyz rename to regtest/basic/rtCUDACOORDINATION-nopbc/trajectory.xyz diff --git a/src/cudaCoord/Coordination.cu b/src/cudaCoord/Coordination.cu index acf961dfa7..5726de8dd3 100644 --- a/src/cudaCoord/Coordination.cu +++ b/src/cudaCoord/Coordination.cu @@ -41,8 +41,7 @@ using std::cerr; //#define vdbg(...) namespace PLMD { -namespace colvar { - + namespace colvar { //+PLUMEDOC COLVAR CUDACOORDINATION /* Calculate coordination numbers. Like coordination, but on nvdia gpu and with no swithcing function @@ -130,19 +129,20 @@ class CudaCoordination : public Colvar { CUDAHELPERS::memoryHolder cudaCoordination; CUDAHELPERS::memoryHolder cudaDerivatives; CUDAHELPERS::memoryHolder cudaDerivatives_sparse; - CUDAHELPERS::memoryHolder cudaDerivatives_sparserow; + CUDAHELPERS::memoryHolder cudaDerivatives_sparserows; CUDAHELPERS::memoryHolder cudaDerivatives_sparsecols; CUDAHELPERS::memoryHolder cudaVirial; - CUDAHELPERS::memoryHolder reductionMemoryDerivatives; + CUDAHELPERS::memoryHolder resultDerivatives; CUDAHELPERS::memoryHolder bufferDerivatives; CUDAHELPERS::memoryHolder reductionMemoryVirial; CUDAHELPERS::memoryHolder reductionMemoryCoord; - CUDAHELPERS::memoryHolder ones; + cusparseHandle_t sparseMDevHandle; + cusparseDnVecDescr_t outDevDescr; + cudaStream_t streamDerivatives; cudaStream_t streamVirial; cudaStream_t streamCoordination; - cusparseDnVecDescr_t outVecDescr; unsigned maxNumThreads=512; SwitchingFunction switchingFunction; rationalSwitchParameters switchingParameters; @@ -166,17 +166,26 @@ PLUMED_REGISTER_ACTION(CudaCoordination,"CUDACOORDINATION") void CudaCoordination::setUpPermanentGPUMemory(){ auto nat = getPositions().size(); //coordinates values are updated at each step - cudaCoords.resize(3*nat); - + if ((3*nat) != cudaCoords.size()){ + cudaCoords.resize(3*nat); + if(resultDerivatives.size()!=0) + cusparseDestroyDnVec(outDevDescr); + resultDerivatives.resize(3*nat); + cusparseCreateDnVec(&outDevDescr, + 3*nat, + resultDerivatives.pointer(), + CUDA_R_64F); + } //the neighbour list will be updated at each request of prepare auto pairList = nl->getClosePairs(); const unsigned nn=nl->size(); cudaPairList.resize(2*nn); - cudaPairList.copyToCuda(pairList.data()); - reductionMemoryDerivatives.resize(3*nat); - cusparseCreateDnVec(&outVecDescr, + //streamDerivatives makes the copy async + cudaPairList.copyToCuda(pairList.data(),streamDerivatives); + resultDerivatives.resize(3*nat); + cusparseCreateDnVec(&outDevDescr, 3*nat, - reductionMemoryDerivatives.pointer(), + resultDerivatives.pointer(), CUDA_R_64F); } @@ -191,21 
+200,28 @@ void CudaCoordination::prepare() { requestAtoms(nl->getReducedAtomList()); setUpPermanentGPUMemory(); invalidateList=false; - if(getExchangeStep()) error("Neighbor lists should be updated on exchange steps - choose a NL_STRIDE which divides the exchange stride!"); + if(getExchangeStep()) + error("Neighbor lists should be updated on exchange steps - choose a " + "NL_STRIDE which divides the exchange stride!"); } - if(getExchangeStep()) firsttime=true; + if(getExchangeStep()) + firsttime=true; } } + void CudaCoordination::registerKeywords( Keywords& keys ) { Colvar::registerKeywords(keys); keys.addFlag("SERIAL",false,"Perform the calculation in serial - for debug purpose"); - keys.addFlag("PAIR",false,"Pair only 1st element of the 1st group with 1st element in the second, etc"); + keys.addFlag("PAIR",false,"Pair only 1st element of the 1st group with 1st " + "element in the second, etc"); keys.addFlag("NLIST",false,"Use a neighbor list to speed up the calculation"); keys.add("optional","NL_CUTOFF","The cutoff for the neighbor list"); - keys.add("optional","NL_STRIDE","The frequency with which we are updating the atoms in the neighbor list"); + keys.add("optional","NL_STRIDE","The frequency with which we are updating the " + "atoms in the neighbor list"); keys.add("optional","THREADS","The upper limit of the number of threads"); keys.add("atoms","GROUPA","First list of atoms"); - keys.add("atoms","GROUPB","Second list of atoms (if empty, N*(N-1)/2 pairs in GROUPA are counted)"); + keys.add("atoms","GROUPB","Second list of atoms (if empty, N*(N-1)/2 pairs in " + "GROUPA are counted)"); keys.add("compulsory","NN","6","The n parameter of the switching function "); keys.add("compulsory","MM","0","The m parameter of the switching function; 0 implies 2*NN"); keys.add("compulsory","D_0","0.0","The d_0 parameter of the switching function"); @@ -215,16 +231,15 @@ void CudaCoordination::registerKeywords( Keywords& keys ) { //these constant will be used within the kernels __constant__ double cu_epsilon; -__device__ double pcuda_fastpow(double base, int expo) { +__device__ double pcuda_fastpow(double base,int expo) { if(expo<0) { expo=-expo; base=1.0/base; } double result = 1.0; while (expo) { - if (expo & 1) { + if (expo & 1) result *= base; - } expo >>= 1; base *= base; } @@ -263,76 +278,109 @@ __global__ void getpcuda_Rational(const double *rdists,const int NN, const int M if(rdists[i]<=0.) 
{ res[i]=1.; dfunc[i]=0.0; - }else{ - res[i]=pcuda_Rational(rdists[i],NN,MM,dfunc[i]); - } - //printf("CUDA: %i :: d=%f -> %f, %f\n", i,rdists[i],res[i],dfunc[i]); + }else + res[i]=pcuda_Rational(rdists[i],NN,MM,dfunc[i]); } - -__global__ void getConst() { - //printf("Cuda: cu_epsilon = %f\n", cu_epsilon); -} +// __global__ void getConst() { +// printf("Cuda: cu_epsilon = %f\n", cu_epsilon); +// } CudaCoordination::CudaCoordination(const ActionOptions&ao): PLUMED_COLVAR_INIT(ao) { parseFlag("SERIAL",serial); - std::vector ga_lista,gb_lista; - parseAtomList("GROUPA",ga_lista); - parseAtomList("GROUPB",gb_lista); + std::vector GroupA,GroupB; + parseAtomList("GROUPA",GroupA); + parseAtomList("GROUPB",GroupB); bool nopbc=!pbc; parseFlag("NOPBC",nopbc); pbc=!nopbc; -// pair stuff + // pair stuff bool dopair=false; parseFlag("PAIR",dopair); -// neighbor list stuff + // neighbor list stuff bool doneigh=false; double nl_cut=0.0; int nl_st=0; parseFlag("NLIST",doneigh); if(doneigh) { parse("NL_CUTOFF",nl_cut); - if(nl_cut<=0.0) error("NL_CUTOFF should be explicitly specified and positive"); + if(nl_cut<=0.0) + error("NL_CUTOFF should be explicitly specified and positive"); parse("NL_STRIDE",nl_st); - if(nl_st<=0) error("NL_STRIDE should be explicitly specified and positive"); + if(nl_st<=0) + error("NL_STRIDE should be explicitly specified and positive"); } parse("THREADS",maxNumThreads); - if(maxNumThreads<=0) error("THREADS should be positive"); - addValueWithDerivatives(); setNotPeriodic(); - if(gb_lista.size()>0) { - if(doneigh) - {nl=Tools::make_unique(ga_lista,gb_lista,serial,dopair,pbc,getPbc(),comm,nl_cut,nl_st);} - else - {nl=Tools::make_unique(ga_lista,gb_lista,serial,dopair,pbc,getPbc(),comm);} + if(maxNumThreads<=0) + error("THREADS should be positive"); + addValueWithDerivatives(); + setNotPeriodic(); + if(GroupB.size()>0) { + if(doneigh){ + nl=Tools::make_unique( + GroupA, + GroupB, + serial, + dopair, + pbc, + getPbc(), + comm, + nl_cut, + nl_st); + } else { + nl=Tools::make_unique( + GroupA, + GroupB, + serial, + dopair, + pbc, + getPbc(), + comm);} } else { - if(doneigh) - {nl=Tools::make_unique(ga_lista,serial,pbc,getPbc(),comm,nl_cut,nl_st);} - else - {nl=Tools::make_unique(ga_lista,serial,pbc,getPbc(),comm);} + if(doneigh){ + nl=Tools::make_unique( + GroupA, + serial, + pbc, + getPbc(), + comm, + nl_cut, + nl_st); + }else{ + nl=Tools::make_unique(GroupA,serial,pbc,getPbc(),comm); + } } requestAtoms(nl->getFullAtomList()); - log.printf(" between two groups of %u and %u atoms\n",static_cast(ga_lista.size()),static_cast(gb_lista.size())); + log.printf( + " between two groups of %u and %u atoms\n", + static_cast(GroupA.size()), + static_cast(GroupB.size())); log.printf(" first group:\n"); - for(unsigned int i=0; i=numOfPairs) { - //Safeguard + + //Safeguard + if (i >=numOfPairs) return; - } + const unsigned i0= pairList[i*2]; const unsigned i1= pairList[i*2+1]; - if (i0 == i1) { + if (i0 == i1) return; - } + double d[3]={ coordinates[X(i1)] - coordinates[X(i0)], coordinates[Y(i1)] - coordinates[Y(i0)], @@ -468,131 +515,248 @@ __global__ void getCoord( ddOut[i + 1 * numOfPairs] = d[1]; ddOut[i + 2 * numOfPairs] = d[2]; ddOut[i + 3 * numOfPairs] = dfunc; - ddOut_sparse[0 + sparsePlace] = -d[0] * dfunc; - ddOut_sparse[1 + sparsePlace] = -d[1] * dfunc; - ddOut_sparse[2 + sparsePlace] = -d[2] * dfunc; - ddOut_sparse[3 + sparsePlace] = d[0] * dfunc; - ddOut_sparse[4 + sparsePlace] = d[1] * dfunc; - ddOut_sparse[5 + sparsePlace] = d[2] * dfunc; + ddOut_sparse[0 + sparsePlace] = -d[0];// * 
dfunc; + ddOut_sparse[1 + sparsePlace] = -d[1];// * dfunc; + ddOut_sparse[2 + sparsePlace] = -d[2];// * dfunc; + ddOut_sparse[3 + sparsePlace] = d[0] ;//* dfunc; + ddOut_sparse[4 + sparsePlace] = d[1] ;//* dfunc; + ddOut_sparse[5 + sparsePlace] = d[2] ;//* dfunc; sparseRows[0 + sparsePlace] = 3 * i0 + 0; sparseRows[1 + sparsePlace] = 3 * i0 + 1; sparseRows[2 + sparsePlace] = 3 * i0 + 2; sparseRows[3 + sparsePlace] = 3 * i1 + 0; sparseRows[4 + sparsePlace] = 3 * i1 + 1; sparseRows[5 + sparsePlace] = 3 * i1 + 2; - ones[i] = 1.0; sparseCols[i]=sparsePlace; - // ncoord[i]= 1; - // printf("Cuda: %i,%i %i %i\n", i,threadIdx.x , blockIdx.x, blockDim.x); } -void CudaCoordination::calculate() { +//In ndReduction.cu there are the more "general" reductions + //(vector, tensor and scalar) here we have the adapted one for how the virial + //information are stored + template + __global__ void reductionVirial(T *inputArray, T *outputArray, const unsigned int len) { + //we saved the dd in an array x0 x1 x2..,xn-1,y0 y1 y2..,yn-1,z0 z1 z2..,zn-1 + //virialOut[ii*3+jj]-=d[ii]*d[jj]*dfunc; + auto sdata = CUDAHELPERS::shared_memory_proxy(); + const unsigned int ii = blockIdx.y; + const unsigned int jj = blockIdx.z; + const unsigned int vcoord = ii*3+jj; + const unsigned int place = threadIdx.x; + + // each thread preprocess some element from global to shared memory + const unsigned int diplacementI = ii*len; + const unsigned int diplacementJ = jj*len; + unsigned int i = (numThreads*2)*blockIdx.x + place + diplacementI; + unsigned int j = (numThreads*2)*blockIdx.x + place + diplacementJ; + unsigned int dfunc = (numThreads*2)*blockIdx.x + place + 3*len; + const unsigned int gridSize = (numThreads*2)*gridDim.x; + //the first element is in blockIdx.y*len, the last element to sum in (blockIdx.y+1)*len-1 + const unsigned int trgt=diplacementI + len; + + + sdata[place] = T(0); + while (i+numThreads < trgt) { + sdata[place] -= inputArray[i]*inputArray[j]*inputArray[dfunc] + + inputArray[i+numThreads]*inputArray[j+numThreads]*inputArray[dfunc+numThreads]; + i+=gridSize; + j+=gridSize; + dfunc+=gridSize; + } + while (i < trgt) { + sdata[place] -= inputArray[i]*inputArray[j]*inputArray[dfunc]; + i+=gridSize; + j+=gridSize; + dfunc+=gridSize; + } + + __syncthreads(); + // do the actual reduction, its contained in cudaHelpers.cuh + CUDAHELPERS::reductor( + sdata, + outputArray, + blockIdx.x + vcoord * gridDim.x + ); +} + +template +void doReductionVirial (T *inputArray, T *outputArray, const unsigned int len, + const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ + switch (nthreads) { + case 512: + reductionVirial<512,T><<>>(inputArray,outputArray, len); + break; + case 256: + reductionVirial<256,T><<>>(inputArray,outputArray, len); + break; + case 128: + reductionVirial<128,T><<>>(inputArray,outputArray, len); + break; + case 64: + reductionVirial<64, T><<>>(inputArray,outputArray, len); + break; + case 32: + reductionVirial<32, T><<>>(inputArray,outputArray, len); + break; + default: + plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); + } +} + +void CudaCoordination::calculate() { auto positions = getPositions(); auto nat = positions.size(); - if(nl->getStride()>0 && invalidateList) { + + Tensor virial; + double coordination; + auto deriv = std::vector(nat); + + if(nl->getStride()>0 && invalidateList) nl->update(getPositions()); - } + auto pairList = nl->getClosePairs(); const unsigned nn=nl->size(); - constexpr unsigned nthreads=512; unsigned 
ngroups=ceil(double(nn)/nthreads); - /****************allocating the memory on the GPU****************/ - cudaCoords.copyToCuda(&positions[0][0]); + /**********************allocating the memory on the GPU**********************/ + cudaCoords.copyToCuda(&positions[0][0],streamDerivatives); - ones.resize(nn); cudaCoordination.resize(nn); - cudaDerivatives.resize(nn * 4); - //cudaVirial.resize(nn * 9);//it will be eventually updated in the reduceDVS function - - cudaDerivatives_sparse.resize(nn * 6); - cudaDerivatives_sparserow.resize(nn * 6); + cudaDerivatives.resize(nn *4); + cudaDerivatives_sparse.resize(nn *6); + cudaVirial.resize(nn*9); + cudaDerivatives_sparserows.resize(nn*6); cudaDerivatives_sparsecols.resize(nn); - - /****************starting the calculations****************/ - getCoord<<>> (nn,switchingParameters, + /**************************starting the calculations*************************/ + //this initializes the memory to be accumulated + getCoord<<>> ( + nn, + switchingParameters, cudaCoords.pointer(), cudaPairList.pointer(), cudaCoordination.pointer(), cudaDerivatives.pointer(), cudaDerivatives_sparse.pointer(), - cudaDerivatives_sparserow.pointer(), - cudaDerivatives_sparsecols.pointer(), - ones.pointer() - ); - std::vector deriv(nat); - cusparseSpMatDescr_t spMatDescr; - cusparseDnVecDescr_t dnVecDescr; - cusparseCreateDnVec(&dnVecDescr, - nn, - ones.pointer(), - CUDA_R_64F); + cudaDerivatives_sparserows.pointer(), + cudaDerivatives_sparsecols.pointer() + ); + //since the virial and the coordination reduction are on different streams, + //this barrier makes sure that the memory is ready for the operations - cusparseCreateCsc(&spMatDescr, - getPositions().size() * 3, - nn, - nn*6, - cudaDerivatives_sparsecols.pointer(), - cudaDerivatives_sparserow.pointer(), - cudaDerivatives_sparse.pointer(), - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_64I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_64F); + /**************************accumulating the results**************************/ + cusparseDnVecDescr_t dfuncDense; + cusparseCreateDnVec( + &dfuncDense, + nn, + //this part of the array contains the nn dfuncs + cudaDerivatives.pointer()+3*nn, + CUDA_R_64F); + cusparseSpMatDescr_t derivativesSparse; + cusparseCreateCsc( + &derivativesSparse, + getPositions().size() * 3,//rows + nn,//columns + nn*6,//non zerp elements + cudaDerivatives_sparsecols.pointer(), + cudaDerivatives_sparserows.pointer(), + cudaDerivatives_sparse.pointer(), + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_64I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F); double one=1.0; double zero=0.0; size_t bufferSize = 0; - cusparseSpMV_bufferSize( sparseMDevHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - spMatDescr, - dnVecDescr, - &zero, - outVecDescr, - CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT,//may be improved - &bufferSize); -bufferDerivatives.resize(bufferSize); -//the sparseMDevHandle operation happen in their own stream, so reduceDVS will be run in concurrency -cusparseSpMV( sparseMDevHandle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - spMatDescr, - dnVecDescr, - &zero, - outVecDescr, - CUDA_R_64F, - CUSPARSE_SPMV_ALG_DEFAULT,//may be improved - bufferDerivatives.pointer()); - //reductionMemoryDerivatives.copyFromCuda(&deriv[0][0],streamDerivatives); - CUDAHELPERS::VS ret = CUDAHELPERS::reduceVS( - cudaDerivatives, - cudaVirial, - cudaCoordination, - cudaPairList, - reductionMemoryVirial, - reductionMemoryCoord, - streamVirial, - streamCoordination, - nn,nat, - maxNumThreads - ); - reductionMemoryDerivatives.copyFromCuda(&deriv[0][0]); 
- //cusparseDnVecGetValues(outVecDescr,**void); returns reductionMemoryDerivatives.pointer() - + //this computes the buffersize + cusparseSpMV_bufferSize( + sparseMDevHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + derivativesSparse, + dfuncDense, + &zero, + outDevDescr, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT,//may be improved + &bufferSize); + bufferDerivatives.resize(bufferSize); + cudaStreamSynchronize ( streamDerivatives ); + cusparseSpMV( + sparseMDevHandle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one, + derivativesSparse, + dfuncDense, + &zero, + outDevDescr, + CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT,//may be improved + bufferDerivatives.pointer()); - cusparseDestroySpMat(spMatDescr); - cusparseDestroyDnVec(dnVecDescr); + auto N=nn; + bool first=true; + while(N>1){ + size_t runningThreads = CUDAHELPERS::threadsPerBlock(N,maxNumThreads); + unsigned nGroups = CUDAHELPERS::idealGroups(N, runningThreads); + + + reductionMemoryCoord.resize(nGroups); + reductionMemoryVirial.resize(9* nGroups); + + if (first){ + dim3 ngroupsVirial(nGroups,3,3); + doReductionVirial ( + cudaDerivatives.pointer(), + reductionMemoryVirial.pointer(), + N, + ngroupsVirial, + runningThreads, + streamVirial); + //putting this here improves the overlap between mem movements and kernels + resultDerivatives.copyFromCuda(&deriv[0][0],streamDerivatives); + }else{ + dim3 ngroupsVirial(nGroups,9); + CUDAHELPERS::doReductionND ( + cudaVirial.pointer(), + reductionMemoryVirial.pointer(), + N, + ngroupsVirial, + runningThreads, + streamVirial); + } + + CUDAHELPERS::doReduction1D ( + cudaCoordination.pointer(),//reduceScalarIn->pointer(), + reductionMemoryCoord.pointer(),//reduceSOut->pointer(), + N, + nGroups, + runningThreads, + streamCoordination); + if (nGroups==1){ + reductionMemoryVirial.copyFromCuda(&virial[0][0],streamVirial); + //reduceSOut->copyFromCuda(&coordination,streamCoordination); + reductionMemoryCoord.copyFromCuda(&coordination,streamCoordination); + } else { + //std::swap(reduceScalarIn,reduceSOut); + reductionMemoryCoord.swap(cudaCoordination); + reductionMemoryVirial.swap(cudaVirial); + } + first=false; + N=nGroups; + } + //this ensures that the memory is fully in the host ram + cudaDeviceSynchronize (); + cusparseDestroySpMat(derivativesSparse); + cusparseDestroyDnVec(dfuncDense); for(unsigned i=0; i +// __device__ void warpReduce(volatile T* sdata, const unsigned int place){ +// //Instructions are SIMD synchronous within a warp +// //so no need for __syncthreads(), in the last iterations +// if(numThreads >= 64){//compile time +// sdata[place] += sdata[place + 32]; +// } +// if(numThreads >= 32){//compile time +// sdata[place] += sdata[place + 16]; +// } +// if(numThreads >= 16){//compile time +// sdata[place] += sdata[place + 8]; +// } +// if(numThreads >= 8){//compile time +// sdata[place] += sdata[place + 4]; +// } +// if(numThreads >= 4){//compile time +// sdata[place] += sdata[place + 2]; +// } +// if(numThreads >= 2){//compile time +// sdata[place] += sdata[place + 1]; +// } +// } + +template +__device__ void reductor(volatile T* sdata,T *outputArray,const unsigned int where){ + const unsigned int tid=threadIdx.x; + if (numThreads >= 512) {//compile time + if (tid < 256) + sdata[tid] += sdata[tid + 256]; + __syncthreads(); + } + if (numThreads >= 256) {//compile time + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + } + if (numThreads >= 128) {//compile time + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + } + if (tid < mymin(32u,numThreads/2)) { 
+ //warpReduce(sdata, tid); + if(numThreads >= 64){//compile time + sdata[tid] += sdata[tid + 32]; + } + if(numThreads >= 32){//compile time + sdata[tid] += sdata[tid + 16]; + } + if(numThreads >= 16){//compile time + sdata[tid] += sdata[tid + 8]; + } + if(numThreads >= 8){//compile time + sdata[tid] += sdata[tid + 4]; + } + if(numThreads >= 4){//compile time + sdata[tid] += sdata[tid + 2]; + } + if(numThreads >= 2){//compile time + sdata[tid] += sdata[tid + 1]; + } + } + // write result for this block to global memory + if (tid == 0){ + outputArray[where] = sdata[0]; + } +} + } //namespace CUDAHELPERS } //namespace PLMD #endif //__PLUMED_cuda_helpers_cuh diff --git a/src/cudaCoord/ndReduction.cu b/src/cudaCoord/ndReduction.cu index 1ff02e6450..321c854b9d 100644 --- a/src/cudaCoord/ndReduction.cu +++ b/src/cudaCoord/ndReduction.cu @@ -11,6 +11,7 @@ //#define vdbg(...) std::cerr << std::setw(4) << __LINE__ <<":" << std::setw(20)<< #__VA_ARGS__ << " " << (__VA_ARGS__) <<'\n' #define vdbg(...) +// Some help for me: // * Grids map to GPUs // * Blocks map to the MultiProcessors (MP) // * Threads map to Stream Processors (SP) @@ -18,7 +19,8 @@ //There are a LOTS of unrolled loop down here, -//see this https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf to undestand why +//see this https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf +// to undestand why //https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#dim3 //7.3.2. dim3 @@ -27,55 +29,7 @@ namespace PLMD { namespace CUDAHELPERS { template -__device__ void warpReduce(volatile T* sdata, const unsigned int place){ - //Instructions are SIMD synchronous within a warp - //so no need for __syncthreads(), in the last iterations - if(numThreads >= 64){//compile time - sdata[place] += sdata[place + 32]; - } - if(numThreads >= 32){//compile time - sdata[place] += sdata[place + 16]; - } - if(numThreads >= 16){//compile time - sdata[place] += sdata[place + 8]; - } - if(numThreads >= 8){//compile time - sdata[place] += sdata[place + 4]; - } - if(numThreads >= 4){//compile time - sdata[place] += sdata[place + 2]; - } - if(numThreads >= 2){//compile time - sdata[place] += sdata[place + 1]; - } -} - -template -__device__ void reductor(volatile T* sdata,T *g_odata,const unsigned int where){ - const unsigned int tid=threadIdx.x; - if (numThreads >= 512) {//compile time - if (tid < 256) { - sdata[tid] += sdata[tid + 256]; } __syncthreads(); - } - if (numThreads >= 256) {//compile time - if (tid < 128) { - sdata[tid] += sdata[tid + 128]; } __syncthreads(); - } - if (numThreads >= 128) {//compile time - if (threadIdx. 
x < 64) { - sdata[tid] += sdata[tid + 64]; } __syncthreads(); - } - if (tid < mymin(32u,numThreads/2)) { - warpReduce(sdata, tid); - } - // write result for this block to global memory - if (tid == 0){ - g_odata[where] = sdata[0]; - } -} - -template -__global__ void reductionND(const T *g_idata, T *g_odata, const unsigned int len) { +__global__ void reductionND(const T *inputArray, T *outputArray, const unsigned int len) { //playing with this //https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf auto sdata = shared_memory_proxy(); @@ -90,94 +44,46 @@ __global__ void reductionND(const T *g_idata, T *g_odata, const unsigned int len sdata[threadIdx.x] = T(0); while (i+numThreads < trgt) { - sdata[threadIdx.x] += g_idata[i] + g_idata[i+numThreads]; + sdata[threadIdx.x] += inputArray[i] + inputArray[i+numThreads]; i+=gridSize; } while (i < trgt) { - sdata[threadIdx.x] += g_idata[i]; + sdata[threadIdx.x] += inputArray[i]; i+=gridSize; } __syncthreads(); // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x+blockIdx.y*gridDim.x); + reductor(sdata,outputArray,blockIdx.x+blockIdx.y*gridDim.x); } template -__global__ void reduction1D(T *g_idata, T *g_odata, const unsigned int len) { +__global__ void reduction1D(T *inputArray, T *outputArray, const unsigned int len) { auto sdata = shared_memory_proxy(); const unsigned int place = threadIdx.x; // each thread sums some elements from global to shared memory unsigned int i = (2*numThreads)*blockIdx.x + place; const unsigned int gridSize = (2*numThreads)*gridDim.x; sdata[place] = T(0); - //I think this may slow down the loop, but this does not force the user to have - //an input that is multiple of the threads, padded with zeros - while (i+numThreads < len) { - sdata[place] += g_idata[i] + g_idata[i+numThreads]; + //The double while is for preventig wrong memory + while ( i + numThreads < len) { + sdata[place] += inputArray[i] + inputArray[i+numThreads]; i+=gridSize; } while (i < len) { - sdata[place] += g_idata[i]; + sdata[place] += inputArray[i]; i+=gridSize; } __syncthreads(); // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x); -} - -template -__global__ void reductionVirial(T *g_idata, T *g_odata, const unsigned int len) { - //we saved the dd in an array x0 x1 x2..,xn-1,y0 y1 y2..,yn-1,z0 z1 z2..,zn-1 - //virialOut[ii*3+jj]-=d[ii]*d[jj]*dfunc; - auto sdata = shared_memory_proxy(); - const unsigned int ii = blockIdx.y; - const unsigned int jj = blockIdx.z; - const unsigned int vcoord = ii*3+jj; - const unsigned int place = threadIdx.x; - // each thread loads one element from global to shared mem - - // each thread loads one element from global to shared memory - const unsigned int diplacementI = ii*len; - const unsigned int diplacementJ = jj*len; - unsigned int i = (numThreads*2)*blockIdx.x + place + diplacementI; - unsigned int j = (numThreads*2)*blockIdx.x + place + diplacementJ; - unsigned int dfunc = (numThreads*2)*blockIdx.x + place + 3*len; - const unsigned int gridSize = (numThreads*2)*gridDim.x; - //the first element is in blockIdx.y*len, the last element to sum in (blockIdx.y+1)*len-1 - const unsigned int trgt=diplacementI + len; - - - sdata[place] = T(0); - //I think this may slow down the loop, but this does not force the user to have - //an input that is multiple of the threads, padded with zeros - while (i+numThreads < trgt) { - sdata[place] -= g_idata[i]*g_idata[j]*g_idata[dfunc] - + g_idata[i+numThreads]*g_idata[j+numThreads]*g_idata[dfunc+numThreads]; - i+=gridSize; - j+=gridSize; - 
dfunc+=gridSize; - } - while (i < trgt) { - sdata[place] -= g_idata[i]*g_idata[j]*g_idata[dfunc]; - i+=gridSize; - j+=gridSize; - dfunc+=gridSize; - } - - __syncthreads(); - // do reduction in shared memory - reductor(sdata,g_odata,blockIdx.x - + - vcoord * gridDim.x - ); + reductor(sdata,outputArray,blockIdx.x); } //after c++14 the template activation will be shorter to write: //template, bool> = true> -///finds the nearest upper multiple of the given reference (wit non increments) +///finds the nearest upper multiple of the given reference template::value, bool>::type = true> inline T nearestUpperMultipleTo(T number, T reference){ @@ -185,7 +91,7 @@ typename std::enable_if::value, bool>::type = true> } ///We'll find the ideal number of blocks using the Brent's theorem -size_t getIdealGroups(size_t numberOfElements, size_t runningThreads){ +size_t idealGroups(size_t numberOfElements, size_t runningThreads){ //nearest upper multiple to the numberof threads const size_t nnToGPU=nearestUpperMultipleTo(numberOfElements,runningThreads); ///Brent’s theorem says each thread should sum O(log n) elements @@ -197,7 +103,7 @@ size_t getIdealGroups(size_t numberOfElements, size_t runningThreads){ } -size_t decideThreadsPerBlock(unsigned N, unsigned maxNumThreads=512){ +size_t threadsPerBlock(unsigned N, unsigned maxNumThreads){ //this seeks the minimum number of threads to use a sigle block (and end the recursion) size_t dim=32; for (dim=32;dim<512;dim<<=1){ @@ -213,22 +119,27 @@ size_t decideThreadsPerBlock(unsigned N, unsigned maxNumThreads=512){ } template -void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, const unsigned blocks, const unsigned nthreads){ +void callReduction1D ( + T *inputArray, + T *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads){ switch (nthreads) { case 512: - reduction1D<512,T><<>>(g_idata,g_odata, len); + reduction1D<512,T><<>>(inputArray,outputArray, len); break; case 256: - reduction1D<256,T><<>>(g_idata,g_odata, len); + reduction1D<256,T><<>>(inputArray,outputArray, len); break; case 128: - reduction1D<128,T><<>>(g_idata,g_odata, len); + reduction1D<128,T><<>>(inputArray,outputArray, len); break; case 64: - reduction1D<64, T><<>>(g_idata,g_odata, len); + reduction1D<64, T><<>>(inputArray,outputArray, len); break; case 32: - reduction1D<32, T><<>>(g_idata,g_odata, len); + reduction1D<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -236,22 +147,26 @@ void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, const unsi } template -void callReductionND (T *g_idata, T *g_odata, const unsigned int len, const dim3 blocks, const unsigned nthreads){ +void callReductionND (T *inputArray, + T *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads){ switch (nthreads) { case 512: - reductionND<512,T><<>>(g_idata,g_odata, len); + reductionND<512,T><<>>(inputArray,outputArray, len); break; case 256: - reductionND<256,T><<>>(g_idata,g_odata, len); + reductionND<256,T><<>>(inputArray,outputArray, len); break; case 128: - reductionND<128,T><<>>(g_idata,g_odata, len); + reductionND<128,T><<>>(inputArray,outputArray, len); break; case 64: - reductionND<64, T><<>>(g_idata,g_odata, len); + reductionND<64, T><<>>(inputArray,outputArray, len); break; case 32: - reductionND<32, T><<>>(g_idata,g_odata, len); + reductionND<32, T><<>>(inputArray,outputArray, len); break; default: 
plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -263,10 +178,10 @@ double reduceScalar(double* cudaScalarAddress, unsigned N, unsigned maxNumThread double *reduceOut = cudaScalarAddress; double *reduceIn; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - auto ngroups=getIdealGroups(N, runningThreads); + auto ngroups=idealGroups(N, runningThreads); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups * sizeof(double)); callReduction1D (reduceIn, reduceOut, N, ngroups, runningThreads); @@ -288,9 +203,9 @@ double reduceScalar(memoryHolder& cudaScalarAddress, memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaScalarAddress; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - auto ngroups=getIdealGroups(N, runningThreads); + auto ngroups=idealGroups(N, runningThreads); reduceOut->resize(ngroups); callReduction1D (reduceIn->pointer(), reduceOut->pointer(), N, ngroups, runningThreads); N=ngroups; @@ -304,27 +219,16 @@ std::vector reduceNVectors(double* cudaNVectorAddress, unsigned N, unsig double *reduceOut = cudaNVectorAddress; double *reduceIn; auto dim = nat*3; - vdbg("InNVectors"); while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - vdbg(N); - vdbg(cudaNVectorAddress); - vdbg(reduceIn); - vdbg(reduceOut); - vdbg(runningThreads); - dim3 ngroups(getIdealGroups(N, runningThreads),dim); - vdbg(ngroups.x); - vdbg(ngroups.y); + dim3 ngroups(idealGroups(N, runningThreads),dim); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups.y* ngroups.x * sizeof(double)); - vdbg(reduceOut); - callReductionND (reduceIn, reduceOut, N, ngroups, runningThreads); if (reduceIn != cudaNVectorAddress){ - vdbg("Free reduceIn"); cudaFree(reduceIn); } N=ngroups.x; @@ -332,35 +236,36 @@ std::vector reduceNVectors(double* cudaNVectorAddress, unsigned N, unsig std::vector toret(nat); cudaMemcpy(&toret[0][0], reduceOut, 3*nat*sizeof(double), cudaMemcpyDeviceToHost); cudaFree(reduceOut); - vdbg(toret[0]); return toret; } //THIS DOES NOT KEEP THE DATA SAFE -std::vector reduceNVectors(memoryHolder& cudaNVectorAddress, - memoryHolder& memoryHelper, -unsigned N, unsigned nat, unsigned maxNumThreads){ +std::vector reduceNVectors( + memoryHolder& cudaNVectorAddress, + memoryHolder& memoryHelper, + unsigned N, + unsigned nat, + unsigned maxNumThreads){ memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaNVectorAddress; auto dim = nat*3; - vdbg("InNVectors"); while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - dim3 ngroups(getIdealGroups(N, runningThreads),dim); + dim3 ngroups(idealGroups(N, runningThreads),dim); reduceOut->resize(ngroups.y* ngroups.x); - - - callReductionND (reduceIn->pointer(), reduceOut->pointer(), N, ngroups, runningThreads); - + callReductionND ( + reduceIn->pointer(), + reduceOut->pointer(), + N, + ngroups, + runningThreads); N=ngroups.x; } std::vector toret(nat); reduceOut->copyFromCuda(&toret[0][0]); //cudaMemcpy(&toret[0][0], reduceOut, 3*nat*sizeof(double), cudaMemcpyDeviceToHost); - - vdbg(toret[0]); return toret; } @@ -370,10 +275,10 @@ Vector 
reduceVector(double* cudaVectorAddress, unsigned N, unsigned maxNumThread double *reduceOut = cudaVectorAddress; double *reduceIn; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); reduceIn = reduceOut; reduceOut = nullptr; - dim3 ngroups(getIdealGroups(N, runningThreads),3); + dim3 ngroups(idealGroups(N, runningThreads),3); cudaFree(reduceOut); cudaMalloc(&reduceOut,ngroups.y* ngroups.x * sizeof(double)); @@ -396,9 +301,9 @@ memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads){ memoryHolder* reduceIn= &memoryHelper; memoryHolder* reduceOut =&cudaTensorAddress; while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); + size_t runningThreads = threadsPerBlock(N,maxNumThreads); std::swap(reduceIn,reduceOut); - dim3 ngroups(getIdealGroups(N, runningThreads),9); + dim3 ngroups(idealGroups(N, runningThreads),9); reduceOut->resize(ngroups.y* ngroups.x); callReductionND (reduceIn->pointer(), reduceOut->pointer(), @@ -412,23 +317,23 @@ memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads){ } template -void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, +void callReduction1D (T *inputArray, T *outputArray, const unsigned int len, const unsigned blocks, const unsigned nthreads,cudaStream_t stream=0){ switch (nthreads) { case 512: - reduction1D<512,T><<>>(g_idata,g_odata, len); + reduction1D<512,T><<>>(inputArray,outputArray, len); break; case 256: - reduction1D<256,T><<>>(g_idata,g_odata, len); + reduction1D<256,T><<>>(inputArray,outputArray, len); break; case 128: - reduction1D<128,T><<>>(g_idata,g_odata, len); + reduction1D<128,T><<>>(inputArray,outputArray, len); break; case 64: - reduction1D<64, T><<>>(g_idata,g_odata, len); + reduction1D<64, T><<>>(inputArray,outputArray, len); break; case 32: - reduction1D<32, T><<>>(g_idata,g_odata, len); + reduction1D<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); @@ -436,111 +341,46 @@ void callReduction1D (T *g_idata, T *g_odata, const unsigned int len, } template -void callReductionND (T *g_idata, T *g_odata, const unsigned int len, +void callReductionND (T *inputArray, T *outputArray, const unsigned int len, const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ switch (nthreads) { case 512: - reductionND<512,T><<>>(g_idata,g_odata, len); - break; - case 256: - reductionND<256,T><<>>(g_idata,g_odata, len); - break; - case 128: - reductionND<128,T><<>>(g_idata,g_odata, len); - break; - case 64: - reductionND<64, T><<>>(g_idata,g_odata, len); - break; - case 32: - reductionND<32, T><<>>(g_idata,g_odata, len); - break; - default: - plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); - } -} - -template -void callReductionVirial (T *g_idata, T *g_odata, const unsigned int len, - const dim3 blocks, const unsigned nthreads,cudaStream_t stream=0){ - switch (nthreads) { - case 512: - reductionVirial<512,T><<>>(g_idata,g_odata, len); + reductionND<512,T><<>>(inputArray,outputArray, len); break; case 256: - reductionVirial<256,T><<>>(g_idata,g_odata, len); + reductionND<256,T><<>>(inputArray,outputArray, len); break; case 128: - reductionVirial<128,T><<>>(g_idata,g_odata, len); + reductionND<128,T><<>>(inputArray,outputArray, len); break; case 64: - reductionVirial<64, T><<>>(g_idata,g_odata, len); + reductionND<64, T><<>>(inputArray,outputArray, len); break; case 32: - 
reductionVirial<32, T><<>>(g_idata,g_odata, len); + reductionND<32, T><<>>(inputArray,outputArray, len); break; default: plumed_merror("Reduction can be called only with 512, 256, 128, 64 or 32 threads."); } } -//#define vdbg(...) std::cerr << std::setw(4) << __LINE__ <<":" << std::setw(20)<< #__VA_ARGS__ << " " << (__VA_ARGS__) <<'\n' -//if this is working I might use something similar to the sharedptr/weakptr -VS reduceVS(memoryHolder& derivativeIn, - memoryHolder& virialIn, - memoryHolder& scalarIn, - memoryHolder& pairListIn, - memoryHolder& memoryHelperV, - memoryHolder& memoryHelperS, - const cudaStream_t streamVirial, - const cudaStream_t streamScalar, - unsigned N, unsigned nat, - unsigned maxNumThreads -){ - //the memory is assigned swapped because it will be swapped at each loop iteration - memoryHolder* reduceScalarIn = &scalarIn; - memoryHolder* reduceSOut = &memoryHelperS; - - memoryHolder* reduceVirialIn = &virialIn; - memoryHolder* reduceVirialOut = &memoryHelperV; - - auto dim = nat*3; - - bool first=true; - VS toret; - while(N>1){ - size_t runningThreads = decideThreadsPerBlock(N,maxNumThreads); - unsigned ngroupsS=getIdealGroups(N, runningThreads); - - reduceVirialOut->resize(9* ngroupsS); - reduceSOut->resize(ngroupsS); - - if (first){ - dim3 ngroupsVirial(ngroupsS,3,3); - callReductionVirial (derivativeIn.pointer(), - reduceVirialOut->pointer(), - N, ngroupsVirial, runningThreads,streamVirial); - }else{ - dim3 ngroupsVirial(ngroupsS,9); - callReductionND (reduceVirialIn->pointer(), reduceVirialOut->pointer(), - N, ngroupsVirial, runningThreads, streamVirial); - } - if (ngroupsS==1) - reduceVirialOut->copyFromCuda(&toret.virial[0][0],streamVirial); - - callReduction1D (reduceScalarIn->pointer(), reduceSOut->pointer(), - N, ngroupsS, runningThreads,streamScalar); - if (ngroupsS==1) - reduceSOut->copyFromCuda(&toret.scalar,streamScalar); - - std::swap(reduceVirialIn,reduceVirialOut); - std::swap(reduceScalarIn,reduceSOut); - first=false; - N=ngroupsS; - } - return toret; -} - - +void doReduction1D (double *inputArray, + double *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads, + cudaStream_t stream){ + callReduction1D (inputArray, outputArray, len, blocks, nthreads, stream); + } + +void doReductionND (double *inputArray, + double *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads, + cudaStream_t stream){ + callReductionND (inputArray, outputArray, len, blocks, nthreads, stream); + } } //namespace CUDAHELPERS } //namespace PLMD diff --git a/src/cudaCoord/ndReduction.h b/src/cudaCoord/ndReduction.h index 6c47d9e3b5..f1920cc161 100644 --- a/src/cudaCoord/ndReduction.h +++ b/src/cudaCoord/ndReduction.h @@ -123,47 +123,22 @@ unsigned N, unsigned maxNumThreads); double reduceScalar(memoryHolder& cudaScalarAddress, memoryHolder& memoryHelper, unsigned N, unsigned maxNumThreads=512); - struct VS{ - Tensor virial; - double scalar; -}; +void doReduction1D (double *inputArray, + double *outputArray, + const unsigned int len, + const unsigned blocks, + const unsigned nthreads, + cudaStream_t stream=0); -/** @brief reduces the coordination, the derivatives and the virial from a coordination calculation - * @param derivativeIn the 3 * N array to be reduced with the compontent sof the derivatives - * @param scalarIn the N array to be reduced with the values of the coordination - * @param memoryHelperD preallocated GPU memory for speed up (for the derivatives) - * @param memoryHelperV preallocated GPU memory for 
speed up (for the virial) - * @param memoryHelperS preallocated GPU memory for speed up (for the scalar/coordination) - * @param streamDerivatives preinitializated stream for concurrency (for the derivatives) - * @param streamVirial preinitializated stream for concurrency (for the virial) - * @param streamScalar preinitializated stream for concurrency (for the scalar/coordination) - * @param cudaScalarAddress the pointer to the memory in cuda - * @param N the number of scalar to reduce - * @param nat the number of atoms - * @param maxNumThreads limits the number of threads per block to be used - * @return the derivative, the virial and the coordination - * - * reduceDVS - * The memory helpers will be resized if needed (the memory occupied may be increased but not reduced) - * - * The algorithm will decide the best number of threads to be used in - * an extra nthreads * sizeof(double) will be allocated on the GPU, - * where nthreads is the total number of threads that will be used - * - * @note cudaScalarAddress is threated as not owned: the user will need to call cudaFree on it!!! - */ -VS reduceVS( - memoryHolder& derivativeIn, - memoryHolder& virialIn, - memoryHolder& scalarIn, - memoryHolder& pairListIn, - memoryHolder& memoryHelperV, - memoryHolder& memoryHelperS, - cudaStream_t streamVirial, - cudaStream_t streamScalar, - unsigned N, unsigned nat, - unsigned maxNumThreads=512); +void doReductionND (double *inputArray, + double *outputArray, + const unsigned int len, + const dim3 blocks, + const unsigned nthreads, + cudaStream_t stream=0); +size_t idealGroups(size_t numberOfElements, size_t runningThreads); +size_t threadsPerBlock(unsigned N, unsigned maxNumThreads=512); } //namespace CUDAHELPERS } //namespace PLMD -#endif //__PLUMED_cuda_ndReduction_h \ No newline at end of file +#endif //__PLUMED_cuda_ndReduction_h
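Note on the derivative accumulation introduced in CudaCoordination::calculate() (src/cudaCoord/Coordination.cu, above): the per-pair geometric factors (-d and +d for the two atoms of each pair) are stored as the values of a (3*nat x npairs) CSC matrix with six non-zeros per column, and the per-atom derivatives then come out of a single cusparseSpMV of that matrix with the dense vector of per-pair dfunc values. The following standalone sketch shows the same pattern on a toy system; it is not the patch code — the sizes, toy values, and variable names are invented for illustration, error/status checking is omitted, and it assumes a CUDA toolkit recent enough to provide cusparseCreateCsc and CUSPARSE_SPMV_ALG_DEFAULT (which the patch itself already requires).

    // Sketch: accumulate per-atom derivatives as y = S * dfunc with cuSPARSE.
    // S is (3*nat x npairs) in CSC form, 6 non-zeros per pair (3 per atom).
    #include <cstdio>
    #include <cstdint>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cusparse.h>

    int main() {
      const int64_t nat = 3, npairs = 2, rows = 3 * nat, nnz = 6 * npairs;
      // pair 0 couples atoms 0-1, pair 1 couples atoms 1-2; d vectors are dummies
      std::vector<double>  vals   = {-1,-1,-1, 1,1,1,   -2,-2,-2, 2,2,2};
      std::vector<int64_t> rowInd = { 0, 1, 2, 3,4,5,    3, 4, 5, 6,7,8};
      std::vector<int64_t> colOff = {0, 6, 12};       // npairs+1 column offsets
      std::vector<double>  dfunc  = {0.5, 0.25};      // dense per-pair factors

      double *dVals, *dDfunc, *dOut; int64_t *dRow, *dCol;
      cudaMalloc(&dVals, nnz * sizeof(double));
      cudaMalloc(&dRow,  nnz * sizeof(int64_t));
      cudaMalloc(&dCol,  (npairs + 1) * sizeof(int64_t));
      cudaMalloc(&dDfunc, npairs * sizeof(double));
      cudaMalloc(&dOut,   rows * sizeof(double));
      cudaMemcpy(dVals, vals.data(),   nnz * sizeof(double),  cudaMemcpyHostToDevice);
      cudaMemcpy(dRow,  rowInd.data(), nnz * sizeof(int64_t), cudaMemcpyHostToDevice);
      cudaMemcpy(dCol,  colOff.data(), (npairs + 1) * sizeof(int64_t), cudaMemcpyHostToDevice);
      cudaMemcpy(dDfunc, dfunc.data(), npairs * sizeof(double), cudaMemcpyHostToDevice);

      cusparseHandle_t handle; cusparseCreate(&handle);
      cusparseSpMatDescr_t mat;
      cusparseCreateCsc(&mat, rows, npairs, nnz, dCol, dRow, dVals,
                        CUSPARSE_INDEX_64I, CUSPARSE_INDEX_64I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F);
      cusparseDnVecDescr_t x, y;
      cusparseCreateDnVec(&x, npairs, dDfunc, CUDA_R_64F);  // per-pair dfunc
      cusparseCreateDnVec(&y, rows,   dOut,   CUDA_R_64F);  // per-atom derivatives

      double one = 1.0, zero = 0.0;
      size_t bufSize = 0; void *buf = nullptr;
      cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat,
                              x, &zero, y, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufSize);
      cudaMalloc(&buf, bufSize);
      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &one, mat,
                   x, &zero, y, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buf);

      std::vector<double> deriv(rows);
      cudaMemcpy(deriv.data(), dOut, rows * sizeof(double), cudaMemcpyDeviceToHost);
      for (int64_t i = 0; i < rows; ++i)
        std::printf("deriv[%lld] = %g\n", (long long)i, deriv[i]);

      cusparseDestroyDnVec(x); cusparseDestroyDnVec(y); cusparseDestroySpMat(mat);
      cusparseDestroy(handle);
      cudaFree(dVals); cudaFree(dRow); cudaFree(dCol);
      cudaFree(dDfunc); cudaFree(dOut); cudaFree(buf);
      return 0;
    }

Design choice behind the CSC layout: every pair contributes exactly six entries (x, y, z for each of its two atoms), so the column offsets are trivially 6*p and the scatter-add over atoms, which would otherwise need atomics, is delegated entirely to the SpMV.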
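The reduction helpers (reduction1D/reductionND in src/cudaCoord/ndReduction.cu, the reductor device function in cudaHelpers.cuh, and the while(N>1) host loop in calculate()) all follow the same multi-pass scheme: each launch collapses the array to one partial sum per block, and the host relaunches on the partials until a single value is left. Below is a minimal self-contained sketch of that scheme, with an invented kernel name (blockReduceSum) and a made-up problem size, using a plain shared-memory tree reduction instead of the unrolled warp stage the patch uses.

    // Sketch: multi-pass block reduction, one partial sum per block per launch.
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    template <unsigned numThreads>
    __global__ void blockReduceSum(const double *in, double *out, unsigned len) {
      __shared__ double sdata[numThreads];
      const unsigned tid = threadIdx.x;
      // grid-strided accumulation: each thread folds several input elements
      unsigned i = blockIdx.x * (2 * numThreads) + tid;
      const unsigned gridSize = 2 * numThreads * gridDim.x;
      double sum = 0.0;
      while (i + numThreads < len) {
        sum += in[i] + in[i + numThreads];
        i += gridSize;
      }
      while (i < len) {        // tail pass, so len need not be a multiple of the block
        sum += in[i];
        i += gridSize;
      }
      sdata[tid] = sum;
      __syncthreads();
      // plain shared-memory tree reduction (the patch unrolls the last warp instead)
      for (unsigned s = numThreads / 2; s > 0; s >>= 1) {
        if (tid < s) sdata[tid] += sdata[tid + s];
        __syncthreads();
      }
      if (tid == 0) out[blockIdx.x] = sdata[0];
    }

    int main() {
      const unsigned N = 1u << 20;     // made-up problem size
      const unsigned threads = 256;    // power of two, as the dispatch switch requires
      std::vector<double> h(N, 1.0);   // the sum should come out as N
      double *din = nullptr, *dout = nullptr;
      cudaMalloc(&din, N * sizeof(double));
      cudaMemcpy(din, h.data(), N * sizeof(double), cudaMemcpyHostToDevice);
      unsigned n = N;
      while (n > 1) {                  // same host-side loop shape as in calculate()
        const unsigned blocks = (n + 2 * threads - 1) / (2 * threads);
        cudaMalloc(&dout, blocks * sizeof(double));
        blockReduceSum<threads><<<blocks, threads>>>(din, dout, n);
        cudaFree(din);
        din = dout;
        n = blocks;
      }
      double result = 0.0;
      cudaMemcpy(&result, din, sizeof(double), cudaMemcpyDeviceToHost);
      std::printf("sum = %f (expected %u)\n", result, N);
      cudaFree(din);
      return 0;
    }

Keeping the per-block width a compile-time template parameter, as the patch does with numThreads, lets the compiler unroll the in-block loop; that is why doReduction1D/doReductionND and doReductionVirial dispatch through a switch over 512/256/128/64/32 threads rather than taking the block size at run time.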