diff --git a/src/alpaka/AlpakaCore/HistoContainer.h b/src/alpaka/AlpakaCore/HistoContainer.h index 397881fe0..ae64e1c56 100644 --- a/src/alpaka/AlpakaCore/HistoContainer.h +++ b/src/alpaka/AlpakaCore/HistoContainer.h @@ -74,17 +74,15 @@ namespace cms { const unsigned int nblocks = (num_items + nthreads - 1) / nthreads; const Vec1 blocksPerGrid(nblocks); + auto d_pc = cms::alpakatools::allocDeviceBuf(1u); + int32_t *pc = alpaka::getPtrNative(d_pc); + alpaka::memset(queue, d_pc, 0, 1u); + const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread); alpaka::enqueue(queue, alpaka::createTaskKernel( - workDiv, multiBlockPrefixScanFirstStep(), poff, poff, num_items)); - - const WorkDiv1 &workDivWith1Block = - cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread); - alpaka::enqueue( - queue, - alpaka::createTaskKernel( - workDivWith1Block, multiBlockPrefixScanSecondStep(), poff, poff, num_items, nblocks)); + workDiv, multiBlockPrefixScan(), poff, poff, num_items, pc)); + alpaka::wait(queue); } template diff --git a/src/alpaka/AlpakaCore/prefixScan.h b/src/alpaka/AlpakaCore/prefixScan.h index a23a2e342..c30fb61ef 100644 --- a/src/alpaka/AlpakaCore/prefixScan.h +++ b/src/alpaka/AlpakaCore/prefixScan.h @@ -4,6 +4,7 @@ #include #include "AlpakaCore/alpakaConfig.h" +#include "AlpakaCore/threadfence.h" #include "Framework/CMSUnrollLoop.h" #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED @@ -49,23 +50,23 @@ namespace cms { #endif ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); + ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32); auto first = blockThreadIdx; auto mask = __ballot_sync(0xffffffff, first < size); auto laneId = blockThreadIdx & 0x1f; - for (auto i = first; i < size; i += blockDimension) { + for (auto i = first; i < size; i += blockDim) { warpPrefixScan(laneId, ci, co, i, mask); auto warpId = i / 32; ALPAKA_ASSERT_OFFLOAD(warpId < 32); if (31 == laneId) ws[warpId] = co[i]; - mask = __ballot_sync(mask, i + blockDimension < size); + mask = __ballot_sync(mask, i + blockDim < size); } alpaka::syncBlockThreads(acc); if (size <= 32) @@ -74,7 +75,7 @@ namespace cms { warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); } alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { + for (auto i = first + 32; i < size; i += blockDim) { uint32_t warpId = i / 32; co[i] += ws[warpId - 1]; } @@ -97,23 +98,23 @@ namespace cms { #endif ) { #if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__ - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const gridBlockIdx(alpaka::getIdx(acc)[0u]); - uint32_t const blockThreadIdx(alpaka::getIdx(acc)[0u]); - assert(ws); + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t gridBlockIdx(alpaka::getIdx(acc)[0u]); + const int32_t blockThreadIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(ws); ALPAKA_ASSERT_OFFLOAD(size <= 1024); - ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32); + ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32); auto first = blockThreadIdx; auto mask = __ballot_sync(0xffffffff, first < size); auto laneId = blockThreadIdx & 0x1f; - for (auto i = first; i < size; i += blockDimension) { + for (auto i = first; i < size; i += blockDim) { warpPrefixScan(laneId, c, i, mask); auto warpId = i / 32; ALPAKA_ASSERT_OFFLOAD(warpId < 32); if (31 == laneId) ws[warpId] = c[i]; - mask = __ballot_sync(mask, i + blockDimension < size); + mask = __ballot_sync(mask, i + blockDim < size); } alpaka::syncBlockThreads(acc); if (size <= 32) @@ -122,7 +123,7 @@ namespace cms { warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff); } alpaka::syncBlockThreads(acc); - for (auto i = first + 32; i < size; i += blockDimension) { + for (auto i = first + 32; i < size; i += blockDim) { auto warpId = i / 32; c[i] += ws[warpId - 1]; } @@ -135,44 +136,45 @@ namespace cms { // limited to 1024*1024 elements.... template - struct multiBlockPrefixScanFirstStep { + struct multiBlockPrefixScan { template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const blockIdx(alpaka::getIdx(acc)[0u]); + ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t* pc) const { + const int32_t blockDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t threadDim(alpaka::getWorkDiv(acc)[0u]); + const int32_t blockIdx(alpaka::getIdx(acc)[0u]); + const int32_t threadIdx(alpaka::getIdx(acc)[0u]); - auto& ws = alpaka::declareSharedVar(acc); // first each block does a scan of size 1024; (better be enough blocks....) -#ifndef NDEBUG - uint32_t const gridDimension(alpaka::getWorkDiv(acc)[0u]); - ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= 1024); -#endif - int off = blockDimension * blockIdx * threadDimension; + int32_t const gridDim(alpaka::getWorkDiv(acc)[0u]); + ALPAKA_ASSERT_OFFLOAD(gridDim / threadDim <= 1024); + int off = blockDim * blockIdx * threadDim; + auto& ws = alpaka::declareSharedVar(acc); if (size - off > 0) - blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * threadDimension), size - off), ws); - } - }; + blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDim * threadDim), size - off), ws); - // limited to 1024*1024 elements.... - template - struct multiBlockPrefixScanSecondStep { - template - ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const { - uint32_t const blockDimension(alpaka::getWorkDiv(acc)[0u]); - uint32_t const threadDimension(alpaka::getWorkDiv(acc)[0u]); + auto& isLastBlockDone = alpaka::declareSharedVar(acc); + if (0 == threadIdx) { + cms::alpakatools::threadfence(acc); + auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter + isLastBlockDone = (value == (gridDim - 1)); + } - uint32_t const threadIdx(alpaka::getIdx(acc)[0u]); + alpaka::syncBlockThreads(acc); - auto* const psum(alpaka::getDynSharedMem(acc)); + if (!isLastBlockDone) + return; - // first each block does a scan of size 1024; (better be enough blocks....) - ALPAKA_ASSERT_OFFLOAD(static_cast(blockDimension * threadDimension) >= numBlocks); - for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { - int index = +threadIdx * threadDimension + elemId; + ALPAKA_ASSERT_OFFLOAD(gridDim == *pc); - if (index < numBlocks) { - int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1; + auto& psum = alpaka::declareSharedVar(acc); + + ALPAKA_ASSERT_OFFLOAD(static_cast(blockDim * threadDim) >= gridDim); + + for (int elemId = 0; elemId < static_cast(threadDim); ++elemId) { + int index = +threadIdx * threadDim + elemId; + + if (index < gridDim) { + int lastElementOfPreviousBlockId = index * blockDim * threadDim - 1; psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0) ? co[lastElementOfPreviousBlockId] : T(0); @@ -180,53 +182,18 @@ namespace cms { } alpaka::syncBlockThreads(acc); + blockPrefixScan(acc, psum, psum, gridDim, ws); - auto& ws = alpaka::declareSharedVar(acc); - blockPrefixScan(acc, psum, psum, numBlocks, ws); - - for (int elemId = 0; elemId < static_cast(threadDimension); ++elemId) { - int first = threadIdx * threadDimension + elemId; - for (int i = first + blockDimension * threadDimension; i < size; i += blockDimension * threadDimension) { - auto k = i / (blockDimension * threadDimension); + for (int elemId = 0; elemId < static_cast(threadDim); ++elemId) { + int first = threadIdx * threadDim + elemId; + for (int i = first + blockDim * threadDim; i < size; i += blockDim * threadDim) { + auto k = i / (blockDim * threadDim); co[i] += psum[k]; } } } }; - } // namespace alpakatools } // namespace cms -namespace alpaka { - namespace traits { - - //############################################################################# - //! The trait for getting the size of the block shared dynamic memory for a kernel. - template - struct BlockSharedMemDynSizeBytes, TAcc> { - //----------------------------------------------------------------------------- - //! \return The size of the shared memory allocated for a block. - template - ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes( - cms::alpakatools::multiBlockPrefixScanSecondStep const& myKernel, - TVec const& blockThreadExtent, - TVec const& threadElemExtent, - T const* ci, - T* co, - int32_t size, - int32_t numBlocks) -> T { - alpaka::ignore_unused(myKernel); - alpaka::ignore_unused(blockThreadExtent); - alpaka::ignore_unused(threadElemExtent); - alpaka::ignore_unused(ci); - alpaka::ignore_unused(co); - alpaka::ignore_unused(size); - - return static_cast(numBlocks) * sizeof(T); - } - }; - - } // namespace traits -} // namespace alpaka - #endif // HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h diff --git a/src/alpaka/test/alpaka/prefixScan_t.cc b/src/alpaka/test/alpaka/prefixScan_t.cc index cb0a053b9..99289febc 100644 --- a/src/alpaka/test/alpaka/prefixScan_t.cc +++ b/src/alpaka/test/alpaka/prefixScan_t.cc @@ -176,23 +176,14 @@ int main() { cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4); std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl; - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlock, - cms::alpakatools::multiBlockPrefixScanFirstStep(), - input_d, - output1_d, - num_items)); - - const Vec1 blocksPerGridSecondStep(Vec1::all(1)); - const WorkDiv1& workDivMultiBlockSecondStep = - cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4); - alpaka::enqueue(queue, - alpaka::createTaskKernel(workDivMultiBlockSecondStep, - cms::alpakatools::multiBlockPrefixScanSecondStep(), - input_d, - output1_d, - num_items, - nBlocks)); + auto d_pc(alpaka::allocBuf(device, size)); + int32_t* pc = alpaka::getPtrNative(d_pc); + + alpaka::memset(queue, d_pc, 0, size); + alpaka::enqueue( + queue, + alpaka::createTaskKernel( + workDivMultiBlock, cms::alpakatools::multiBlockPrefixScan(), input_d, output1_d, num_items, pc)); alpaka::enqueue(queue, alpaka::createTaskKernel(workDivMultiBlock, verify(), output1_d, num_items));