[alpaka] Refactor prefixScan implementation
antoniopetre authored and fwyzard committed Oct 14, 2021
1 parent 9ee23ee commit d427564
Showing 3 changed files with 66 additions and 110 deletions.
14 changes: 6 additions & 8 deletions src/alpaka/AlpakaCore/HistoContainer.h
@@ -74,17 +74,15 @@ namespace cms {
const unsigned int nblocks = (num_items + nthreads - 1) / nthreads;
const Vec1 blocksPerGrid(nblocks);

auto d_pc = cms::alpakatools::allocDeviceBuf<int32_t>(1u);
int32_t *pc = alpaka::getPtrNative(d_pc);
alpaka::memset(queue, d_pc, 0, 1u);

const WorkDiv1 &workDiv = cms::alpakatools::make_workdiv(blocksPerGrid, threadsPerBlockOrElementsPerThread);
alpaka::enqueue(queue,
alpaka::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDiv, multiBlockPrefixScanFirstStep<uint32_t>(), poff, poff, num_items));

const WorkDiv1 &workDivWith1Block =
cms::alpakatools::make_workdiv(Vec1::all(1), threadsPerBlockOrElementsPerThread);
alpaka::enqueue(
queue,
alpaka::createTaskKernel<ALPAKA_ACCELERATOR_NAMESPACE::Acc1>(
workDivWith1Block, multiBlockPrefixScanSecondStep<uint32_t>(), poff, poff, num_items, nblocks));
workDiv, multiBlockPrefixScan<uint32_t>(), poff, poff, num_items, pc));
alpaka::wait(queue);
}

template <typename Histo, typename T>
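
The host-side change above is the heart of the refactor: instead of launching multiBlockPrefixScanFirstStep over all blocks and then multiBlockPrefixScanSecondStep on a single block, the caller now allocates a one-element device counter pc, zeroes it, and passes it to a single multiBlockPrefixScan kernel. Inside the kernel that counter drives a "last block done" handshake. As a point of reference only, here is a minimal sketch of that handshake in plain CUDA; the kernel name and the elided payload are illustrative, not part of this commit. The full alpaka version, including the new threadfence helper, is in prefixScan.h below.

#include <cstdint>

// Sketch of the "last block done" pattern driven by the pc counter (plain CUDA,
// illustrative only). The host zeroes *pc before the launch, exactly as the
// alpaka::memset above does for d_pc.
__global__ void lastBlockDoneSketch(int32_t* pc /*, ... scan buffers ... */) {
  __shared__ bool isLastBlockDone;

  // ... each block first writes its own partial results ...

  if (threadIdx.x == 0) {
    __threadfence();                    // make this block's writes visible device-wide
    int32_t ticket = atomicAdd(pc, 1);  // take a ticket; one per block
    isLastBlockDone = (ticket == static_cast<int32_t>(gridDim.x) - 1);
  }
  __syncthreads();

  if (!isLastBlockDone)
    return;

  // Only the last block to finish reaches this point; it can now read every
  // other block's partial sums and complete the scan, which is what the
  // refactored multiBlockPrefixScan does.
}
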
137 changes: 52 additions & 85 deletions src/alpaka/AlpakaCore/prefixScan.h
@@ -4,6 +4,7 @@
#include <cstdint>

#include "AlpakaCore/alpakaConfig.h"
#include "AlpakaCore/threadfence.h"
#include "Framework/CMSUnrollLoop.h"

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
@@ -49,23 +50,23 @@ namespace cms {
#endif
) {
#if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__
uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
uint32_t const gridBlockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
uint32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
assert(ws);
const int32_t blockDim(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
const int32_t gridBlockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
const int32_t blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
ALPAKA_ASSERT_OFFLOAD(ws);
ALPAKA_ASSERT_OFFLOAD(size <= 1024);
ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32);
ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32);
auto first = blockThreadIdx;
auto mask = __ballot_sync(0xffffffff, first < size);
auto laneId = blockThreadIdx & 0x1f;

for (auto i = first; i < size; i += blockDimension) {
for (auto i = first; i < size; i += blockDim) {
warpPrefixScan(laneId, ci, co, i, mask);
auto warpId = i / 32;
ALPAKA_ASSERT_OFFLOAD(warpId < 32);
if (31 == laneId)
ws[warpId] = co[i];
mask = __ballot_sync(mask, i + blockDimension < size);
mask = __ballot_sync(mask, i + blockDim < size);
}
alpaka::syncBlockThreads(acc);
if (size <= 32)
@@ -74,7 +75,7 @@ namespace cms {
warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff);
}
alpaka::syncBlockThreads(acc);
for (auto i = first + 32; i < size; i += blockDimension) {
for (auto i = first + 32; i < size; i += blockDim) {
uint32_t warpId = i / 32;
co[i] += ws[warpId - 1];
}
@@ -97,23 +98,23 @@ namespace cms {
#endif
) {
#if defined ALPAKA_ACC_GPU_CUDA_ENABLED and __CUDA_ARCH__
uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
uint32_t const gridBlockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
uint32_t const blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
assert(ws);
const int32_t blockDim(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
const int32_t gridBlockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
const int32_t blockThreadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
ALPAKA_ASSERT_OFFLOAD(ws);
ALPAKA_ASSERT_OFFLOAD(size <= 1024);
ALPAKA_ASSERT_OFFLOAD(0 == blockDimension % 32);
ALPAKA_ASSERT_OFFLOAD(0 == blockDim % 32);
auto first = blockThreadIdx;
auto mask = __ballot_sync(0xffffffff, first < size);
auto laneId = blockThreadIdx & 0x1f;

for (auto i = first; i < size; i += blockDimension) {
for (auto i = first; i < size; i += blockDim) {
warpPrefixScan(laneId, c, i, mask);
auto warpId = i / 32;
ALPAKA_ASSERT_OFFLOAD(warpId < 32);
if (31 == laneId)
ws[warpId] = c[i];
mask = __ballot_sync(mask, i + blockDimension < size);
mask = __ballot_sync(mask, i + blockDim < size);
}
alpaka::syncBlockThreads(acc);
if (size <= 32)
@@ -122,7 +123,7 @@ namespace cms {
warpPrefixScan(laneId, ws, blockThreadIdx, 0xffffffff);
}
alpaka::syncBlockThreads(acc);
for (auto i = first + 32; i < size; i += blockDimension) {
for (auto i = first + 32; i < size; i += blockDim) {
auto warpId = i / 32;
c[i] += ws[warpId - 1];
}
@@ -135,98 +136,64 @@

// limited to 1024*1024 elements....
template <typename T>
struct multiBlockPrefixScanFirstStep {
struct multiBlockPrefixScan {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size) const {
uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
uint32_t const threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
uint32_t const blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t* pc) const {
const int32_t blockDim(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
const int32_t threadDim(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
const int32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
const int32_t threadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);

auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
// first each block does a scan of size 1024; (better be enough blocks....)
#ifndef NDEBUG
uint32_t const gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
ALPAKA_ASSERT_OFFLOAD(gridDimension / threadDimension <= 1024);
#endif
int off = blockDimension * blockIdx * threadDimension;
int32_t const gridDim(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
ALPAKA_ASSERT_OFFLOAD(gridDim / threadDim <= 1024);
int off = blockDim * blockIdx * threadDim;
auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
if (size - off > 0)
blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDimension * threadDimension), size - off), ws);
}
};
blockPrefixScan(acc, ci + off, co + off, std::min(int(blockDim * threadDim), size - off), ws);

// limited to 1024*1024 elements....
template <typename T>
struct multiBlockPrefixScanSecondStep {
template <typename T_Acc>
ALPAKA_FN_ACC void operator()(const T_Acc& acc, T const* ci, T* co, int32_t size, int32_t numBlocks) const {
uint32_t const blockDimension(alpaka::getWorkDiv<alpaka::Block, alpaka::Threads>(acc)[0u]);
uint32_t const threadDimension(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
auto& isLastBlockDone = alpaka::declareSharedVar<bool, __COUNTER__>(acc);
if (0 == threadIdx) {
cms::alpakatools::threadfence(acc);
auto value = alpaka::atomicAdd(acc, pc, 1, alpaka::hierarchy::Blocks{}); // block counter
isLastBlockDone = (value == (gridDim - 1));
}

uint32_t const threadIdx(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
alpaka::syncBlockThreads(acc);

auto* const psum(alpaka::getDynSharedMem<T>(acc));
if (!isLastBlockDone)
return;

// first each block does a scan of size 1024; (better be enough blocks....)
ALPAKA_ASSERT_OFFLOAD(static_cast<int32_t>(blockDimension * threadDimension) >= numBlocks);
for (int elemId = 0; elemId < static_cast<int>(threadDimension); ++elemId) {
int index = +threadIdx * threadDimension + elemId;
ALPAKA_ASSERT_OFFLOAD(gridDim == *pc);

if (index < numBlocks) {
int lastElementOfPreviousBlockId = index * blockDimension * threadDimension - 1;
auto& psum = alpaka::declareSharedVar<T[1024], __COUNTER__>(acc);

ALPAKA_ASSERT_OFFLOAD(static_cast<int32_t>(blockDim * threadDim) >= gridDim);

for (int elemId = 0; elemId < static_cast<int>(threadDim); ++elemId) {
int index = +threadIdx * threadDim + elemId;

if (index < gridDim) {
int lastElementOfPreviousBlockId = index * blockDim * threadDim - 1;
psum[index] = (lastElementOfPreviousBlockId < size and lastElementOfPreviousBlockId >= 0)
? co[lastElementOfPreviousBlockId]
: T(0);
}
}

alpaka::syncBlockThreads(acc);
blockPrefixScan(acc, psum, psum, gridDim, ws);

auto& ws = alpaka::declareSharedVar<T[32], __COUNTER__>(acc);
blockPrefixScan(acc, psum, psum, numBlocks, ws);

for (int elemId = 0; elemId < static_cast<int>(threadDimension); ++elemId) {
int first = threadIdx * threadDimension + elemId;
for (int i = first + blockDimension * threadDimension; i < size; i += blockDimension * threadDimension) {
auto k = i / (blockDimension * threadDimension);
for (int elemId = 0; elemId < static_cast<int>(threadDim); ++elemId) {
int first = threadIdx * threadDim + elemId;
for (int i = first + blockDim * threadDim; i < size; i += blockDim * threadDim) {
auto k = i / (blockDim * threadDim);
co[i] += psum[k];
}
}
}
};

} // namespace alpakatools
} // namespace cms

namespace alpaka {
namespace traits {

//#############################################################################
//! The trait for getting the size of the block shared dynamic memory for a kernel.
template <typename T, typename TAcc>
struct BlockSharedMemDynSizeBytes<cms::alpakatools::multiBlockPrefixScanSecondStep<T>, TAcc> {
//-----------------------------------------------------------------------------
//! \return The size of the shared memory allocated for a block.
template <typename TVec>
ALPAKA_FN_HOST_ACC static auto getBlockSharedMemDynSizeBytes(
cms::alpakatools::multiBlockPrefixScanSecondStep<T> const& myKernel,
TVec const& blockThreadExtent,
TVec const& threadElemExtent,
T const* ci,
T* co,
int32_t size,
int32_t numBlocks) -> T {
alpaka::ignore_unused(myKernel);
alpaka::ignore_unused(blockThreadExtent);
alpaka::ignore_unused(threadElemExtent);
alpaka::ignore_unused(ci);
alpaka::ignore_unused(co);
alpaka::ignore_unused(size);

return static_cast<size_t>(numBlocks) * sizeof(T);
}
};

} // namespace traits
} // namespace alpaka

#endif // HeterogeneousCore_AlpakaUtilities_interface_prefixScan_h
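
Two details in this file are worth spelling out. First, the combination step now keeps the per-block sums in a fixed block-shared array (declareSharedVar<T[1024], ...>) instead of dynamic shared memory, which is why the BlockSharedMemDynSizeBytes trait specialization for the old second-step kernel could be dropped. Second, the warpPrefixScan helper called by blockPrefixScan is not part of this diff; as a rough guide, a warp-wide inclusive scan of that kind is usually written with shuffle intrinsics along these lines. This is a sketch, not necessarily the repository's exact implementation.

// Illustrative warp-level inclusive scan over up to 32 lanes, in the spirit of
// the warpPrefixScan used by blockPrefixScan above.
template <typename T>
__device__ void warpInclusiveScanSketch(uint32_t laneId, T* c, uint32_t i, uint32_t mask) {
  T x = c[i];
  for (int offset = 1; offset < 32; offset <<= 1) {
    T y = __shfl_up_sync(mask, x, offset);  // value held by the lane `offset` positions below
    if (laneId >= static_cast<uint32_t>(offset))
      x += y;
  }
  c[i] = x;  // c[i] now holds the inclusive prefix sum within the active warp
}
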
25 changes: 8 additions & 17 deletions src/alpaka/test/alpaka/prefixScan_t.cc
@@ -176,23 +176,14 @@ int main() {
cms::alpakatools::make_workdiv(blocksPerGrid4, threadsPerBlockOrElementsPerThread4);

std::cout << "launch multiBlockPrefixScan " << num_items << ' ' << nBlocks << std::endl;
alpaka::enqueue(queue,
alpaka::createTaskKernel<Acc1>(workDivMultiBlock,
cms::alpakatools::multiBlockPrefixScanFirstStep<uint32_t>(),
input_d,
output1_d,
num_items));

const Vec1 blocksPerGridSecondStep(Vec1::all(1));
const WorkDiv1& workDivMultiBlockSecondStep =
cms::alpakatools::make_workdiv(blocksPerGridSecondStep, threadsPerBlockOrElementsPerThread4);
alpaka::enqueue(queue,
alpaka::createTaskKernel<Acc1>(workDivMultiBlockSecondStep,
cms::alpakatools::multiBlockPrefixScanSecondStep<uint32_t>(),
input_d,
output1_d,
num_items,
nBlocks));
auto d_pc(alpaka::allocBuf<int32_t, Idx>(device, size));
int32_t* pc = alpaka::getPtrNative(d_pc);

alpaka::memset(queue, d_pc, 0, size);
alpaka::enqueue(
queue,
alpaka::createTaskKernel<Acc1>(
workDivMultiBlock, cms::alpakatools::multiBlockPrefixScan<uint32_t>(), input_d, output1_d, num_items, pc));

alpaka::enqueue(queue, alpaka::createTaskKernel<Acc1>(workDivMultiBlock, verify(), output1_d, num_items));

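
The test keeps checking the result with the on-device verify() kernel. Purely as an illustration of what that check amounts to, an equivalent host-side verification of an inclusive prefix scan can be written with std::partial_sum; this helper is hypothetical and not part of the test.

#include <cstdint>
#include <numeric>
#include <vector>

// Hypothetical host-side check: the output of an inclusive prefix scan must
// equal the running sum of the input, element by element.
bool checkInclusiveScan(std::vector<uint32_t> const& input, std::vector<uint32_t> const& output) {
  std::vector<uint32_t> expected(input.size());
  std::partial_sum(input.begin(), input.end(), expected.begin());
  return expected == output;
}
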
