Skip to content

Commit

Permalink
use new interface for getValidWorkDiv
Browse files Browse the repository at this point in the history
  • Loading branch information
psychocoderHPC committed Aug 9, 2024
1 parent 07e9c22 commit 527b4bc
Show file tree
Hide file tree
Showing 25 changed files with 181 additions and 185 deletions.
9 changes: 4 additions & 5 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ auto example(TAccTag const&) -> int
using Data = std::uint32_t;
constexpr Idx nElementsPerDim = 2;

const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
Vec const extents(Vec::all(static_cast<Idx>(nElementsPerDim)));

// Allocate host memory buffers
//
Expand Down Expand Up @@ -166,7 +166,7 @@ auto example(TAccTag const&) -> int

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
= alpaka::getValidWorkDiv<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument
Expand Down Expand Up @@ -203,11 +203,10 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};
auto const devWorkDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, testBufferKernel, deviceBufferMdSpan1);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
Expand Down
6 changes: 3 additions & 3 deletions example/complex/src/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ auto example(TAccTag const&) -> int

ComplexKernel complexKernel;

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, complexKernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, complexKernel);
Expand Down
2 changes: 1 addition & 1 deletion example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ auto example(TAccTag const&) -> int
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());
auto const workDiv = alpaka::getValidWorkDiv<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
Expand Down
14 changes: 8 additions & 6 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct ConvolutionKernel
TElem const* const input,
TElem const* const filter,
TElem* const output,
const std::size_t inputSize,
const std::size_t filterSize) const -> void
std::size_t const inputSize,
std::size_t const filterSize) const -> void
{
auto const globalThreadIdxX = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

Expand Down Expand Up @@ -140,17 +140,19 @@ auto example(TAccTag const&) -> int
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDiv(
devAcc,
kernelCfg,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
// Run the kernel
alpaka::exec<DevAcc>(
queue,
Expand Down
20 changes: 11 additions & 9 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width; due to padding it might be larger than the matrix width
auto const intputWidthAllocated = [&]() -> const Idx
auto const intputWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
Expand Down Expand Up @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width; due to padding it might be larger than the matrix width
auto const filterWidthAllocated = [&]() -> const Idx
auto const filterWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
Expand All @@ -305,20 +305,22 @@ auto example(TAccTag const&) -> int
// ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDiv(
devAcc,
kernelCfg,
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
std::data(bufInputAcc),
std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
alpaka::getPtrNative(bufFilterAcc),
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());

// Run the kernel
alpaka::exec<DevAcc>(
queueAcc,
Expand Down
21 changes: 14 additions & 7 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,22 @@ auto example(TAccTag const&) -> int
BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));

CounterBasedRngKernel counterBasedRngKernel;
auto const& bundeledKernel
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
auto const& bundeledKernel2
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);
alpaka::KernelCfg<Acc> kernerlCfgAccDev = {extent, elementsPerThread};
alpaka::KernelCfg<Acc> kernerlCfgAccHost = {extent, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
auto const workDivHost
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
auto const workDivAcc = alpaka::getValidWorkDiv(
devAcc,
kernerlCfgAccDev,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufAcc),
key);
auto const workDivHost = alpaka::getValidWorkDiv(
devHost,
kernerlCfgAccHost,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufHost),
key);

// Create the kernel execution task.
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
Expand Down
6 changes: 4 additions & 2 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,11 @@ auto example(TAccTag const&) -> int

HeatEquationKernel heatEqKernel;

auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::KernelCfg<Acc> const kernelCfg = {extent, elemPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
auto const workDiv
= alpaka::getValidWorkDiv(devAcc, kernelCfg, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

// Copy host -> device
alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
Expand Down
6 changes: 3 additions & 3 deletions example/helloWorld/src/helloWorld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ auto example(TAccTag const&) -> int
// argument. So a kernel can be a class or struct, a lambda, etc.
HelloWorldKernel helloWorldKernel;

auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, helloWorldKernel);

// Run the kernel
//
Expand Down
8 changes: 4 additions & 4 deletions example/helloWorldLambda/src/helloWorldLambda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ auto example(TAccTag const&) -> int
auto const threadsPerGrid = Vec{4, 2, 4};


const size_t nExclamationMarks = 10;
size_t const nExclamationMarks = 10;

// Run "Hello World" kernel with a lambda function
//
Expand Down Expand Up @@ -117,10 +117,10 @@ auto example(TAccTag const&) -> int
printf("\n");
};

auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, kernelLambda, nExclamationMarks);

alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
alpaka::wait(queue);
Expand Down
6 changes: 3 additions & 3 deletions example/kernelSpecialization/src/kernelSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ auto example(TAccTag const&) -> int
std::size_t const elementsPerThread = 1u;
Kernel kernel;

auto const& bundeledKernel = alpaka::KernelBundle(kernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, kernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, kernel);
Expand Down
13 changes: 4 additions & 9 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,14 @@ auto example(TAccTag const&) -> int
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

MatrixMulKernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
extentC,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
alpaka::KernelCfg<Acc> const kernelCfg
= {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
auto const workDiv = alpaka::getValidWorkDiv<Acc>(devAcc, kernelCfg, kernel, mdDevA, mdDevB, mdDevC);

// Execute the kernel
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
Expand Down
9 changes: 3 additions & 6 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,11 @@ auto example(TAccTag const&) -> int
bufHost[0] = 0.0f;
alpaka::memcpy(queue, bufAcc, bufHost);

alpaka::KernelCfg<Acc> const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)};
Kernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
Vec(numThreads),
Vec(numAlpakaElementsPerThread));
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, kernel, numPoints, ptrBufAcc, Function{});

alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::memcpy(queue, bufHost, bufAcc);
Expand Down
6 changes: 3 additions & 3 deletions example/openMPSchedule/src/openMPSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ auto main() -> int
Idx const elementsPerThread = 1u;

OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> kernelCfg = {threadsPerGrid, elementsPerThread};
auto const workDiv = alpaka::getValidWorkDiv(devAcc, kernelCfg, openMPScheduleDefaultKernel);

// Run the kernel setting no schedule explicitly.
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
Expand Down
20 changes: 9 additions & 11 deletions example/randomCells2D/src/randomCells2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,11 @@ auto example(TAccTag const&) -> int

auto pitchBufAccRandV = alpaka::getPitchesInBytes(bufAccRandV)[0];

auto const& bundeledKernelInitRandom
= alpaka::KernelBundle(initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
alpaka::KernelCfg<Acc> const kernelCfg = {extent, Vec(perThreadY, perThreadX)};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivInitRandom
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernelInitRandom, extent, Vec(perThreadY, perThreadX));
= alpaka::getValidWorkDiv(devAcc, kernelCfg, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);

alpaka::exec<Acc>(queue, workDivInitRandom, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
alpaka::wait(queue);
Expand All @@ -230,21 +230,19 @@ auto example(TAccTag const&) -> int
alpaka::memcpy(queue, bufAccS, bufHostS);
RunTimestepKernelSingle runTimestepKernelSingle;

auto const& bundeledKernelRuntimeStep = alpaka::KernelBundle(
alpaka::KernelCfg<Acc> const runtimeRandomKernelCfg = {extent, Vec(perThreadY, perThreadX)};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivRuntimeStep = alpaka::getValidWorkDiv(
devAcc,
runtimeRandomKernelCfg,
runTimestepKernelSingle,
extent,
ptrBufAccRandS,
ptrBufAccS,
pitchBufAccRandS,
pitchBufAccS);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivRuntimeStep = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernelRuntimeStep,
extent,
Vec(perThreadY, perThreadX));

alpaka::exec<Acc>(
queue,
workDivRuntimeStep,
Expand Down
39 changes: 21 additions & 18 deletions example/randomStrategies/src/randomStrategies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,20 +247,20 @@ void runStrategy(Box<TAccTag>& box)
// the initial parameters solely from the thread index


auto const& bundeledKernel = alpaka::KernelBundle(
initRandomKernel,
box.extentRand,
ptrBufAccRand,
static_cast<unsigned>(box.extentResult[0] / box.extentRand[0]));
alpaka::KernelCfg<typename Box<TAccTag>::Acc> kernelCfg
= {box.extentRand,
typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivRand = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
auto const workDivRand = alpaka::getValidWorkDiv(
alpaka::getDevByIdx(box.accPlatform, 0),
bundeledKernel,
kernelCfg,
initRandomKernel,
box.extentRand,
typename Box<TAccTag>::Vec(typename Box<TAccTag>::Idx{1}),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
ptrBufAccRand,
static_cast<unsigned>(box.extentResult[0] / box.extentRand[0]));


alpaka::exec<typename Box<TAccTag>::Acc>(
Expand Down Expand Up @@ -291,18 +291,21 @@ void runStrategy(Box<TAccTag>& box)
alpaka::memcpy(box.queue, box.bufAccResult, box.bufHostResult);
FillKernel fillKernel;

auto const& bundeledKernelFill
= alpaka::KernelBundle(fillKernel, box.extentResult, ptrBufAccRand, ptrBufAccResult);
alpaka::KernelCfg<typename Box<TAccTag>::Acc> fillKernelCfg
= {box.extentResult,
typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workdivResult = alpaka::getValidWorkDivForKernel<typename Box<TAccTag>::Acc>(
auto const workdivResult = alpaka::getValidWorkDiv(
alpaka::getDevByIdx(box.accPlatform, 0),
bundeledKernelFill,
fillKernelCfg,
fillKernel,
box.extentResult,
typename Box<TAccTag>::Vec(static_cast<typename Box<TAccTag>::Idx>(
NUM_ROLLS)), // One thread per "point"; each performs NUM_ROLLS "rolls"
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
ptrBufAccRand,
ptrBufAccResult);


alpaka::exec<typename Box<TAccTag>::Acc>(
Expand Down
Loading

0 comments on commit 527b4bc

Please sign in to comment.