Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove KernelBundle, change signature of [get|is]ValidWorkDiv* #2349

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 19 additions & 10 deletions docs/source/basic/cheatsheet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -180,21 +180,29 @@ Prepare Kernel Bundle
.. code-block:: c++

HeatEquationKernel heatEqKernel;
// Arguments of KernelBundle: The kernel instance and the kernel arguments
auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

Automatically select a valid kernel launch configuration
.. code-block:: c++

Vec<Dim, Idx> const globalThreadExtent = vectorValue;
Vec<Dim, Idx> const elementsPerThread = vectorValue;

auto autoWorkDiv = getValidWorkDivForKernel<Acc>(
device,
bundeledKernel,
globalThreadExtent, elementsPerThread,
KernelCfg<Acc> const kernelCfg = {
globalThreadExtent,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be something like globalElementsExtent rather than globalThreadExtent, because the number of elements per thread is also taken into account.

elementsPerThread,
false,
GridBlockExtentSubDivRestrictions::Unrestricted);
GridBlockExtentSubDivRestrictions::Unrestricted};

auto autoWorkDiv = getValidWorkDiv(
kernelCfg,
device,
heatEqKernel,
heatEqKernel,
psychocoderHPC marked this conversation as resolved.
Show resolved Hide resolved
pCurrAcc,
pNextAcc,
numNodesX,
dx,
dt);

Manually set a kernel launch configuration
.. code-block:: c++
Expand All @@ -204,9 +212,10 @@ Manually set a kernel launch configuration
Vec<Dim, Idx> const elementsPerThread = vectorValue;

using WorkDiv = WorkDivMembers<Dim, Idx>;
auto manualWorkDiv = WorkDiv{blocksPerGrid,
threadsPerBlock,
elementsPerThread};
auto manualWorkDiv = WorkDiv{
blocksPerGrid,
threadsPerBlock,
elementsPerThread};

Instantiate a kernel and create a task that will run it (does not launch it yet)
.. code-block:: c++
Expand Down
12 changes: 5 additions & 7 deletions example/bufferCopy/src/bufferCopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ auto example(TAccTag const&) -> int
using Data = std::uint32_t;
constexpr Idx nElementsPerDim = 2;

const Vec extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
Vec const extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Vec const extents(Vec::all(static_cast<Idx>(nElementsPerDim)));
Vec const extents(Vec::all(nElementsPerDim));

since nElementsPerDim is already of type Idx.


// Allocate host memory buffers
//
Expand Down Expand Up @@ -164,9 +164,8 @@ auto example(TAccTag const&) -> int

FillBufferKernel fillBufferKernel;

auto const& bundeledFillBufferKernel = alpaka::KernelBundle(fillBufferKernel, hostViewPlainPtrMdSpan);
auto const hostWorkDiv
= alpaka::getValidWorkDivForKernel<Host>(devHost, bundeledFillBufferKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Host> const hostKernelCfg = {threadsPerGrid, elementsPerThread};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
alpaka::KernelCfg<Host> const hostKernelCfg = {threadsPerGrid, elementsPerThread};
alpaka::KernelCfg<Host> const hostKernelCfg = {elementsPerGrid, elementsPerThread};

?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same comment applies to the other examples.

auto const hostWorkDiv = alpaka::getValidWorkDiv(hostKernelCfg, devHost, fillBufferKernel, hostViewPlainPtrMdSpan);

alpaka::exec<Host>(hostQueue, hostWorkDiv, fillBufferKernel,
hostViewPlainPtrMdSpan); // 1st kernel argument
Expand Down Expand Up @@ -203,11 +202,10 @@ auto example(TAccTag const&) -> int
auto deviceBufferMdSpan2 = alpaka::experimental::getMdSpan(deviceBuffer2);

TestBufferKernel testBufferKernel;
auto const& bundeledTestBufferKernel = alpaka::KernelBundle(testBufferKernel, deviceBufferMdSpan1);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const devWorkDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledTestBufferKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> const devKernelCfg = {threadsPerGrid, elementsPerThread};
auto const devWorkDiv = alpaka::getValidWorkDiv(devKernelCfg, devAcc, testBufferKernel, deviceBufferMdSpan1);

alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan1);
alpaka::exec<Acc>(devQueue, devWorkDiv, testBufferKernel, deviceBufferMdSpan2);
Expand Down
6 changes: 3 additions & 3 deletions example/complex/src/complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ auto example(TAccTag const&) -> int

ComplexKernel complexKernel;

auto const& bundeledKernel = alpaka::KernelBundle(complexKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, complexKernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, complexKernel);
Expand Down
10 changes: 5 additions & 5 deletions example/conv2DWithMdspan/src/conv2DWithMdspan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,16 @@ auto example(TAccTag const&) -> int
// Construct kernel object
ConvolutionKernelMdspan2D convolutionKernel2D;

// Make a bundle
auto const& bundeledKernel = alpaka::KernelBundle(
// Let alpaka calculate good block and grid sizes given our full problem extent.
alpaka::KernelCfg<DevAcc> const kernelCfg = {extent, Vec::ones()};
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel2D,
alpaka::experimental::getMdSpan(bufInputAcc),
alpaka::experimental::getMdSpan(outputDeviceMemory),
alpaka::experimental::getMdSpan(bufFilterAcc));

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());


// Run the kernel, pass 3 arrays as 2D mdspans
alpaka::exec<DevAcc>(
Expand Down
14 changes: 8 additions & 6 deletions example/convolution1D/src/convolution1D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ struct ConvolutionKernel
TElem const* const input,
TElem const* const filter,
TElem* const output,
const std::size_t inputSize,
const std::size_t filterSize) const -> void
std::size_t const inputSize,
std::size_t const filterSize) const -> void
{
auto const globalThreadIdxX = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];

Expand Down Expand Up @@ -140,17 +140,19 @@ auto example(TAccTag const&) -> int
DataType* nativeInputDeviceMemory = std::data(inputDeviceMemory);
DataType* nativeOutputDeviceMemory = std::data(outputDeviceMemory);

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel,
nativeInputDeviceMemory,
nativeFilterDeviceMemory,
nativeOutputDeviceMemory,
inputSize,
filterSize);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
// Run the kernel
alpaka::exec<DevAcc>(
queue,
Expand Down
20 changes: 11 additions & 9 deletions example/convolution2D/src/convolution2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width; due to padding it might be larger than the matrix width
auto const intputWidthAllocated = [&]() -> const Idx
auto const intputWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchInput{alpaka::getPitchesInBytes(bufInputAcc)[0]};
Expand Down Expand Up @@ -294,7 +294,7 @@ auto example(TAccTag const&) -> int
alpaka::wait(queueAcc);

// Calculate the allocated width; due to padding it might be larger than the matrix width
auto const filterWidthAllocated = [&]() -> const Idx
auto const filterWidthAllocated = [&]() -> Idx const
{
// Calculate pitch: The size of one line in bytes including padding.
auto const rowPitchFilter{alpaka::getPitchesInBytes(bufFilterAcc)[0]};
Expand All @@ -305,20 +305,22 @@ auto example(TAccTag const&) -> int
// ConvolutionKernel2DSharedMemory
ConvolutionKernel2DSharedMemory convolutionKernel2D;

auto const& bundeledKernel = alpaka::KernelBundle(
alpaka::KernelCfg<DevAcc> kernelCfg = {extent, Vec::ones()};

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDiv(
kernelCfg,
devAcc,
convolutionKernel2D,
alpaka::getPtrNative(bufInputAcc),
alpaka::getPtrNative(outputDeviceMemory),
std::data(bufInputAcc),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::data is used for the kernel call too.

std::data(outputDeviceMemory),
matrixWidth,
matrixHeight,
alpaka::getPtrNative(bufFilterAcc),
std::data(bufFilterAcc),
filterWidth,
intputWidthAllocated,
filterWidthAllocated);

// Let alpaka calculate good block and grid sizes given our full problem extent.
auto const workDiv = alpaka::getValidWorkDivForKernel<DevAcc>(devAcc, bundeledKernel, extent, Vec::ones());

// Run the kernel
alpaka::exec<DevAcc>(
queueAcc,
Expand Down
23 changes: 16 additions & 7 deletions example/counterBasedRng/src/counterBasedRng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,22 +147,31 @@ auto example(TAccTag const&) -> int
BufAcc bufAcc(alpaka::allocBuf<Data, Idx>(devAcc, extent));

CounterBasedRngKernel counterBasedRngKernel;
auto const& bundeledKernel
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufAcc), key);
auto const& bundeledKernel2
= alpaka::KernelBundle(counterBasedRngKernel, alpaka::experimental::getMdSpan(bufHost), key);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDivAcc = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elementsPerThread);
auto const workDivHost
= alpaka::getValidWorkDivForKernel<AccHost>(devHost, bundeledKernel2, extent, elementsPerThreadHost);
alpaka::KernelCfg<Acc> kernerlCfgAccDev = {extent, elementsPerThread};
auto const workDivAcc = alpaka::getValidWorkDiv(
kernerlCfgAccDev,
devAcc,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufAcc),
key);

// Create the kernel execution task.
auto const taskKernelAcc = alpaka::createTaskKernel<Acc>(
workDivAcc,
CounterBasedRngKernel(),
alpaka::experimental::getMdSpan(bufAcc),
key);

alpaka::KernelCfg<AccHost> kernerlCfgAccHost = {extent, elementsPerThreadHost};
auto const workDivHost = alpaka::getValidWorkDiv(
kernerlCfgAccHost,
devHost,
counterBasedRngKernel,
alpaka::experimental::getMdSpan(bufHost),
key);

auto const taskKernelHost = alpaka::createTaskKernel<AccHost>(
workDivHost,
CounterBasedRngKernel(),
Expand Down
6 changes: 4 additions & 2 deletions example/heatEquation/src/heatEquation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,11 @@ auto example(TAccTag const&) -> int

HeatEquationKernel heatEqKernel;

auto const& bundeledKernel = alpaka::KernelBundle(heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);
alpaka::KernelCfg<Acc> const kernelCfg = {extent, elemPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, extent, elemPerThread);
auto const workDiv
= alpaka::getValidWorkDiv(kernelCfg, devAcc, heatEqKernel, pCurrAcc, pNextAcc, numNodesX, dx, dt);

// Copy host -> device
alpaka::memcpy(queue, uCurrBufAcc, uCurrBufHost);
Expand Down
6 changes: 3 additions & 3 deletions example/helloWorld/src/helloWorld.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,10 @@ auto example(TAccTag const&) -> int
// argument. So a kernel can be a class or struct, a lambda, etc.
HelloWorldKernel helloWorldKernel;

auto const& bundeledKernel = alpaka::KernelBundle(helloWorldKernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, helloWorldKernel);

// Run the kernel
//
Expand Down
8 changes: 4 additions & 4 deletions example/helloWorldLambda/src/helloWorldLambda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ auto example(TAccTag const&) -> int
auto const threadsPerGrid = Vec{4, 2, 4};


const size_t nExclamationMarks = 10;
size_t const nExclamationMarks = 10;

// Run "Hello World" kernel with a lambda function
//
Expand Down Expand Up @@ -117,10 +117,10 @@ auto example(TAccTag const&) -> int
printf("\n");
};

auto const& bundeledKernel = alpaka::KernelBundle(kernelLambda, nExclamationMarks);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernelLambda, nExclamationMarks);

alpaka::exec<Acc>(queue, workDiv, kernelLambda, nExclamationMarks);
alpaka::wait(queue);
Expand Down
6 changes: 3 additions & 3 deletions example/kernelSpecialization/src/kernelSpecialization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ auto example(TAccTag const&) -> int
std::size_t const elementsPerThread = 1u;
Kernel kernel;

auto const& bundeledKernel = alpaka::KernelBundle(kernel);
alpaka::KernelCfg<Acc> const kernelCfg = {threadsPerGrid, elementsPerThread};

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel);

// Run the kernel
alpaka::exec<Acc>(queue, workDiv, kernel);
Expand Down
13 changes: 4 additions & 9 deletions example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,19 +147,14 @@ auto example(TAccTag const&) -> int
auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);

MatrixMulKernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, mdDevA, mdDevB, mdDevC);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
extentC,
Vec::ones(),
false,
alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
alpaka::KernelCfg<Acc> const kernelCfg
= {extentC, Vec::ones(), false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
auto const workDiv = alpaka::getValidWorkDiv<Acc>(kernelCfg, devAcc, kernel, mdDevA, mdDevB, mdDevC);

// Execute the kernel
alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
alpaka::exec<Acc>(queue, workDiv, kernel, mdDevA, mdDevB, mdDevC);

// Copy result back to host
alpaka::memcpy(queue, bufHostC, bufDevC);
Expand Down
9 changes: 3 additions & 6 deletions example/monteCarloIntegration/src/monteCarloIntegration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,14 +112,11 @@ auto example(TAccTag const&) -> int
bufHost[0] = 0.0f;
alpaka::memcpy(queue, bufAcc, bufHost);

alpaka::KernelCfg<Acc> const kernelCfg = {Vec(numThreads), Vec(numAlpakaElementsPerThread)};
Kernel kernel;
auto const& bundeledKernel = alpaka::KernelBundle(kernel, numPoints, ptrBufAcc, Function{});

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv = alpaka::getValidWorkDivForKernel<Acc>(
devAcc,
bundeledKernel,
Vec(numThreads),
Vec(numAlpakaElementsPerThread));
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, kernel, numPoints, ptrBufAcc, Function{});

alpaka::exec<Acc>(queue, workDiv, kernel, numPoints, ptrBufAcc, Function{});
alpaka::memcpy(queue, bufHost, bufAcc);
Expand Down
6 changes: 3 additions & 3 deletions example/openMPSchedule/src/openMPSchedule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ auto main() -> int
Idx const elementsPerThread = 1u;

OpenMPScheduleDefaultKernel openMPScheduleDefaultKernel;
auto const& bundeledKernel = alpaka::KernelBundle(openMPScheduleDefaultKernel);

// Let alpaka calculate good block and grid sizes given our full problem extent
auto const workDiv
= alpaka::getValidWorkDivForKernel<Acc>(devAcc, bundeledKernel, threadsPerGrid, elementsPerThread);
alpaka::KernelCfg<Acc> kernelCfg = {threadsPerGrid, elementsPerThread};
auto const workDiv = alpaka::getValidWorkDiv(kernelCfg, devAcc, openMPScheduleDefaultKernel);

// Run the kernel setting no schedule explicitly.
std::cout << "OpenMPScheduleDefaultKernel setting no schedule explicitly:\n";
Expand Down
Loading
Loading