From 1c44a5929d917194774d074f6860e53d095252b9 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 30 Jul 2024 14:23:56 +0200 Subject: [PATCH 1/3] Fix getValidWorkDivForKernel tests for the CUDA backend --- .../unit/workDiv/src/WorkDivForKernelTest.cpp | 93 ++++++------------- 1 file changed, 28 insertions(+), 65 deletions(-) diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp index 0cdc54c611db..410ce20d25f6 100644 --- a/test/unit/workDiv/src/WorkDivForKernelTest.cpp +++ b/test/unit/workDiv/src/WorkDivForKernelTest.cpp @@ -99,25 +99,15 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc auto const isValid = alpaka::isValidWorkDivKernel(dev, bundeledKernel, workDiv); CHECK(isValid == true); - if constexpr(alpaka::accMatchesTags) - { - // Get calculated threads per block from the workDiv found by examining kernel function - auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); - // Get hard limits - auto const threadsPerBlockLimit = props.m_blockThreadCountMax; - - // Depending on the GPU type or the compiler the test below might fail because threadsPerBlock can be equal to - // threadsPerBlockLimit, which is the max device limit. 
- CHECK(threadsPerBlock < static_cast(threadsPerBlockLimit)); - } - else if constexpr(alpaka::accMatchesTags< - Acc, - alpaka::TagGpuHipRt, - alpaka::TagCpuThreads, - alpaka::TagCpuOmp2Threads, - alpaka::TagFpgaSyclIntel, - alpaka::TagGpuSyclIntel, - alpaka::TagGenericSycl>) + if constexpr(alpaka::accMatchesTags< + Acc, + alpaka::TagGpuCudaRt, + alpaka::TagGpuHipRt, + alpaka::TagCpuThreads, + alpaka::TagCpuOmp2Threads, + alpaka::TagFpgaSyclIntel, + alpaka::TagGpuSyclIntel, + alpaka::TagGenericSycl>) { // Get calculated threads per block from the workDiv found by examining kernel function auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); @@ -180,59 +170,32 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc auto const isValid = alpaka::isValidWorkDivKernel(dev, bundeledKernel, workDiv); CHECK(isValid == true); - if constexpr(alpaka::accMatchesTags) - { - // Expected valid workdiv values for this kernel might change depending on the GPU type and compiler. Therefore - // generated workdiv is not compared to a specific workdiv in this test. - - // Get calculated threads per block from the workDiv that was found by examining kernel function - auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); - // Get hard limits - auto const threadsPerBlockLimit = props.m_blockThreadCountMax; - - // Depending on the GPU type or the compiler the test below might fail because threadsPerBlock can be equal to - // threadsPerBlockLimit, which is the max device limit. 
- CHECK(threadsPerBlock < static_cast(threadsPerBlockLimit)); - - // too many threads per block - auto const invalidWorkDiv - = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{2 * threadsPerBlock, 1}, Vec{1, 1}}; - auto isWorkDivValidForCuda = alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv); - CHECK(isWorkDivValidForCuda == false); - - auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}}; - isWorkDivValidForCuda = alpaka::isValidWorkDivKernel(dev, bundeledKernel, validWorkDiv); - CHECK(isWorkDivValidForCuda == true); - } - else if constexpr(alpaka::accMatchesTags< - Acc, - alpaka::TagGpuHipRt, - alpaka::TagCpuThreads, - alpaka::TagCpuOmp2Threads, - alpaka::TagFpgaSyclIntel, - alpaka::TagGpuSyclIntel, - alpaka::TagGenericSycl>) + // The valid workdiv values for this kernel might change depending on the GPU type and compiler. + // Therefore the generated workdiv is not compared to a specific workdiv in this test. + if constexpr(alpaka::accMatchesTags< + Acc, + alpaka::TagGpuCudaRt, + alpaka::TagGpuHipRt, + alpaka::TagCpuThreads, + alpaka::TagCpuOmp2Threads, + alpaka::TagFpgaSyclIntel, + alpaka::TagGpuSyclIntel, + alpaka::TagGenericSycl>) { // Get calculated threads per block from the workDiv that was found by examining the kernel function auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); // Get hard limits auto const threadsPerBlockLimit = props.m_blockThreadCountMax; - // Depending on the GPU type or the compiler this test might fail because threadsPerBlock can be less than - // threadsPerBlockLimit, which is the max device limit. 
- if(threadsPerBlockLimit == 1) - CHECK(threadsPerBlock == static_cast(threadsPerBlockLimit)); - else - CHECK(threadsPerBlock < static_cast(threadsPerBlockLimit)); - - // too many threads per block - auto const invalidWorkDiv - = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{20 * threadsPerBlock, 1}, Vec{1, 1}}; - auto isWorkDivValidForHip = alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv); - CHECK(isWorkDivValidForHip == false); + // Check that the number of threads per block is within the device limit. + CHECK(threadsPerBlock <= static_cast(threadsPerBlockLimit)); + // Check that using the maximum number of threads per block is valid. auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}}; - isWorkDivValidForHip = alpaka::isValidWorkDivKernel(dev, bundeledKernel, validWorkDiv); - CHECK(isWorkDivValidForHip == true); + CHECK(alpaka::isValidWorkDivKernel(dev, bundeledKernel, validWorkDiv)); + + // Check that using too many threads per block is not valid. 
+ auto const invalidWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{20, threadsPerBlock}, Vec{1, 1}}; + CHECK(not alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv)); } else if constexpr(alpaka::accMatchesTags< Acc, From a7fd020c23c473c1006ddaecc4555ebe3d822dba Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 30 Jul 2024 14:41:43 +0200 Subject: [PATCH 2/3] Fix getValidWorkDivForKernel tests for the SYCL CPU backend --- test/unit/workDiv/src/WorkDivForKernelTest.cpp | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp index 410ce20d25f6..64144fc317af 100644 --- a/test/unit/workDiv/src/WorkDivForKernelTest.cpp +++ b/test/unit/workDiv/src/WorkDivForKernelTest.cpp @@ -105,6 +105,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc alpaka::TagGpuHipRt, alpaka::TagCpuThreads, alpaka::TagCpuOmp2Threads, + alpaka::TagCpuSycl, alpaka::TagFpgaSyclIntel, alpaka::TagGpuSyclIntel, alpaka::TagGenericSycl>) @@ -116,12 +117,8 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc CHECK(threadsPerBlock <= static_cast(threadsPerBlockLimit)); } - else if constexpr(alpaka::accMatchesTags< - Acc, - alpaka::TagCpuSerial, - alpaka::TagCpuOmp2Blocks, - alpaka::TagCpuTbbBlocks, - alpaka::TagCpuSycl>) + else if constexpr(alpaka:: + accMatchesTags) { // CPU must have only 1 thread per block. In other words, number of blocks is equal to number of threads. 
CHECK(workDiv == WorkDiv{Vec{threadsPerGridTestValue}, Vec{1}, Vec{1}}); @@ -178,6 +175,7 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc alpaka::TagGpuHipRt, alpaka::TagCpuThreads, alpaka::TagCpuOmp2Threads, + alpaka::TagCpuSycl, alpaka::TagFpgaSyclIntel, alpaka::TagGpuSyclIntel, alpaka::TagGenericSycl>) @@ -197,12 +195,8 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc auto const invalidWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{20, threadsPerBlock}, Vec{1, 1}}; CHECK(not alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv)); } - else if constexpr(alpaka::accMatchesTags< - Acc, - alpaka::TagCpuSerial, - alpaka::TagCpuOmp2Blocks, - alpaka::TagCpuTbbBlocks, - alpaka::TagCpuSycl>) + else if constexpr(alpaka:: + accMatchesTags) { // CPU must have only 1 thread per block. In other words, number of blocks is equal to number of threads. CHECK(workDiv == WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}}); From 6206326fa31f8ce4e4054c639bd8638c538c6873 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Tue, 30 Jul 2024 19:52:13 +0200 Subject: [PATCH 3/3] Rewrite the getValidWorkDivForKernel tests --- .../unit/workDiv/src/WorkDivForKernelTest.cpp | 185 +++++++++--------- 1 file changed, 97 insertions(+), 88 deletions(-) diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp index 64144fc317af..f6de4a020776 100644 --- a/test/unit/workDiv/src/WorkDivForKernelTest.cpp +++ b/test/unit/workDiv/src/WorkDivForKernelTest.cpp @@ -69,7 +69,7 @@ struct TestKernelWithManyRegisters double sum = var0 + var1 + var2 + var3 + var4 + var5 + var6 + var7 + var8 + var9 + var10 + var11 + var12 + var13 + var14 + var15 + var16 + var17 + var18 + var19 + var20 + var21 + var22 + var23 + var24 + var25 + var26 + var27 + var28 + var29 + var30 + var31 + var32 + var33 + var34 + var35; - printf("The sum is %5.2f, the argument is %lu ", 
sum, val); + printf("The sum is %5.2f, the argument is %lu\n", sum, val); } }; @@ -86,55 +86,64 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc auto const dev = alpaka::getDevByIdx(platform, 0); TestKernelWithManyRegisters kernel; - auto const bundeledKernel = alpaka::KernelBundle(kernel, 200ul); + auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul); - // Get hard limits for test - auto const props = alpaka::getAccDevProps(dev); + // Get the device properties and hard limits + auto const props = alpaka::getAccDevProps(dev); Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax; - // Test getValidWorkDivForKernel for threadsPerGridTestValue threads per grid + // Test the getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid. auto const workDiv - = alpaka::getValidWorkDivForKernel(dev, bundeledKernel, Vec{threadsPerGridTestValue}, Vec{1}); - // Test validity - auto const isValid = alpaka::isValidWorkDivKernel(dev, bundeledKernel, workDiv); - CHECK(isValid == true); + = alpaka::getValidWorkDivForKernel(dev, kernelBundle, Vec{threadsPerGridTestValue}, Vec{1}); + + // Test the isValidWorkDivKernel function + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, workDiv)); + + // Get calculated threads per block from the workDiv that was found by examining the kernel function. + Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); + + // Get the device limit. + Idx const threadsPerBlockLimit = props.m_blockThreadCountMax; + + // Check that the number of threads per block is within the device limit. + CHECK(threadsPerBlock <= threadsPerBlockLimit); + + // Check that using the maximum number of threads per block is valid. 
+ auto const validWorkDiv = WorkDiv{Vec{threadsPerGridTestValue / threadsPerBlock}, Vec{threadsPerBlock}, Vec{1}}; + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, validWorkDiv)); + + // Check that using too many threads per block is not valid. + auto const invalidThreads = WorkDiv{Vec{1}, Vec{2 * threadsPerBlockLimit}, Vec{1}}; + CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, invalidThreads)); + + // Check that a work division with a single block, thread and element is always valid + auto const serialWorkDiv = WorkDiv{Vec{1}, Vec{1}, Vec{1}}; + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, serialWorkDiv)); + // Some accelerators support only one thread per block: + if constexpr(alpaka::isSingleThreadAcc) + { + // Check that the computed work division uses a single thread per block. + auto const expectedWorkDiv = WorkDiv{Vec{threadsPerGridTestValue}, Vec{1}, Vec{1}}; + CHECK(workDiv == expectedWorkDiv); + + // Check that a work division with more than one thread per block is not valid. + auto const parallelWorkDiv = WorkDiv{Vec{1}, Vec{2}, Vec{1}}; + CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, parallelWorkDiv)); + } + + // Check the maxDynamicSharedSizeBytes for CPU backends if constexpr(alpaka::accMatchesTags< Acc, - alpaka::TagGpuCudaRt, - alpaka::TagGpuHipRt, + alpaka::TagCpuSerial, alpaka::TagCpuThreads, + alpaka::TagCpuOmp2Blocks, alpaka::TagCpuOmp2Threads, - alpaka::TagCpuSycl, - alpaka::TagFpgaSyclIntel, - alpaka::TagGpuSyclIntel, - alpaka::TagGenericSycl>) - { - // Get calculated threads per block from the workDiv found by examining kernel function - auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); - // Get hard limits - auto const threadsPerBlockLimit = props.m_blockThreadCountMax; - - CHECK(threadsPerBlock <= static_cast(threadsPerBlockLimit)); - } - else if constexpr(alpaka:: - accMatchesTags) - { - // CPU must have only 1 thread per block. 
In other words, number of blocks is equal to number of threads. - CHECK(workDiv == WorkDiv{Vec{threadsPerGridTestValue}, Vec{1}, Vec{1}}); - // Test a new 1D workdiv. Threads per block can not be larger than 1 for CPU. Hence 2 is not valid. - auto const workDiv1DUsingInitList = WorkDiv{Vec{threadsPerGridTestValue / 2}, Vec{2}, Vec{1}}; - auto const isWorkDivValidForCPU - = alpaka::isValidWorkDivKernel(dev, bundeledKernel, workDiv1DUsingInitList); - CHECK(isWorkDivValidForCPU == false); - // Check maxDynamicSharedSizeBytes for CPU backends - auto const funcAttributes = alpaka::getFunctionAttributes(dev, bundeledKernel); - CHECK( - funcAttributes.maxDynamicSharedSizeBytes == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); - } - else + alpaka::TagCpuTbbBlocks>) { - throw std::invalid_argument("Acc type is not among tested Accs."); + int const maxDynamicSharedSizeBytes + = alpaka::getFunctionAttributes(dev, kernelBundle).maxDynamicSharedSizeBytes; + CHECK(maxDynamicSharedSizeBytes == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); } } @@ -151,67 +160,67 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc auto const dev = alpaka::getDevByIdx(platform, 0); TestKernelWithManyRegisters kernel; - // A random value - size_t val(200ul); - auto const bundeledKernel = alpaka::KernelBundle(kernel, val); + auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul); - // Get hard limits for test + // Get the device properties and hard limits auto const props = alpaka::getAccDevProps(dev); Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax; // Test getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid. 
auto const workDiv - = alpaka::getValidWorkDivForKernel(dev, bundeledKernel, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}); + = alpaka::getValidWorkDivForKernel(dev, kernelBundle, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}); - // Test isValidWorkDivKernel function - auto const isValid = alpaka::isValidWorkDivKernel(dev, bundeledKernel, workDiv); - CHECK(isValid == true); + // Test the isValidWorkDivKernel function + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, workDiv)); - // The valid workdiv values for this kernel might change depending on the GPU type and compiler. + // The valid workdiv values for the kernel may change depending on the GPU type and compiler. // Therefore the generated workdiv is not compared to a specific workdiv in this test. + + // Get calculated threads per block from the workDiv that was found by examining the kernel function. + Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); + + // Get the device limit. + Idx const threadsPerBlockLimit = props.m_blockThreadCountMax; + + // Check that the number of threads per block is within the device limit. + CHECK(threadsPerBlock <= threadsPerBlockLimit); + + // Check that using the maximum number of threads per block is valid. + auto const validWorkDiv + = WorkDiv{Vec{8, threadsPerGridTestValue / threadsPerBlock / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}}; + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, validWorkDiv)); + + // Check that using too many threads per block is not valid. 
+ auto const invalidThreads = WorkDiv{Vec{1, 1}, Vec{2, threadsPerBlockLimit}, Vec{1, 1}}; + CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, invalidThreads)); + + // Check that a work division with a single block, thread and element is always valid + auto const serialWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 1}, Vec{1, 1}}; + CHECK(alpaka::isValidWorkDivKernel(dev, kernelBundle, serialWorkDiv)); + + // Some accelerators support only one thread per block: + if constexpr(alpaka::isSingleThreadAcc) + { + // Check that the computed work division uses a single thread per block. + auto const expectedWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}}; + CHECK(workDiv == expectedWorkDiv); + + // Check that a work division with more than one thread per block is not valid. + auto const parallelWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 2}, Vec{1, 1}}; + CHECK(not alpaka::isValidWorkDivKernel(dev, kernelBundle, parallelWorkDiv)); + } + + // Check the maxDynamicSharedSizeBytes for CPU backends if constexpr(alpaka::accMatchesTags< Acc, - alpaka::TagGpuCudaRt, - alpaka::TagGpuHipRt, + alpaka::TagCpuSerial, alpaka::TagCpuThreads, + alpaka::TagCpuOmp2Blocks, alpaka::TagCpuOmp2Threads, - alpaka::TagCpuSycl, - alpaka::TagFpgaSyclIntel, - alpaka::TagGpuSyclIntel, - alpaka::TagGenericSycl>) - { - // Get calculated threads per block from the workDiv that was found by examining the kernel function - auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod(); - // Get hard limits - auto const threadsPerBlockLimit = props.m_blockThreadCountMax; - // Check that the number of threads per block is within the device limit. - CHECK(threadsPerBlock <= static_cast(threadsPerBlockLimit)); - - // Check that using the maximum number of threads per block is valid. 
- auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}}; - CHECK(alpaka::isValidWorkDivKernel(dev, bundeledKernel, validWorkDiv)); - - // Check that using too many threads per block is not valid. - auto const invalidWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{20, threadsPerBlock}, Vec{1, 1}}; - CHECK(not alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv)); - } - else if constexpr(alpaka:: - accMatchesTags) - { - // CPU must have only 1 thread per block. In other words, number of blocks is equal to number of threads. - CHECK(workDiv == WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}}); - // Test a new 2D workdiv. Threads per block can not be larger than 1 for CPU. Hence 2x1 threads is not valid. - auto const invalidWorkDiv2D = WorkDiv{Vec{1, 2048}, Vec{1, 2}, Vec{1, 1}}; - auto const isWorkDivValidForCpu = alpaka::isValidWorkDivKernel(dev, bundeledKernel, invalidWorkDiv2D); - CHECK(isWorkDivValidForCpu == false); - - // Check maxDynamicSharedSizeBytes for CPU backends - CHECK( - alpaka::getFunctionAttributes(dev, bundeledKernel).maxDynamicSharedSizeBytes - == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); - } - else + alpaka::TagCpuTbbBlocks>) { - throw std::invalid_argument("Acc type is not among tested Accs."); + int const maxDynamicSharedSizeBytes + = alpaka::getFunctionAttributes(dev, kernelBundle).maxDynamicSharedSizeBytes; + CHECK(maxDynamicSharedSizeBytes == static_cast(alpaka::BlockSharedDynMemberAllocKiB * 1024)); } }