alpaka-group · psychocoderHPC · Aug 1, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/test/unit/workDiv/src/WorkDivForKernelTest.cpp b/test/unit/workDiv/src/WorkDivForKernelTest.cpp
@@ -69,7 +69,7 @@ struct TestKernelWithManyRegisters
         double sum = var0 + var1 + var2 + var3 + var4 + var5 + var6 + var7 + var8 + var9 + var10 + var11 + var12
                      + var13 + var14 + var15 + var16 + var17 + var18 + var19 + var20 + var21 + var22 + var23 + var24
                      + var25 + var26 + var27 + var28 + var29 + var30 + var31 + var32 + var33 + var34 + var35;
-        printf("The sum is %5.2f, the argument is %lu ", sum, val);
+        printf("The sum is %5.2f, the argument is %lu\n", sum, val);
     }
 };
 
@@ -86,68 +86,64 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.1D", "[workDivKernel]", TestAc
     auto const dev = alpaka::getDevByIdx(platform, 0);
 
     TestKernelWithManyRegisters kernel;
-    auto const bundeledKernel = alpaka::KernelBundle(kernel, 200ul);
+    auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul);
 
-    // Get hard limits for test
-    auto const props = alpaka::getAccDevProps<Acc, decltype(dev)>(dev);
+    // Get the device properties and hard limits
+    auto const props = alpaka::getAccDevProps<Acc>(dev);
     Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
 
-    // Test getValidWorkDivForKernel for threadsPerGridTestValue threads per grid
+    // Test the getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid.
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(dev, bundeledKernel, Vec{threadsPerGridTestValue}, Vec{1});
-    // Test validity
-    auto const isValid = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, workDiv);
-    CHECK(isValid == true);
+        = alpaka::getValidWorkDivForKernel<Acc>(dev, kernelBundle, Vec{threadsPerGridTestValue}, Vec{1});
 
-    if constexpr(alpaka::accMatchesTags<Acc, alpaka::TagGpuCudaRt>)
-    {
-        // Get calculated threads per block from the workDiv found by examining kernel function
-        auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
-        // Get hard limits
-        auto const threadsPerBlockLimit = props.m_blockThreadCountMax;
-
-        // Depending on the GPU type or the compiler the test below might fail because threadsPerBlock can be equal to
-        // threadsPerBlockLimit, which is the max device limit.
-        CHECK(threadsPerBlock < static_cast<Idx>(threadsPerBlockLimit));
-    }
-    else if constexpr(alpaka::accMatchesTags<
-                          Acc,
-                          alpaka::TagGpuHipRt,
-                          alpaka::TagCpuThreads,
-                          alpaka::TagCpuOmp2Threads,
-                          alpaka::TagFpgaSyclIntel,
-                          alpaka::TagGpuSyclIntel,
-                          alpaka::TagGenericSycl>)
-    {
-        // Get calculated threads per block from the workDiv found by examining kernel function
-        auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
-        // Get hard limits
-        auto const threadsPerBlockLimit = props.m_blockThreadCountMax;
+    // Test the isValidWorkDivKernel function
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, workDiv));
 
-        CHECK(threadsPerBlock <= static_cast<Idx>(threadsPerBlockLimit));
-    }
-    else if constexpr(alpaka::accMatchesTags<
-                          Acc,
-                          alpaka::TagCpuSerial,
-                          alpaka::TagCpuOmp2Blocks,
-                          alpaka::TagCpuTbbBlocks,
-                          alpaka::TagCpuSycl>)
+    // Get calculated threads per block from the workDiv that was found by examining the kernel function.
+    Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
+
+    // Get the device limit.
+    Idx const threadsPerBlockLimit = props.m_blockThreadCountMax;
+
+    // Check that the number of threads per block is within the device limit.
+    CHECK(threadsPerBlock <= threadsPerBlockLimit);
+
+    // Check that using the maximum number of threads per block is valid.
+    auto const validWorkDiv = WorkDiv{Vec{threadsPerGridTestValue / threadsPerBlock}, Vec{threadsPerBlock}, Vec{1}};
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, validWorkDiv));
+
+    // Check that using too many threads per block is not valid.
+    auto const invalidThreads = WorkDiv{Vec{1}, Vec{2 * threadsPerBlockLimit}, Vec{1}};
+    CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, invalidThreads));
+
+    // Check that a work division with a single block, thread and element is always valid
+    auto const serialWorkDiv = WorkDiv{Vec{1}, Vec{1}, Vec{1}};
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, serialWorkDiv));
+
+    // Some accelerators support only one thread per block:
+    if constexpr(alpaka::isSingleThreadAcc<Acc>)
     {
-        // CPU must have only 1 thread per block. In other words, number of blocks is equal to number of threads.
-        CHECK(workDiv == WorkDiv{Vec{threadsPerGridTestValue}, Vec{1}, Vec{1}});
-        // Test a new 1D workdiv. Threads per block can not be larger than 1 for CPU. Hence 2 is not valid.
-        auto const workDiv1DUsingInitList = WorkDiv{Vec{threadsPerGridTestValue / 2}, Vec{2}, Vec{1}};
-        auto const isWorkDivValidForCPU
-            = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, workDiv1DUsingInitList);
-        CHECK(isWorkDivValidForCPU == false);
-        // Check maxDynamicSharedSizeBytes for CPU backends
-        auto const funcAttributes = alpaka::getFunctionAttributes<Acc>(dev, bundeledKernel);
-        CHECK(
-            funcAttributes.maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
+        // Check that the computed work division uses a single thread per block.
+        auto const expectedWorkDiv = WorkDiv{Vec{threadsPerGridTestValue}, Vec{1}, Vec{1}};
+        CHECK(workDiv == expectedWorkDiv);
+
+        // Check that a work division with more than one thread per block is not valid.
+        auto const parallelWorkDiv = WorkDiv{Vec{1}, Vec{2}, Vec{1}};
+        CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, parallelWorkDiv));
     }
-    else
+
+    // Check the maxDynamicSharedSizeBytes for CPU backends
+    if constexpr(alpaka::accMatchesTags<
+                     Acc,
+                     alpaka::TagCpuSerial,
+                     alpaka::TagCpuThreads,
+                     alpaka::TagCpuOmp2Blocks,
+                     alpaka::TagCpuOmp2Threads,
+                     alpaka::TagCpuTbbBlocks>)
     {
-        throw std::invalid_argument("Acc type is not among tested Accs.");
+        int const maxDynamicSharedSizeBytes
+            = alpaka::getFunctionAttributes<Acc>(dev, kernelBundle).maxDynamicSharedSizeBytes;
+        CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
     }
 }
 
@@ -164,97 +160,67 @@ TEMPLATE_LIST_TEST_CASE("getValidWorkDivForKernel.2D", "[workDivKernel]", TestAc
     auto const dev = alpaka::getDevByIdx(platform, 0);
 
     TestKernelWithManyRegisters kernel;
-    // A random value
-    size_t val(200ul);
-    auto const bundeledKernel = alpaka::KernelBundle(kernel, val);
+    auto const kernelBundle = alpaka::KernelBundle(kernel, 200ul);
 
-    // Get hard limits for test
+    // Get the device properties and hard limits
     auto const props = alpaka::getAccDevProps<Acc>(dev);
     Idx const threadsPerGridTestValue = props.m_blockThreadCountMax * props.m_gridBlockCountMax;
 
     // Test getValidWorkDivForKernel function for threadsPerGridTestValue threads per grid.
     auto const workDiv
-        = alpaka::getValidWorkDivForKernel<Acc>(dev, bundeledKernel, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1});
+        = alpaka::getValidWorkDivForKernel<Acc>(dev, kernelBundle, Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1});
 
-    // Test isValidWorkDivKernel function
-    auto const isValid = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, workDiv);
-    CHECK(isValid == true);
+    // Test the isValidWorkDivKernel function
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, workDiv));
 
-    if constexpr(alpaka::accMatchesTags<Acc, alpaka::TagGpuCudaRt>)
-    {
-        // Expected valid workdiv values for this kernel might change depending on the GPU type and compiler. Therefore
-        // generated workdiv is not compared to a specific workdiv in this test.
-
-        // Get calculated threads per block from the workDiv that was found by examining kernel function
-        auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
-        // Get hard limits
-        auto const threadsPerBlockLimit = props.m_blockThreadCountMax;
-
-        // Depending on the GPU type or the compiler the test below might fail because threadsPerBlock can be equal to
-        // threadsPerBlockLimit, which is the max device limit.
-        CHECK(threadsPerBlock < static_cast<Idx>(threadsPerBlockLimit));
-
-        // too many threads per block
-        auto const invalidWorkDiv
-            = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{2 * threadsPerBlock, 1}, Vec{1, 1}};
-        auto isWorkDivValidForCuda = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, invalidWorkDiv);
-        CHECK(isWorkDivValidForCuda == false);
-
-        auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}};
-        isWorkDivValidForCuda = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, validWorkDiv);
-        CHECK(isWorkDivValidForCuda == true);
-    }
-    else if constexpr(alpaka::accMatchesTags<
-                          Acc,
-                          alpaka::TagGpuHipRt,
-                          alpaka::TagCpuThreads,
-                          alpaka::TagCpuOmp2Threads,
-                          alpaka::TagFpgaSyclIntel,
-                          alpaka::TagGpuSyclIntel,
-                          alpaka::TagGenericSycl>)
-    {
-        // Get calculated threads per block from the workDiv that was found by examining the kernel function
-        auto const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
-        // Get hard limits
-        auto const threadsPerBlockLimit = props.m_blockThreadCountMax;
-        // Depending on the GPU type or the compiler this test might fail because threadsPerBlock can be less than
-        // threadsPerBlockLimit, which is the max device limit.
-        if(threadsPerBlockLimit == 1)
-            CHECK(threadsPerBlock == static_cast<Idx>(threadsPerBlockLimit));
-        else
-            CHECK(threadsPerBlock < static_cast<Idx>(threadsPerBlockLimit));
-
-        // too many threads per block
-        auto const invalidWorkDiv
-            = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{20 * threadsPerBlock, 1}, Vec{1, 1}};
-        auto isWorkDivValidForHip = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, invalidWorkDiv);
-        CHECK(isWorkDivValidForHip == false);
-
-        auto const validWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}};
-        isWorkDivValidForHip = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, validWorkDiv);
-        CHECK(isWorkDivValidForHip == true);
-    }
-    else if constexpr(alpaka::accMatchesTags<
-                          Acc,
-                          alpaka::TagCpuSerial,
-                          alpaka::TagCpuOmp2Blocks,
-                          alpaka::TagCpuTbbBlocks,
-                          alpaka::TagCpuSycl>)
+    // The valid workdiv values for the kernel may change depending on the GPU type and compiler.
+    // Therefore the generated workdiv is not compared to a specific workdiv in this test.
+
+    // Get calculated threads per block from the workDiv that was found by examining the kernel function.
+    Idx const threadsPerBlock = workDiv.m_blockThreadExtent.prod();
+
+    // Get the device limit.
+    Idx const threadsPerBlockLimit = props.m_blockThreadCountMax;
+
+    // Check that the number of threads per block is within the device limit.
+    CHECK(threadsPerBlock <= threadsPerBlockLimit);
+
+    // Check that using the maximum number of threads per block is valid.
+    auto const validWorkDiv
+        = WorkDiv{Vec{8, threadsPerGridTestValue / threadsPerBlock / 8}, Vec{1, threadsPerBlock}, Vec{1, 1}};
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, validWorkDiv));
+
+    // Check that using too many threads per block is not valid.
+    auto const invalidThreads = WorkDiv{Vec{1, 1}, Vec{2, threadsPerBlockLimit}, Vec{1, 1}};
+    CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, invalidThreads));
+
+    // Check that a work division with a single block, thread and element is always valid
+    auto const serialWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 1}, Vec{1, 1}};
+    CHECK(alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, serialWorkDiv));
+
+    // Some accelerators support only one thread per block:
+    if constexpr(alpaka::isSingleThreadAcc<Acc>)
     {
-        // CPU must have only 1 thread per block. In other words, number of blocks is equal to number of threads.
-        CHECK(workDiv == WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}});
-        // Test a new 2D workdiv. Threads per block can not be larger than 1 for CPU. Hence 2x1 threads is not valid.
-        auto const invalidWorkDiv2D = WorkDiv{Vec{1, 2048}, Vec{1, 2}, Vec{1, 1}};
-        auto const isWorkDivValidForCpu = alpaka::isValidWorkDivKernel<Acc>(dev, bundeledKernel, invalidWorkDiv2D);
-        CHECK(isWorkDivValidForCpu == false);
-
-        // Check maxDynamicSharedSizeBytes for CPU backends
-        CHECK(
-            alpaka::getFunctionAttributes<Acc>(dev, bundeledKernel).maxDynamicSharedSizeBytes
-            == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
+        // Check that the computed work division uses a single thread per block.
+        auto const expectedWorkDiv = WorkDiv{Vec{8, threadsPerGridTestValue / 8}, Vec{1, 1}, Vec{1, 1}};
+        CHECK(workDiv == expectedWorkDiv);
+
+        // Check that a work division with more than one thread per block is not valid.
+        auto const parallelWorkDiv = WorkDiv{Vec{1, 1}, Vec{1, 2}, Vec{1, 1}};
+        CHECK(not alpaka::isValidWorkDivKernel<Acc>(dev, kernelBundle, parallelWorkDiv));
     }
-    else
+
+    // Check the maxDynamicSharedSizeBytes for CPU backends
+    if constexpr(alpaka::accMatchesTags<
+                     Acc,
+                     alpaka::TagCpuSerial,
+                     alpaka::TagCpuThreads,
+                     alpaka::TagCpuOmp2Blocks,
+                     alpaka::TagCpuOmp2Threads,
+                     alpaka::TagCpuTbbBlocks>)
     {
-        throw std::invalid_argument("Acc type is not among tested Accs.");
+        int const maxDynamicSharedSizeBytes
+            = alpaka::getFunctionAttributes<Acc>(dev, kernelBundle).maxDynamicSharedSizeBytes;
+        CHECK(maxDynamicSharedSizeBytes == static_cast<int>(alpaka::BlockSharedDynMemberAllocKiB * 1024));
     }
 }