diff --git a/example/matrixAddWithMdspan/src/matrixAddMdSpan.cpp b/example/matrixAddWithMdspan/src/matrixAddMdSpan.cpp
index e80edfb0b22..d14910d90ef 100644
--- a/example/matrixAddWithMdspan/src/matrixAddMdSpan.cpp
+++ b/example/matrixAddWithMdspan/src/matrixAddMdSpan.cpp
@@ -37,16 +37,15 @@ struct MatrixAddKernel
     //! \param A First input matrix
     //! \param B Second input matrix
     //! \param C Output matrix where the result of A + B will be stored
-    template<typename TAcc, typename MdSpan>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, MdSpan A, MdSpan B, MdSpan C) const
+    template<typename TAcc, typename TMdSpan>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
     {
         // compile time check
-        static_assert(isMdspan<MdSpan>::value, "The type MdSpan should be an std mdspan");
+        static_assert(isMdspan<TMdSpan>::value, "The type MdSpan should be an std mdspan");
 
         auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
         auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
 
-        static_assert(isMdspan<MdSpan>::value, "The type MdSpan should be an std mdspan");
         if(i < A.extent(0) && j < A.extent(1))
         {
             C(i, j) = A(i, j) + B(i, j);
@@ -130,7 +129,6 @@ auto example(TAccTag const&) -> int
     auto mdDevB = alpaka::experimental::getMdSpan(bufDevB);
     auto mdDevC = alpaka::experimental::getMdSpan(bufDevC);
 
-
     //  Let alpaka calculate good block and grid sizes given our full problem extent.
     auto const workDiv = alpaka::getValidWorkDiv<Acc>(
         devAcc,
@@ -142,11 +140,9 @@ auto example(TAccTag const&) -> int
     // Execute the kernel
     alpaka::exec<Acc>(queue, workDiv, MatrixAddKernel{}, mdDevA, mdDevB, mdDevC);
 
-    // Wait for the kernel to finish
-    alpaka::wait(queue);
-
     // Copy result back to host
     alpaka::memcpy(queue, bufHostC, bufDevC);
+    // Wait for the copy to finish; this wait is not necessary if the queue is a blocking queue
     alpaka::wait(queue);
 
     // Verify the result
diff --git a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
index 96b5b55333d..a19960e3842 100644
--- a/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
+++ b/example/matrixMulWithMdspan/src/matrixMulMdSpan.cpp
@@ -43,11 +43,14 @@ struct MatrixMulKernel
     //! \param B Second input matrix
     //! \param C Output matrix where the result of A * B will be stored
     //! \param K The shared dimension between A and B
-    template<typename TAcc, typename MdSpan>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc, MdSpan A, MdSpan B, MdSpan C, Idx K) const
+    template<typename TAcc, typename TMdSpan>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc, TMdSpan A, TMdSpan B, TMdSpan C) const
     {
-        // compile time check
-        static_assert(isMdspan<MdSpan>::value, "The type MdSpan should be an std mdspan");
+        // compile time checks
+        static_assert(isMdspan<TMdSpan>::value, "The type MdSpan should be an std mdspan");
+
+        // A is MxK and B is KxN, so the shared dimension K is the second extent of A
+        auto const K = static_cast<Idx>(A.extent(1));
 
         auto const i = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
         auto const j = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[1];
@@ -146,13 +149,11 @@ auto example(TAccTag const&) -> int
         alpaka::GridBlockExtentSubDivRestrictions::Unrestricted);
 
     // Execute the kernel
-    alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC, K);
-
-    // Wait for the kernel to finish
-    alpaka::wait(queue);
+    alpaka::exec<Acc>(queue, workDiv, MatrixMulKernel{}, mdDevA, mdDevB, mdDevC);
 
     // Copy result back to host
     alpaka::memcpy(queue, bufHostC, bufDevC);
+    // Wait for the copy to finish; this wait is not necessary if the queue is a blocking queue
     alpaka::wait(queue);
 
     // Verify the result