From 4fb4845c3f291ba30344b3a0272940c6173139f9 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 29 Jan 2024 10:43:20 +0000
Subject: [PATCH 1/8] refactor: Make all transpose operations work on a single
 field.

---
 src/backend.f90      | 18 ++----------------
 src/cuda/backend.f90 | 12 ++++++------
 src/omp/backend.f90  | 12 ++++++------
 src/solver.f90       | 14 +++++++++++---
 4 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index 7e75118bd..4d7c27330 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -28,8 +28,8 @@ module m_base_backend
       procedure(transeq_ders), deferred :: transeq_y
       procedure(transeq_ders), deferred :: transeq_z
       procedure(tds_solve), deferred :: tds_solve
-      procedure(transposer), deferred :: trans_x2y
-      procedure(transposer), deferred :: trans_x2z
+      procedure(trans_d2d), deferred :: trans_x2y
+      procedure(trans_d2d), deferred :: trans_x2z
       procedure(trans_d2d), deferred :: trans_y2z
       procedure(trans_d2d), deferred :: trans_z2y
       procedure(trans_d2d), deferred :: trans_y2x
@@ -81,20 +81,6 @@ end subroutine tds_solve
    end interface
 
    abstract interface
-      subroutine transposer(self, u_, v_, w_, u, v, w)
-         !! transposer subroutines are straightforward, they rearrange
-         !! data into our specialist data structure so that regardless
-         !! of the direction tridiagonal systems are solved efficiently
-         !! and fast.
-         import :: base_backend_t
-         import :: field_t
-         implicit none
-
-         class(base_backend_t) :: self
-         class(field_t), intent(inout) :: u_, v_, w_
-         class(field_t), intent(in) :: u, v, w
-      end subroutine transposer
-
       subroutine trans_d2d(self, u_, u)
          !! transposer subroutines are straightforward, they rearrange
          !! data into our specialist data structure so that regardless
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 721f6d16c..7cac4ff7f 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -415,21 +415,21 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)
 
    end subroutine tds_solve_dist
 
-   subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
+   subroutine trans_x2y_cuda(self, u_y, u)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_y, v_y, w_y
-      class(field_t), intent(in) :: u, v, w
+      class(field_t), intent(inout) :: u_y
+      class(field_t), intent(in) :: u
 
    end subroutine trans_x2y_cuda
 
-   subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
+   subroutine trans_x2z_cuda(self, u_z, u)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_z, v_z, w_z
-      class(field_t), intent(in) :: u, v, w
+      class(field_t), intent(inout) :: u_z
+      class(field_t), intent(in) :: u
 
    end subroutine trans_x2z_cuda
 
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 645137cb2..00f9a563f 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -164,21 +164,21 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)
 
    end subroutine tds_solve_omp
 
-   subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
+   subroutine trans_x2y_omp(self, u_, u)
       implicit none
 
       class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_, v_, w_
-      class(field_t), intent(in) :: u, v, w
+      class(field_t), intent(inout) :: u_
+      class(field_t), intent(in) :: u
 
    end subroutine trans_x2y_omp
 
-   subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
+   subroutine trans_x2z_omp(self, u_, u)
       implicit none
 
       class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_, v_, w_
-      class(field_t), intent(in) :: u, v, w
+      class(field_t), intent(inout) :: u_
+      class(field_t), intent(in) :: u
 
    end subroutine trans_x2z_omp
 
diff --git a/src/solver.f90 b/src/solver.f90
index 5eb1139c4..e7d42bb76 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -185,7 +185,10 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_x2y(u_y, v_y, w_y, u, v, w)
+      call self%backend%trans_x2y(u_y, u)
+      call self%backend%trans_x2y(v_y, v)
+      call self%backend%trans_x2y(w_y, w)
+
       ! similar to the x direction, obtain derivatives in y.
       call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)
 
@@ -206,7 +209,10 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_z => self%backend%allocator%get_block()
 
       ! reorder from x to z
-      call self%backend%trans_x2z(u_z, v_z, w_z, u, v, w)
+      call self%backend%trans_x2z(u_z, u)
+      call self%backend%trans_x2z(v_z, v)
+      call self%backend%trans_x2z(w_z, w)
+
       ! get the derivatives in z
       call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)
 
@@ -261,7 +267,9 @@ subroutine divergence(self, div_u, u, v, w)
       w_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_x2y(u_y, v_y, w_y, du_x, dv_x, dw_x)
+      call self%backend%trans_x2y(u_y, du_x)
+      call self%backend%trans_x2y(v_y, dv_x)
+      call self%backend%trans_x2y(w_y, dw_x)
 
       call self%backend%allocator%release_block(du_x)
       call self%backend%allocator%release_block(dv_x)

From 6f5b6fa49243d7018fb8060dba59da0b1aa5d72f Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 29 Jan 2024 10:44:33 +0000
Subject: [PATCH 2/8] feat(cuda): Add transpose kernels.

---
 src/CMakeLists.txt         |   1 +
 src/cuda/kernels_trans.f90 | 119 +++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 src/cuda/kernels_trans.f90

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c1a1ede0b..93281d59a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,6 +17,7 @@ set(CUDASRC
   cuda/allocator.f90
   cuda/exec_dist.f90
   cuda/kernels_dist.f90
+  cuda/kernels_trans.f90
   cuda/sendrecv.f90
   cuda/tdsops.f90
 )
diff --git a/src/cuda/kernels_trans.f90 b/src/cuda/kernels_trans.f90
new file mode 100644
index 000000000..489dc0d2a
--- /dev/null
+++ b/src/cuda/kernels_trans.f90
@@ -0,0 +1,119 @@
+module m_cuda_kernels_trans
+   use cudafor
+
+   use m_common, only: dp
+   use m_cuda_common, only: SZ
+
+contains
+
+   attributes(global) subroutine trans_x2y_k(u_y, u_x, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_y
+      real(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
+
+   end subroutine trans_x2y_k
+
+   attributes(global) subroutine trans_x2z_k(u_z, u_x, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_z
+      real(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      integer :: i, j, b_i, b_j, nx!, nz
+
+      i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
+      nx = gridDim%x
+
+      do j = 1, nz
+         u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
+      end do
+
+   end subroutine trans_x2z_k
+
+   attributes(global) subroutine trans_y2x_k(u_x, u_y, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_x
+      real(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine trans_y2x_k
+
+   attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_z
+      real(dp), device, intent(in), dimension(:, :, :) :: u_y
+      integer, value, intent(in) :: nx, nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)
+
+   end subroutine trans_y2z_k
+
+   attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_y
+      real(dp), device, intent(in), dimension(:, :, :) :: u_z
+      integer, value, intent(in) :: nx, nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)
+
+   end subroutine trans_z2y_k
+
+end module m_cuda_kernels_trans

From ac5e306f7172a01aa452103c7c9de4cdeb547598 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 29 Jan 2024 10:45:11 +0000
Subject: [PATCH 3/8] feat(cuda): Enable CUDA backend to call transpose
 kernels.

---
 src/backend.f90      |  1 +
 src/cuda/backend.f90 | 63 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

diff --git a/src/backend.f90 b/src/backend.f90
index 4d7c27330..93eb6bbd8 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -21,6 +21,7 @@ module m_base_backend
       !! architecture.
 
       real(dp) :: nu
+      integer :: nx_loc, ny_loc, nz_loc
       class(allocator_t), pointer :: allocator
       class(dirps_t), pointer :: xdirps, ydirps, zdirps
    contains
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 7cac4ff7f..212d9ce13 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -12,6 +12,8 @@ module m_cuda_backend
    use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
    use m_cuda_tdsops, only: cuda_tdsops_t
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
+   use m_cuda_kernels_trans, only: trans_x2y_k, trans_x2z_k, trans_y2x_k, &
+                                   trans_y2z_k, trans_z2y_k
 
    implicit none
 
@@ -74,6 +76,10 @@ function init(globs, allocator) result(backend)
       backend%zthreads = dim3(SZ, 1, 1)
       backend%zblocks = dim3(globs%n_groups_z, 1, 1)
 
+      backend%nx_loc = globs%nx_loc
+      backend%ny_loc = globs%ny_loc
+      backend%nz_loc = globs%nz_loc
+
       n_halo = 4
       n_block = globs%n_groups_x
 
@@ -422,6 +428,17 @@ subroutine trans_x2y_cuda(self, u_y, u)
       class(field_t), intent(inout) :: u_y
       class(field_t), intent(in) :: u
 
+      real(dp), device, pointer, dimension(:, :, :) :: u_d, u_y_d
+      type(dim3) :: blocks, threads
+
+      select type(u); type is (cuda_field_t); u_d => u%data_d; end select
+      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
+
+      blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
+      threads = dim3(SZ, SZ, 1)
+
+      call trans_x2y_k<<<blocks, threads>>>(u_y_d, u_d, self%nz_loc)
+
    end subroutine trans_x2y_cuda
 
    subroutine trans_x2z_cuda(self, u_z, u)
@@ -431,6 +448,17 @@ subroutine trans_x2z_cuda(self, u_z, u)
       class(field_t), intent(inout) :: u_z
       class(field_t), intent(in) :: u
 
+      real(dp), device, pointer, dimension(:, :, :) :: u_d, u_z_d
+      type(dim3) :: blocks, threads
+
+      select type(u); type is (cuda_field_t); u_d => u%data_d; end select
+      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
+
+      blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
+      threads = dim3(SZ, 1, 1)
+
+      call trans_x2z_k<<<blocks, threads>>>(u_z_d, u_d, self%nz_loc)
+
    end subroutine trans_x2z_cuda
 
    subroutine trans_y2z_cuda(self, u_z, u_y)
@@ -440,6 +468,18 @@ subroutine trans_y2z_cuda(self, u_z, u_y)
       class(field_t), intent(inout) :: u_z
       class(field_t), intent(in) :: u_y
 
+      real(dp), device, pointer, dimension(:, :, :) :: u_z_d, u_y_d
+      type(dim3) :: blocks, threads
+
+      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
+      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
+
+      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+      threads = dim3(SZ, SZ, 1)
+
+      call trans_y2z_k<<<blocks, threads>>>(u_z_d, u_y_d, &
+                                            self%nx_loc, self%nz_loc)
+
    end subroutine trans_y2z_cuda
 
    subroutine trans_z2y_cuda(self, u_y, u_z)
@@ -449,6 +489,18 @@ subroutine trans_z2y_cuda(self, u_y, u_z)
       class(field_t), intent(inout) :: u_y
       class(field_t), intent(in) :: u_z
 
+      real(dp), device, pointer, dimension(:, :, :) :: u_y_d, u_z_d
+      type(dim3) :: blocks, threads
+
+      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
+      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
+
+      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+      threads = dim3(SZ, SZ, 1)
+
+      call trans_z2y_k<<<blocks, threads>>>(u_y_d, u_z_d, &
+                                            self%nx_loc, self%nz_loc)
+
    end subroutine trans_z2y_cuda
 
    subroutine trans_y2x_cuda(self, u_x, u_y)
@@ -458,6 +510,17 @@ subroutine trans_y2x_cuda(self, u_x, u_y)
       class(field_t), intent(inout) :: u_x
       class(field_t), intent(in) :: u_y
 
+      real(dp), device, pointer, dimension(:, :, :) :: u_x_d, u_y_d
+      type(dim3) :: blocks, threads
+
+      select type(u_x); type is (cuda_field_t); u_x_d => u_x%data_d; end select
+      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
+
+      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+      threads = dim3(SZ, SZ, 1)
+
+      call trans_y2x_k<<<blocks, threads>>>(u_x_d, u_y_d, self%nz_loc)
+
    end subroutine trans_y2x_cuda
 
    subroutine sum_yzintox_cuda(self, du, dv, dw, &

From 1e920eb1bc78ebcd9629dfd2f72d9b7a0dbb8daa Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Tue, 30 Jan 2024 16:26:35 +0000
Subject: [PATCH 4/8] refactor: Merge all transpose functions.

---
 src/backend.f90      |   9 +--
 src/cuda/backend.f90 | 137 ++++++++++++-------------------------------
 src/omp/backend.f90  |  47 ++-------------
 src/solver.f90       |  32 +++++-----
 4 files changed, 61 insertions(+), 164 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index 93eb6bbd8..01b449276 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -29,11 +29,7 @@ module m_base_backend
       procedure(transeq_ders), deferred :: transeq_y
       procedure(transeq_ders), deferred :: transeq_z
       procedure(tds_solve), deferred :: tds_solve
-      procedure(trans_d2d), deferred :: trans_x2y
-      procedure(trans_d2d), deferred :: trans_x2z
-      procedure(trans_d2d), deferred :: trans_y2z
-      procedure(trans_d2d), deferred :: trans_z2y
-      procedure(trans_d2d), deferred :: trans_y2x
+      procedure(trans_d2d), deferred :: trans_d2d
       procedure(sum9into3), deferred :: sum_yzintox
       procedure(vecadd), deferred :: vecadd
       procedure(get_fields), deferred :: get_fields
@@ -82,7 +78,7 @@ end subroutine tds_solve
    end interface
 
    abstract interface
-      subroutine trans_d2d(self, u_, u)
+      subroutine trans_d2d(self, u_, u, direction)
          !! transposer subroutines are straightforward, they rearrange
          !! data into our specialist data structure so that regardless
          !! of the direction tridiagonal systems are solved efficiently
@@ -94,6 +90,7 @@ subroutine trans_d2d(self, u_, u)
          class(base_backend_t) :: self
          class(field_t), intent(inout) :: u_
          class(field_t), intent(in) :: u
+         integer, intent(in) :: direction
       end subroutine trans_d2d
    end interface
 
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 212d9ce13..8cd1dcd72 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -34,11 +34,7 @@ module m_cuda_backend
       procedure :: transeq_y => transeq_y_cuda
       procedure :: transeq_z => transeq_z_cuda
       procedure :: tds_solve => tds_solve_cuda
-      procedure :: trans_x2y => trans_x2y_cuda
-      procedure :: trans_x2z => trans_x2z_cuda
-      procedure :: trans_y2z => trans_y2z_cuda
-      procedure :: trans_z2y => trans_z2y_cuda
-      procedure :: trans_y2x => trans_y2x_cuda
+      procedure :: trans_d2d => trans_d2d_cuda
       procedure :: sum_yzintox => sum_yzintox_cuda
       procedure :: vecadd => vecadd_cuda
       procedure :: set_fields => set_fields_cuda
@@ -421,107 +417,50 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)
 
    end subroutine tds_solve_dist
 
-   subroutine trans_x2y_cuda(self, u_y, u)
+   subroutine trans_d2d_cuda(self, u_o, u_i, direction)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_y
-      class(field_t), intent(in) :: u
-
-      real(dp), device, pointer, dimension(:, :, :) :: u_d, u_y_d
-      type(dim3) :: blocks, threads
-
-      select type(u); type is (cuda_field_t); u_d => u%data_d; end select
-      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
-
-      blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
-      threads = dim3(SZ, SZ, 1)
-
-      call trans_x2y_k<<<blocks, threads>>>(u_y_d, u_d, self%nz_loc)
-
-   end subroutine trans_x2y_cuda
-
-   subroutine trans_x2z_cuda(self, u_z, u)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_z
-      class(field_t), intent(in) :: u
+      class(field_t), intent(inout) :: u_o
+      class(field_t), intent(in) :: u_i
+      integer, intent(in) :: direction
 
-      real(dp), device, pointer, dimension(:, :, :) :: u_d, u_z_d
+      real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
       type(dim3) :: blocks, threads
 
-      select type(u); type is (cuda_field_t); u_d => u%data_d; end select
-      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
-
-      blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
-      threads = dim3(SZ, 1, 1)
-
-      call trans_x2z_k<<<blocks, threads>>>(u_z_d, u_d, self%nz_loc)
-
-   end subroutine trans_x2z_cuda
-
-   subroutine trans_y2z_cuda(self, u_z, u_y)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_z
-      class(field_t), intent(in) :: u_y
-
-      real(dp), device, pointer, dimension(:, :, :) :: u_z_d, u_y_d
-      type(dim3) :: blocks, threads
-
-      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
-      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
-
-      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
-      threads = dim3(SZ, SZ, 1)
-
-      call trans_y2z_k<<<blocks, threads>>>(u_z_d, u_y_d, &
-                                            self%nx_loc, self%nz_loc)
-
-   end subroutine trans_y2z_cuda
-
-   subroutine trans_z2y_cuda(self, u_y, u_z)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_y
-      class(field_t), intent(in) :: u_z
-
-      real(dp), device, pointer, dimension(:, :, :) :: u_y_d, u_z_d
-      type(dim3) :: blocks, threads
-
-      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
-      select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
-
-      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
-      threads = dim3(SZ, SZ, 1)
-
-      call trans_z2y_k<<<blocks, threads>>>(u_y_d, u_z_d, &
-                                            self%nx_loc, self%nz_loc)
-
-   end subroutine trans_z2y_cuda
-
-   subroutine trans_y2x_cuda(self, u_x, u_y)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u_x
-      class(field_t), intent(in) :: u_y
-
-      real(dp), device, pointer, dimension(:, :, :) :: u_x_d, u_y_d
-      type(dim3) :: blocks, threads
-
-      select type(u_x); type is (cuda_field_t); u_x_d => u_x%data_d; end select
-      select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
-
-      blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
-      threads = dim3(SZ, SZ, 1)
-
-      call trans_y2x_k<<<blocks, threads>>>(u_x_d, u_y_d, self%nz_loc)
+      select type(u_o); type is (cuda_field_t); u_o_d => u_o%data_d; end select
+      select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select
+
+      select case (direction)
+      case (12) ! x2y
+         blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
+         threads = dim3(SZ, SZ, 1)
+         call trans_x2y_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (13) ! x2z
+         blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
+         threads = dim3(SZ, 1, 1)
+         call trans_x2z_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (21) ! y2x
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call trans_y2x_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (23) ! y2z
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call trans_y2z_k<<<blocks, threads>>>(u_o_d, u_i_d, &
+                                               self%nx_loc, self%nz_loc)
+      case (32) ! z2y
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+
+         call trans_z2y_k<<<blocks, threads>>>(u_o_d, u_i_d, &
+                                               self%nx_loc, self%nz_loc)
+      case default
+         print *, 'Transpose direction is undefined.'
+         stop
+      end select
 
-   end subroutine trans_y2x_cuda
+   end subroutine trans_d2d_cuda
 
    subroutine sum_yzintox_cuda(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 00f9a563f..8d9ed3e71 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -24,11 +24,7 @@ module m_omp_backend
       procedure :: transeq_y => transeq_y_omp
       procedure :: transeq_z => transeq_z_omp
       procedure :: tds_solve => tds_solve_omp
-      procedure :: trans_x2y => trans_x2y_omp
-      procedure :: trans_x2z => trans_x2z_omp
-      procedure :: trans_y2z => trans_y2z_omp
-      procedure :: trans_z2y => trans_z2y_omp
-      procedure :: trans_y2x => trans_y2x_omp
+      procedure :: trans_d2d => trans_d2d_omp
       procedure :: sum_yzintox => sum_yzintox_omp
       procedure :: vecadd => vecadd_omp
       procedure :: set_fields => set_fields_omp
@@ -164,50 +160,15 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)
 
    end subroutine tds_solve_omp
 
-   subroutine trans_x2y_omp(self, u_, u)
+   subroutine trans_d2d_omp(self, u_, u, direction)
       implicit none
 
       class(omp_backend_t) :: self
       class(field_t), intent(inout) :: u_
       class(field_t), intent(in) :: u
+      integer, intent(in) :: direction
 
-   end subroutine trans_x2y_omp
-
-   subroutine trans_x2z_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_x2z_omp
-
-   subroutine trans_y2z_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_y2z_omp
-
-   subroutine trans_z2y_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_z2y_omp
-
-   subroutine trans_y2x_omp(self, u_, u)
-      implicit none
-
-      class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u_
-      class(field_t), intent(in) :: u
-
-   end subroutine trans_y2x_omp
+   end subroutine trans_d2d_omp
 
    subroutine sum_yzintox_omp(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/solver.f90 b/src/solver.f90
index e7d42bb76..92e7e17f3 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_x2y(u_y, u)
-      call self%backend%trans_x2y(v_y, v)
-      call self%backend%trans_x2y(w_y, w)
+      call self%backend%trans_d2d(u_y, u, 12)
+      call self%backend%trans_d2d(v_y, v, 12)
+      call self%backend%trans_d2d(w_y, w, 12)
 
       ! similar to the x direction, obtain derivatives in y.
       call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)
@@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_z => self%backend%allocator%get_block()
 
       ! reorder from x to z
-      call self%backend%trans_x2z(u_z, u)
-      call self%backend%trans_x2z(v_z, v)
-      call self%backend%trans_x2z(w_z, w)
+      call self%backend%trans_d2d(u_z, u, 13)
+      call self%backend%trans_d2d(v_z, v, 13)
+      call self%backend%trans_d2d(w_z, w, 13)
 
       ! get the derivatives in z
       call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)
@@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w)
       w_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_x2y(u_y, du_x)
-      call self%backend%trans_x2y(v_y, dv_x)
-      call self%backend%trans_x2y(w_y, dw_x)
+      call self%backend%trans_d2d(u_y, du_x, 12)
+      call self%backend%trans_d2d(v_y, dv_x, 12)
+      call self%backend%trans_d2d(w_y, dw_x, 12)
 
       call self%backend%allocator%release_block(du_x)
       call self%backend%allocator%release_block(dv_x)
@@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w)
       call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y)
 
       ! reorder from y to z
-      call self%backend%trans_y2z(u_z, du_y)
-      call self%backend%trans_y2z(w_z, dw_y)
+      call self%backend%trans_d2d(u_z, du_y, 23)
+      call self%backend%trans_d2d(w_z, dw_y, 23)
 
       ! release all the unnecessary blocks.
       call self%backend%allocator%release_block(du_y)
@@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sxy_y => self%backend%allocator%get_block()
 
       ! reorder data from z orientation to y orientation
-      call self%backend%trans_z2y(p_sxy_y, p_sxy_z)
-      call self%backend%trans_z2y(dpdz_sxy_y, dpdz_sxy_z)
+      call self%backend%trans_d2d(p_sxy_y, p_sxy_z, 32)
+      call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, 32)
 
       call self%backend%allocator%release_block(p_sxy_z)
       call self%backend%allocator%release_block(dpdz_sxy_z)
@@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sx_x => self%backend%allocator%get_block()
 
       ! reorder from y to x
-      call self%backend%trans_y2x(p_sx_x, p_sx_y)
-      call self%backend%trans_y2x(dpdy_sx_x, dpdy_sx_y)
-      call self%backend%trans_y2x(dpdz_sx_x, dpdz_sx_y)
+      call self%backend%trans_d2d(p_sx_x, p_sx_y, 21)
+      call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, 21)
+      call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, 21)
 
       ! release all the y directional fields.
       call self%backend%allocator%release_block(p_sx_y)

From 2f22c9f620fece82ddc81864b919e6b740663f19 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Tue, 30 Jan 2024 16:41:27 +0000
Subject: [PATCH 5/8] feat: Use an integer parameter for transpose direction.

---
 src/common.f90       |  3 +++
 src/cuda/backend.f90 | 12 ++++++------
 src/solver.f90       | 34 +++++++++++++++++-----------------
 3 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/src/common.f90 b/src/common.f90
index 4aad6f7c5..809c7bf2d 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -4,6 +4,9 @@ module m_common
    integer, parameter :: dp=kind(0.0d0)
    real(dp), parameter :: pi = 4*atan(1.0_dp)
 
+   integer, parameter :: TRP_X2Y = 12, TRP_X2Z = 13, TRP_Y2X = 21, &
+                         TRP_Y2Z = 23, TRP_Z2Y = 32
+
    type :: globs_t
       integer :: nx, ny, nz
       integer :: nx_loc, ny_loc, nz_loc
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 8cd1dcd72..d0651f70d 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -3,7 +3,7 @@ module m_cuda_backend
 
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
-   use m_common, only: dp, globs_t
+   use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y
    use m_tdsops, only: dirps_t, tdsops_t
 
    use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
@@ -432,24 +432,24 @@ subroutine trans_d2d_cuda(self, u_o, u_i, direction)
       select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select
 
       select case (direction)
-      case (12) ! x2y
+      case (TRP_X2Y) ! x2y
          blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
          threads = dim3(SZ, SZ, 1)
          call trans_x2y_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (13) ! x2z
+      case (TRP_X2Z) ! x2z
          blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
          threads = dim3(SZ, 1, 1)
          call trans_x2z_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (21) ! y2x
+      case (TRP_Y2X) ! y2x
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call trans_y2x_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (23) ! y2z
+      case (TRP_Y2Z) ! y2z
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call trans_y2z_k<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
-      case (32) ! z2y
+      case (TRP_Z2Y) ! z2y
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
 
diff --git a/src/solver.f90 b/src/solver.f90
index 92e7e17f3..67e520453 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -1,7 +1,7 @@
 module m_solver
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
-   use m_common, only: dp, globs_t
+   use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y
    use m_tdsops, only: tdsops_t, dirps_t
    use m_time_integrator, only: time_intg_t
 
@@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_d2d(u_y, u, 12)
-      call self%backend%trans_d2d(v_y, v, 12)
-      call self%backend%trans_d2d(w_y, w, 12)
+      call self%backend%trans_d2d(u_y, u, TRP_X2Y)
+      call self%backend%trans_d2d(v_y, v, TRP_X2Y)
+      call self%backend%trans_d2d(w_y, w, TRP_X2Y)
 
       ! similar to the x direction, obtain derivatives in y.
       call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)
@@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_z => self%backend%allocator%get_block()
 
       ! reorder from x to z
-      call self%backend%trans_d2d(u_z, u, 13)
-      call self%backend%trans_d2d(v_z, v, 13)
-      call self%backend%trans_d2d(w_z, w, 13)
+      call self%backend%trans_d2d(u_z, u, TRP_X2Z)
+      call self%backend%trans_d2d(v_z, v, TRP_X2Z)
+      call self%backend%trans_d2d(w_z, w, TRP_X2Z)
 
       ! get the derivatives in z
       call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)
@@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w)
       w_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_d2d(u_y, du_x, 12)
-      call self%backend%trans_d2d(v_y, dv_x, 12)
-      call self%backend%trans_d2d(w_y, dw_x, 12)
+      call self%backend%trans_d2d(u_y, du_x, TRP_X2Y)
+      call self%backend%trans_d2d(v_y, dv_x, TRP_X2Y)
+      call self%backend%trans_d2d(w_y, dw_x, TRP_X2Y)
 
       call self%backend%allocator%release_block(du_x)
       call self%backend%allocator%release_block(dv_x)
@@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w)
       call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y)
 
       ! reorder from y to z
-      call self%backend%trans_d2d(u_z, du_y, 23)
-      call self%backend%trans_d2d(w_z, dw_y, 23)
+      call self%backend%trans_d2d(u_z, du_y, TRP_Y2Z)
+      call self%backend%trans_d2d(w_z, dw_y, TRP_Y2Z)
 
       ! release all the unnecessary blocks.
       call self%backend%allocator%release_block(du_y)
@@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sxy_y => self%backend%allocator%get_block()
 
       ! reorder data from z orientation to y orientation
-      call self%backend%trans_d2d(p_sxy_y, p_sxy_z, 32)
-      call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, 32)
+      call self%backend%trans_d2d(p_sxy_y, p_sxy_z, TRP_Z2Y)
+      call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, TRP_Z2Y)
 
       call self%backend%allocator%release_block(p_sxy_z)
       call self%backend%allocator%release_block(dpdz_sxy_z)
@@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sx_x => self%backend%allocator%get_block()
 
       ! reorder from y to x
-      call self%backend%trans_d2d(p_sx_x, p_sx_y, 21)
-      call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, 21)
-      call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, 21)
+      call self%backend%trans_d2d(p_sx_x, p_sx_y, TRP_Y2X)
+      call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, TRP_Y2X)
+      call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, TRP_Y2X)
 
       ! release all the y directional fields.
       call self%backend%allocator%release_block(p_sx_y)

From 5f5f1293e2c254bef77b00f8e705f5e9edc2e1cf Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Tue, 30 Jan 2024 17:02:15 +0000
Subject: [PATCH 6/8] refactor: Rename trans/transpose -> reorder.

---
 src/CMakeLists.txt                            |  2 +-
 src/backend.f90                               |  8 ++---
 src/common.f90                                |  4 +--
 src/cuda/backend.f90                          | 32 ++++++++---------
 ...{kernels_trans.f90 => kernels_reorder.f90} | 24 ++++++-------
 src/omp/backend.f90                           |  6 ++--
 src/solver.f90                                | 34 +++++++++----------
 7 files changed, 55 insertions(+), 55 deletions(-)
 rename src/cuda/{kernels_trans.f90 => kernels_reorder.f90} (84%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 93281d59a..88a9480b6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,7 +17,7 @@ set(CUDASRC
   cuda/allocator.f90
   cuda/exec_dist.f90
   cuda/kernels_dist.f90
-  cuda/kernels_trans.f90
+  cuda/kernels_reorder.f90
   cuda/sendrecv.f90
   cuda/tdsops.f90
 )
diff --git a/src/backend.f90 b/src/backend.f90
index 01b449276..bf7c240cc 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -29,7 +29,7 @@ module m_base_backend
       procedure(transeq_ders), deferred :: transeq_y
       procedure(transeq_ders), deferred :: transeq_z
       procedure(tds_solve), deferred :: tds_solve
-      procedure(trans_d2d), deferred :: trans_d2d
+      procedure(reorder), deferred :: reorder
       procedure(sum9into3), deferred :: sum_yzintox
       procedure(vecadd), deferred :: vecadd
       procedure(get_fields), deferred :: get_fields
@@ -78,8 +78,8 @@ end subroutine tds_solve
    end interface
 
    abstract interface
-      subroutine trans_d2d(self, u_, u, direction)
-         !! transposer subroutines are straightforward, they rearrange
+      subroutine reorder(self, u_, u, direction)
+         !! reorder subroutines are straightforward, they rearrange
          !! data into our specialist data structure so that regardless
          !! of the direction tridiagonal systems are solved efficiently
          !! and fast.
@@ -91,7 +91,7 @@ subroutine trans_d2d(self, u_, u, direction)
          class(field_t), intent(inout) :: u_
          class(field_t), intent(in) :: u
          integer, intent(in) :: direction
-      end subroutine trans_d2d
+      end subroutine reorder
    end interface
 
    abstract interface
diff --git a/src/common.f90 b/src/common.f90
index 809c7bf2d..6712b356c 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -4,8 +4,8 @@ module m_common
    integer, parameter :: dp=kind(0.0d0)
    real(dp), parameter :: pi = 4*atan(1.0_dp)
 
-   integer, parameter :: TRP_X2Y = 12, TRP_X2Z = 13, TRP_Y2X = 21, &
-                         TRP_Y2Z = 23, TRP_Z2Y = 32
+   integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
+                         RDR_Y2Z = 23, RDR_Z2Y = 32
 
    type :: globs_t
       integer :: nx, ny, nz
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index d0651f70d..9eb7bcea2 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -3,7 +3,7 @@ module m_cuda_backend
 
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
-   use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y
+   use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
    use m_tdsops, only: dirps_t, tdsops_t
 
    use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
@@ -12,8 +12,8 @@ module m_cuda_backend
    use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
    use m_cuda_tdsops, only: cuda_tdsops_t
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
-   use m_cuda_kernels_trans, only: trans_x2y_k, trans_x2z_k, trans_y2x_k, &
-                                   trans_y2z_k, trans_z2y_k
+   use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
+                                     reorder_y2z, reorder_z2y
 
    implicit none
 
@@ -34,7 +34,7 @@ module m_cuda_backend
       procedure :: transeq_y => transeq_y_cuda
       procedure :: transeq_z => transeq_z_cuda
       procedure :: tds_solve => tds_solve_cuda
-      procedure :: trans_d2d => trans_d2d_cuda
+      procedure :: reorder => reorder_cuda
       procedure :: sum_yzintox => sum_yzintox_cuda
       procedure :: vecadd => vecadd_cuda
       procedure :: set_fields => set_fields_cuda
@@ -417,7 +417,7 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)
 
    end subroutine tds_solve_dist
 
-   subroutine trans_d2d_cuda(self, u_o, u_i, direction)
+   subroutine reorder_cuda(self, u_o, u_i, direction)
       implicit none
 
       class(cuda_backend_t) :: self
@@ -432,35 +432,35 @@ subroutine trans_d2d_cuda(self, u_o, u_i, direction)
       select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select
 
       select case (direction)
-      case (TRP_X2Y) ! x2y
+      case (RDR_X2Y) ! x2y
          blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
          threads = dim3(SZ, SZ, 1)
-         call trans_x2y_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (TRP_X2Z) ! x2z
+         call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_X2Z) ! x2z
          blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
          threads = dim3(SZ, 1, 1)
-         call trans_x2z_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (TRP_Y2X) ! y2x
+         call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_Y2X) ! y2x
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
-         call trans_y2x_k<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (TRP_Y2Z) ! y2z
+         call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_Y2Z) ! y2z
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
-         call trans_y2z_k<<<blocks, threads>>>(u_o_d, u_i_d, &
+         call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
-      case (TRP_Z2Y) ! z2y
+      case (RDR_Z2Y) ! z2y
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
 
-         call trans_z2y_k<<<blocks, threads>>>(u_o_d, u_i_d, &
+         call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
       case default
          print *, 'Transpose direction is undefined.'
          stop
       end select
 
-   end subroutine trans_d2d_cuda
+   end subroutine reorder_cuda
 
    subroutine sum_yzintox_cuda(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/cuda/kernels_trans.f90 b/src/cuda/kernels_reorder.f90
similarity index 84%
rename from src/cuda/kernels_trans.f90
rename to src/cuda/kernels_reorder.f90
index 489dc0d2a..f5f923216 100644
--- a/src/cuda/kernels_trans.f90
+++ b/src/cuda/kernels_reorder.f90
@@ -1,4 +1,4 @@
-module m_cuda_kernels_trans
+module m_cuda_kernels_reorder
    use cudafor
 
    use m_common, only: dp
@@ -6,7 +6,7 @@ module m_cuda_kernels_trans
 
 contains
 
-   attributes(global) subroutine trans_x2y_k(u_y, u_x, nz)
+   attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
       implicit none
 
       real(dp), device, intent(out), dimension(:, :, :) :: u_y
@@ -27,9 +27,9 @@ attributes(global) subroutine trans_x2y_k(u_y, u_x, nz)
       ! copy into output array from shared
       u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
 
-   end subroutine trans_x2y_k
+   end subroutine reorder_x2y
 
-   attributes(global) subroutine trans_x2z_k(u_z, u_x, nz)
+   attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
       implicit none
 
       real(dp), device, intent(out), dimension(:, :, :) :: u_z
@@ -45,9 +45,9 @@ attributes(global) subroutine trans_x2z_k(u_z, u_x, nz)
          u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
       end do
 
-   end subroutine trans_x2z_k
+   end subroutine reorder_x2z
 
-   attributes(global) subroutine trans_y2x_k(u_x, u_y, nz)
+   attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
       implicit none
 
       real(dp), device, intent(out), dimension(:, :, :) :: u_x
@@ -68,9 +68,9 @@ attributes(global) subroutine trans_y2x_k(u_x, u_y, nz)
       ! copy into output array from shared
       u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
 
-   end subroutine trans_y2x_k
+   end subroutine reorder_y2x
 
-   attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz)
+   attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
       implicit none
 
       real(dp), device, intent(out), dimension(:, :, :) :: u_z
@@ -91,9 +91,9 @@ attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz)
       ! copy into output array from shared
       u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)
 
-   end subroutine trans_y2z_k
+   end subroutine reorder_y2z
 
-   attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz)
+   attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
       implicit none
 
       real(dp), device, intent(out), dimension(:, :, :) :: u_y
@@ -114,6 +114,6 @@ attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz)
       ! copy into output array from shared
       u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)
 
-   end subroutine trans_z2y_k
+   end subroutine reorder_z2y
 
-end module m_cuda_kernels_trans
+end module m_cuda_kernels_reorder
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 8d9ed3e71..297ba9196 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -24,7 +24,7 @@ module m_omp_backend
       procedure :: transeq_y => transeq_y_omp
       procedure :: transeq_z => transeq_z_omp
       procedure :: tds_solve => tds_solve_omp
-      procedure :: trans_d2d => trans_d2d_omp
+      procedure :: reorder => reorder_omp
       procedure :: sum_yzintox => sum_yzintox_omp
       procedure :: vecadd => vecadd_omp
       procedure :: set_fields => set_fields_omp
@@ -160,7 +160,7 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)
 
    end subroutine tds_solve_omp
 
-   subroutine trans_d2d_omp(self, u_, u, direction)
+   subroutine reorder_omp(self, u_, u, direction)
       implicit none
 
       class(omp_backend_t) :: self
@@ -168,7 +168,7 @@ subroutine trans_d2d_omp(self, u_, u, direction)
       class(field_t), intent(in) :: u
       integer, intent(in) :: direction
 
-   end subroutine trans_d2d_omp
+   end subroutine reorder_omp
 
    subroutine sum_yzintox_omp(self, du, dv, dw, &
                                du_y, dv_y, dw_y, du_z, dv_z, dw_z)
diff --git a/src/solver.f90 b/src/solver.f90
index 67e520453..1ecea22c8 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -1,7 +1,7 @@
 module m_solver
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
-   use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y
+   use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
    use m_tdsops, only: tdsops_t, dirps_t
    use m_time_integrator, only: time_intg_t
 
@@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_d2d(u_y, u, TRP_X2Y)
-      call self%backend%trans_d2d(v_y, v, TRP_X2Y)
-      call self%backend%trans_d2d(w_y, w, TRP_X2Y)
+      call self%backend%reorder(u_y, u, RDR_X2Y)
+      call self%backend%reorder(v_y, v, RDR_X2Y)
+      call self%backend%reorder(w_y, w, RDR_X2Y)
 
       ! similar to the x direction, obtain derivatives in y.
       call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)
@@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w)
       dw_z => self%backend%allocator%get_block()
 
       ! reorder from x to z
-      call self%backend%trans_d2d(u_z, u, TRP_X2Z)
-      call self%backend%trans_d2d(v_z, v, TRP_X2Z)
-      call self%backend%trans_d2d(w_z, w, TRP_X2Z)
+      call self%backend%reorder(u_z, u, RDR_X2Z)
+      call self%backend%reorder(v_z, v, RDR_X2Z)
+      call self%backend%reorder(w_z, w, RDR_X2Z)
 
       ! get the derivatives in z
       call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)
@@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w)
       w_y => self%backend%allocator%get_block()
 
       ! reorder data from x orientation to y orientation
-      call self%backend%trans_d2d(u_y, du_x, TRP_X2Y)
-      call self%backend%trans_d2d(v_y, dv_x, TRP_X2Y)
-      call self%backend%trans_d2d(w_y, dw_x, TRP_X2Y)
+      call self%backend%reorder(u_y, du_x, RDR_X2Y)
+      call self%backend%reorder(v_y, dv_x, RDR_X2Y)
+      call self%backend%reorder(w_y, dw_x, RDR_X2Y)
 
       call self%backend%allocator%release_block(du_x)
       call self%backend%allocator%release_block(dv_x)
@@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w)
       call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y)
 
       ! reorder from y to z
-      call self%backend%trans_d2d(u_z, du_y, TRP_Y2Z)
-      call self%backend%trans_d2d(w_z, dw_y, TRP_Y2Z)
+      call self%backend%reorder(u_z, du_y, RDR_Y2Z)
+      call self%backend%reorder(w_z, dw_y, RDR_Y2Z)
 
       ! release all the unnecessary blocks.
       call self%backend%allocator%release_block(du_y)
@@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sxy_y => self%backend%allocator%get_block()
 
       ! reorder data from z orientation to y orientation
-      call self%backend%trans_d2d(p_sxy_y, p_sxy_z, TRP_Z2Y)
-      call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, TRP_Z2Y)
+      call self%backend%reorder(p_sxy_y, p_sxy_z, RDR_Z2Y)
+      call self%backend%reorder(dpdz_sxy_y, dpdz_sxy_z, RDR_Z2Y)
 
       call self%backend%allocator%release_block(p_sxy_z)
       call self%backend%allocator%release_block(dpdz_sxy_z)
@@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure)
       dpdz_sx_x => self%backend%allocator%get_block()
 
       ! reorder from y to x
-      call self%backend%trans_d2d(p_sx_x, p_sx_y, TRP_Y2X)
-      call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, TRP_Y2X)
-      call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, TRP_Y2X)
+      call self%backend%reorder(p_sx_x, p_sx_y, RDR_Y2X)
+      call self%backend%reorder(dpdy_sx_x, dpdy_sx_y, RDR_Y2X)
+      call self%backend%reorder(dpdz_sx_x, dpdz_sx_y, RDR_Y2X)
 
       ! release all the y directional fields.
       call self%backend%allocator%release_block(p_sx_y)

From 33ef9035077784ff997a35cc412bfd136b00cd80 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 5 Feb 2024 17:32:20 +0000
Subject: [PATCH 7/8] feat(cuda/tests): Add tests for all reordering kernels.

---
 tests/CMakeLists.txt             |   1 +
 tests/cuda/test_cuda_reorder.f90 | 228 +++++++++++++++++++++++++++++++
 2 files changed, 229 insertions(+)
 create mode 100644 tests/cuda/test_cuda_reorder.f90

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 968c33c74..808ada660 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TESTSRC
 )
 set(CUDATESTSRC
   cuda/test_cuda_allocator.f90
+  cuda/test_cuda_reorder.f90
   cuda/test_cuda_tridiag.f90
   cuda/test_cuda_transeq.f90
 )
diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
new file mode 100644
index 000000000..833b0774f
--- /dev/null
+++ b/tests/cuda/test_cuda_reorder.f90
@@ -0,0 +1,228 @@
+program test_cuda_reorder
+   use iso_fortran_env, only: stderr => error_unit
+   use cudafor
+   use mpi
+
+   use m_common, only: dp
+   use m_cuda_common, only: SZ
+   use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
+                                     reorder_y2z, reorder_z2y
+
+   implicit none
+
+   logical :: allpass = .true.
+   real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp
+   real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d
+
+   integer :: n_block, i, n_iters
+   integer :: nx, ny, nz, ndof
+   integer :: nrank, nproc, pprev, pnext
+   integer :: ierr, ndevs, devnum
+
+   type(dim3) :: blocks, threads
+   real(dp) :: norm_u, tol = 1d-8, tstart, tend
+
+   call MPI_Init(ierr)
+   call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
+   call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
+
+   if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
+
+   ierr = cudaGetDeviceCount(ndevs)
+   ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
+   ierr = cudaGetDevice(devnum)
+
+   !print*, 'I am rank', nrank, 'I am running on device', devnum
+   pnext = modulo(nrank - nproc + 1, nproc)
+   pprev = modulo(nrank - 1, nproc)
+
+   nx = 512; ny = 512; nz = 512
+   n_block = ny*nz/SZ
+   ndof = nx*ny*nz
+   n_iters = 100
+
+   allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block))
+   allocate (u_temp(SZ, nx, n_block))
+   allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block))
+   allocate (u_temp_d(SZ, nx, n_block))
+
+   ! set a random field
+   call random_number(u_i)
+
+   ! move data to device
+   u_i_d = u_i
+
+   ! do a x to y reordering first and then a y to x
+   blocks = dim3(nx/SZ, nz, ny/SZ)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_x2y<<<blocks, threads>>>(u_temp_d, u_i_d, nz)
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
+
+   ! move the result back to host
+   u_o = u_o_d
+
+   ! and check whether it matches the initial random field
+   norm_u = norm2(u_o - u_i)
+   if (nrank == 0) then
+      if ( norm_u > tol ) then
+         allpass = .false.
+         write(stderr, '(a)') 'Check reorder x2y and y2x... failed'
+      else
+         write(stderr, '(a)') 'Check reorder x2y and y2x... passed'
+      end if
+   end if
+
+   ! we reuse u_o_d so zeroize in any case
+   u_o_d = 0
+
+   ! u_temp_d is in y orientation, use y2z to reorder it into z direction
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2z<<<blocks, threads>>>(u_o_d, u_temp_d, nx, nz)
+
+   ! store this in host
+   u_o = u_o_d
+
+   ! and zeroize u_o_d again in any case
+   u_o_d = 0
+
+   ! reorder initial random field into z orientation
+   blocks = dim3(nx, ny/SZ, 1)
+   threads = dim3(SZ, 1, 1)
+   call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   u_temp = u_o_d
+
+   ! compare two z oriented fields
+   norm_u = norm2(u_o - u_temp)
+   if (nrank == 0) then
+      if ( norm_u > tol ) then
+         allpass = .false.
+         write(stderr, '(a)') 'Check reorder x2z and y2z... failed'
+      else
+         write(stderr, '(a)') 'Check reorder x2z and y2z... passed'
+      end if
+   end if
+
+   ! z oriented field into y
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_z2y<<<blocks, threads>>>(u_temp_d, u_o_d, nx, nz)
+
+   ! zeroize just in case for reusing
+   u_o_d = 0
+
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_y2x<<<blocks, threads>>>(u_o_d, u_temp_d, nz)
+   u_o = u_o_d
+
+   ! and check whether it matches the initial random field
+   norm_u = norm2(u_o - u_i)
+   if (nrank == 0) then
+      if ( norm_u > tol ) then
+         allpass = .false.
+         write(stderr, '(a)') 'Check reorder z2y and y2x... failed'
+      else
+         write(stderr, '(a)') 'Check reorder z2y and y2x... passed'
+      end if
+   end if
+
+   ! Now the performance checks
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      blocks = dim3(nx/SZ, nz, ny/SZ)
+      threads = dim3(SZ, SZ, 1)
+      call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      blocks = dim3(nx, ny/SZ, 1)
+      threads = dim3(SZ, 1, 1)
+      call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      blocks = dim3(nx/SZ, ny/SZ, nz)
+      threads = dim3(SZ, SZ, 1)
+      call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      blocks = dim3(nx/SZ, ny/SZ, nz)
+      threads = dim3(SZ, SZ, 1)
+      call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      blocks = dim3(nx/SZ, ny/SZ, nz)
+      threads = dim3(SZ, SZ, 1)
+      call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   if (allpass) then
+      if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
+   else
+      error stop 'SOME TESTS FAILED.'
+   end if
+
+   call MPI_Finalize(ierr)
+
+contains
+
+   subroutine checkperf(t_tot, n_iters, ndof, consumed_bw)
+      implicit none
+
+      real(dp), intent(in) :: t_tot, consumed_bw
+      integer, intent(in) :: n_iters, ndof
+
+      real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin
+      integer :: ierr, memClockRt, memBusWidth
+
+      ! BW utilisation and performance checks
+      achievedBW = consumed_bw*n_iters*ndof*dp/t_tot
+      call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, &
+                         MPI_MAX, MPI_COMM_WORLD, ierr)
+      call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, &
+                         MPI_MIN, MPI_COMM_WORLD, ierr)
+
+      if (nrank == 0) then
+         print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s'
+         print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s'
+      end if
+
+      ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0)
+      ierr = cudaDeviceGetAttribute(memBusWidth, &
+                                    cudaDevAttrGlobalMemoryBusWidth, 0)
+      devBW = 2*memBusWidth/8._dp*memClockRt*1000
+
+      if (nrank == 0) then
+         print'(a, f8.3, a)', 'Device BW:   ', devBW/2**30, ' GiB/s'
+         print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100
+         print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100
+      end if
+   end subroutine checkperf
+
+end program test_cuda_reorder
+

From 932df134af0b1ce66590f69043d517ec61dbadb5 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Tue, 6 Feb 2024 13:22:49 +0000
Subject: [PATCH 8/8] fix: Minor changes.

---
 src/cuda/backend.f90         | 5 ++---
 src/cuda/kernels_reorder.f90 | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 9eb7bcea2..07a68b00d 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -1,4 +1,5 @@
 module m_cuda_backend
+   use iso_fortran_env, only: stderr => error_unit
    use cudafor
 
    use m_allocator, only: allocator_t, field_t
@@ -452,12 +453,10 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
       case (RDR_Z2Y) ! z2y
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
-
          call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
       case default
-         print *, 'Transpose direction is undefined.'
-         stop
+         error stop 'Reorder direction is undefined.'
       end select
 
    end subroutine reorder_cuda
diff --git a/src/cuda/kernels_reorder.f90 b/src/cuda/kernels_reorder.f90
index f5f923216..76cd9369c 100644
--- a/src/cuda/kernels_reorder.f90
+++ b/src/cuda/kernels_reorder.f90
@@ -36,11 +36,13 @@ attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
       real(dp), device, intent(in), dimension(:, :, :) :: u_x
       integer, value, intent(in) :: nz
 
-      integer :: i, j, b_i, b_j, nx!, nz
+      integer :: i, j, b_i, b_j, nx
 
       i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
       nx = gridDim%x
 
+      ! Data access pattern for reordering between x and z is quite nice
+      ! thus we don't need to use shared memory for this operation.
       do j = 1, nz
          u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
       end do