From 4fb4845c3f291ba30344b3a0272940c6173139f9 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 29 Jan 2024 10:43:20 +0000 Subject: [PATCH 1/8] refactor: Make all transpose operations work on a single field. --- src/backend.f90 | 18 ++---------------- src/cuda/backend.f90 | 12 ++++++------ src/omp/backend.f90 | 12 ++++++------ src/solver.f90 | 14 +++++++++++--- 4 files changed, 25 insertions(+), 31 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index 7e75118bd..4d7c27330 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -28,8 +28,8 @@ module m_base_backend procedure(transeq_ders), deferred :: transeq_y procedure(transeq_ders), deferred :: transeq_z procedure(tds_solve), deferred :: tds_solve - procedure(transposer), deferred :: trans_x2y - procedure(transposer), deferred :: trans_x2z + procedure(trans_d2d), deferred :: trans_x2y + procedure(trans_d2d), deferred :: trans_x2z procedure(trans_d2d), deferred :: trans_y2z procedure(trans_d2d), deferred :: trans_z2y procedure(trans_d2d), deferred :: trans_y2x @@ -81,20 +81,6 @@ end subroutine tds_solve end interface abstract interface - subroutine transposer(self, u_, v_, w_, u, v, w) - !! transposer subroutines are straightforward, they rearrange - !! data into our specialist data structure so that regardless - !! of the direction tridiagonal systems are solved efficiently - !! and fast. - import :: base_backend_t - import :: field_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(inout) :: u_, v_, w_ - class(field_t), intent(in) :: u, v, w - end subroutine transposer - subroutine trans_d2d(self, u_, u) !! transposer subroutines are straightforward, they rearrange !! data into our specialist data structure so that regardless diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 721f6d16c..7cac4ff7f 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -415,21 +415,21 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) end subroutine tds_solve_dist - subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w) + subroutine trans_x2y_cuda(self, u_y, u) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_y, v_y, w_y - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: u_y + class(field_t), intent(in) :: u end subroutine trans_x2y_cuda - subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w) + subroutine trans_x2z_cuda(self, u_z, u) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_z, v_z, w_z - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: u_z + class(field_t), intent(in) :: u end subroutine trans_x2z_cuda diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 645137cb2..00f9a563f 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -164,21 +164,21 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) end subroutine tds_solve_omp - subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w) + subroutine trans_x2y_omp(self, u_, u) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_, v_, w_ - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: u_ + class(field_t), intent(in) :: u end subroutine trans_x2y_omp - subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w) + subroutine trans_x2z_omp(self, u_, u) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_, v_, w_ - class(field_t), intent(in) :: u, v, w + class(field_t), intent(inout) :: u_ + class(field_t), intent(in) :: u end subroutine trans_x2z_omp diff --git a/src/solver.f90 b/src/solver.f90 index 5eb1139c4..e7d42bb76 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -185,7 +185,10 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_x2y(u_y, v_y, w_y, u, v, w) + call self%backend%trans_x2y(u_y, u) + call self%backend%trans_x2y(v_y, v) + call self%backend%trans_x2y(w_y, w) + ! similar to the x direction, obtain derivatives in y. call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps) @@ -206,7 +209,10 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_z => self%backend%allocator%get_block() ! reorder from x to z - call self%backend%trans_x2z(u_z, v_z, w_z, u, v, w) + call self%backend%trans_x2z(u_z, u) + call self%backend%trans_x2z(v_z, v) + call self%backend%trans_x2z(w_z, w) + ! get the derivatives in z call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps) @@ -261,7 +267,9 @@ subroutine divergence(self, div_u, u, v, w) w_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_x2y(u_y, v_y, w_y, du_x, dv_x, dw_x) + call self%backend%trans_x2y(u_y, du_x) + call self%backend%trans_x2y(v_y, dv_x) + call self%backend%trans_x2y(w_y, dw_x) call self%backend%allocator%release_block(du_x) call self%backend%allocator%release_block(dv_x) From 6f5b6fa49243d7018fb8060dba59da0b1aa5d72f Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 29 Jan 2024 10:44:33 +0000 Subject: [PATCH 2/8] feat(cuda): Add transpose kernels. --- src/CMakeLists.txt | 1 + src/cuda/kernels_trans.f90 | 119 +++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 src/cuda/kernels_trans.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c1a1ede0b..93281d59a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,6 +17,7 @@ set(CUDASRC cuda/allocator.f90 cuda/exec_dist.f90 cuda/kernels_dist.f90 + cuda/kernels_trans.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) diff --git a/src/cuda/kernels_trans.f90 b/src/cuda/kernels_trans.f90 new file mode 100644 index 000000000..489dc0d2a --- /dev/null +++ b/src/cuda/kernels_trans.f90 @@ -0,0 +1,119 @@ +module m_cuda_kernels_trans + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + +contains + + attributes(global) subroutine trans_x2y_k(u_y, u_x, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_y + real(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz) + + call syncthreads() + + ! copy into output array from shared + u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i) + + end subroutine trans_x2y_k + + attributes(global) subroutine trans_x2z_k(u_z, u_x, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_z + real(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + integer :: i, j, b_i, b_j, nx!, nz + + i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y + nx = gridDim%x + + do j = 1, nz + u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) + end do + + end subroutine trans_x2z_k + + attributes(global) subroutine trans_y2x_k(u_x, u_y, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_x + real(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) + + call syncthreads() + + ! copy into output array from shared + u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i) + + end subroutine trans_y2x_k + + attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_z + real(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nx, nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) + + call syncthreads() + + ! copy into output array from shared + u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i) + + end subroutine trans_y2z_k + + attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_y + real(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nx, nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) + + call syncthreads() + + ! copy into output array from shared + u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i) + + end subroutine trans_z2y_k + +end module m_cuda_kernels_trans From ac5e306f7172a01aa452103c7c9de4cdeb547598 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 29 Jan 2024 10:45:11 +0000 Subject: [PATCH 3/8] feat(cuda): Enable CUDA backend to call transpose kernels. --- src/backend.f90 | 1 + src/cuda/backend.f90 | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) diff --git a/src/backend.f90 b/src/backend.f90 index 4d7c27330..93eb6bbd8 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -21,6 +21,7 @@ module m_base_backend !! architecture. real(dp) :: nu + integer :: nx_loc, ny_loc, nz_loc class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps contains diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 7cac4ff7f..212d9ce13 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -12,6 +12,8 @@ module m_cuda_backend use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs + use m_cuda_kernels_trans, only: trans_x2y_k, trans_x2z_k, trans_y2x_k, & + trans_y2z_k, trans_z2y_k implicit none @@ -74,6 +76,10 @@ function init(globs, allocator) result(backend) backend%zthreads = dim3(SZ, 1, 1) backend%zblocks = dim3(globs%n_groups_z, 1, 1) + backend%nx_loc = globs%nx_loc + backend%ny_loc = globs%ny_loc + backend%nz_loc = globs%nz_loc + n_halo = 4 n_block = globs%n_groups_x @@ -422,6 +428,17 @@ subroutine trans_x2y_cuda(self, u_y, u) class(field_t), intent(inout) :: u_y class(field_t), intent(in) :: u + real(dp), device, pointer, dimension(:, :, :) :: u_d, u_y_d + type(dim3) :: blocks, threads + + select type(u); type is (cuda_field_t); u_d => u%data_d; end select + select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select + + blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) + threads = dim3(SZ, SZ, 1) + + call trans_x2y_k<<>>(u_y_d, u_d, self%nz_loc) + end subroutine trans_x2y_cuda subroutine trans_x2z_cuda(self, u_z, u) @@ -431,6 +448,17 @@ subroutine trans_x2z_cuda(self, u_z, u) class(field_t), intent(inout) :: u_z class(field_t), intent(in) :: u + real(dp), device, pointer, dimension(:, :, :) :: u_d, u_z_d + type(dim3) :: blocks, threads + + select type(u); type is (cuda_field_t); u_d => u%data_d; end select + select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select + + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + + call trans_x2z_k<<>>(u_z_d, u_d, self%nz_loc) + end subroutine trans_x2z_cuda subroutine trans_y2z_cuda(self, u_z, u_y) @@ -440,6 +468,18 @@ subroutine trans_y2z_cuda(self, u_z, u_y) class(field_t), intent(inout) :: u_z class(field_t), intent(in) :: u_y + real(dp), device, pointer, dimension(:, :, :) :: u_z_d, u_y_d + type(dim3) :: blocks, threads + + select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select + select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + + call trans_y2z_k<<>>(u_z_d, u_y_d, & + self%nx_loc, self%nz_loc) + end subroutine trans_y2z_cuda subroutine trans_z2y_cuda(self, u_y, u_z) @@ -449,6 +489,18 @@ subroutine trans_z2y_cuda(self, u_y, u_z) class(field_t), intent(inout) :: u_y class(field_t), intent(in) :: u_z + real(dp), device, pointer, dimension(:, :, :) :: u_y_d, u_z_d + type(dim3) :: blocks, threads + + select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select + select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + + call trans_z2y_k<<>>(u_y_d, u_z_d, & + self%nx_loc, self%nz_loc) + end subroutine trans_z2y_cuda subroutine trans_y2x_cuda(self, u_x, u_y) @@ -458,6 +510,17 @@ subroutine trans_y2x_cuda(self, u_x, u_y) class(field_t), intent(inout) :: u_x class(field_t), intent(in) :: u_y + real(dp), device, pointer, dimension(:, :, :) :: u_x_d, u_y_d + type(dim3) :: blocks, threads + + select type(u_x); type is (cuda_field_t); u_x_d => u_x%data_d; end select + select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + + call trans_y2x_k<<>>(u_x_d, u_y_d, self%nz_loc) + end subroutine trans_y2x_cuda subroutine sum_yzintox_cuda(self, du, dv, dw, & From 1e920eb1bc78ebcd9629dfd2f72d9b7a0dbb8daa Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 30 Jan 2024 16:26:35 +0000 Subject: [PATCH 4/8] refactor: Merge all transpose functions. --- src/backend.f90 | 9 +-- src/cuda/backend.f90 | 137 ++++++++++++------------------------------- src/omp/backend.f90 | 47 ++------------- src/solver.f90 | 32 +++++----- 4 files changed, 61 insertions(+), 164 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index 93eb6bbd8..01b449276 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -29,11 +29,7 @@ module m_base_backend procedure(transeq_ders), deferred :: transeq_y procedure(transeq_ders), deferred :: transeq_z procedure(tds_solve), deferred :: tds_solve - procedure(trans_d2d), deferred :: trans_x2y - procedure(trans_d2d), deferred :: trans_x2z - procedure(trans_d2d), deferred :: trans_y2z - procedure(trans_d2d), deferred :: trans_z2y - procedure(trans_d2d), deferred :: trans_y2x + procedure(trans_d2d), deferred :: trans_d2d procedure(sum9into3), deferred :: sum_yzintox procedure(vecadd), deferred :: vecadd procedure(get_fields), deferred :: get_fields @@ -82,7 +78,7 @@ end subroutine tds_solve end interface abstract interface - subroutine trans_d2d(self, u_, u) + subroutine trans_d2d(self, u_, u, direction) !! transposer subroutines are straightforward, they rearrange !! data into our specialist data structure so that regardless !! of the direction tridiagonal systems are solved efficiently @@ -94,6 +90,7 @@ subroutine trans_d2d(self, u_, u) class(base_backend_t) :: self class(field_t), intent(inout) :: u_ class(field_t), intent(in) :: u + integer, intent(in) :: direction end subroutine trans_d2d end interface diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 212d9ce13..8cd1dcd72 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -34,11 +34,7 @@ module m_cuda_backend procedure :: transeq_y => transeq_y_cuda procedure :: transeq_z => transeq_z_cuda procedure :: tds_solve => tds_solve_cuda - procedure :: trans_x2y => trans_x2y_cuda - procedure :: trans_x2z => trans_x2z_cuda - procedure :: trans_y2z => trans_y2z_cuda - procedure :: trans_z2y => trans_z2y_cuda - procedure :: trans_y2x => trans_y2x_cuda + procedure :: trans_d2d => trans_d2d_cuda procedure :: sum_yzintox => sum_yzintox_cuda procedure :: vecadd => vecadd_cuda procedure :: set_fields => set_fields_cuda @@ -421,107 +417,50 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) end subroutine tds_solve_dist - subroutine trans_x2y_cuda(self, u_y, u) + subroutine trans_d2d_cuda(self, u_o, u_i, direction) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_y - class(field_t), intent(in) :: u - - real(dp), device, pointer, dimension(:, :, :) :: u_d, u_y_d - type(dim3) :: blocks, threads - - select type(u); type is (cuda_field_t); u_d => u%data_d; end select - select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select - - blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) - threads = dim3(SZ, SZ, 1) - - call trans_x2y_k<<>>(u_y_d, u_d, self%nz_loc) - - end subroutine trans_x2y_cuda - - subroutine trans_x2z_cuda(self, u_z, u) - implicit none - - class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_z - class(field_t), intent(in) :: u + class(field_t), intent(inout) :: u_o + class(field_t), intent(in) :: u_i + integer, intent(in) :: direction - real(dp), device, pointer, dimension(:, :, :) :: u_d, u_z_d + real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d type(dim3) :: blocks, threads - select type(u); type is (cuda_field_t); u_d => u%data_d; end select - select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select - - blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) - threads = dim3(SZ, 1, 1) - - call trans_x2z_k<<>>(u_z_d, u_d, self%nz_loc) - - end subroutine trans_x2z_cuda - - subroutine trans_y2z_cuda(self, u_z, u_y) - implicit none - - class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_z - class(field_t), intent(in) :: u_y - - real(dp), device, pointer, dimension(:, :, :) :: u_z_d, u_y_d - type(dim3) :: blocks, threads - - select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select - select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - - call trans_y2z_k<<>>(u_z_d, u_y_d, & - self%nx_loc, self%nz_loc) - - end subroutine trans_y2z_cuda - - subroutine trans_z2y_cuda(self, u_y, u_z) - implicit none - - class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_y - class(field_t), intent(in) :: u_z - - real(dp), device, pointer, dimension(:, :, :) :: u_y_d, u_z_d - type(dim3) :: blocks, threads - - select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select - select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - - call trans_z2y_k<<>>(u_y_d, u_z_d, & - self%nx_loc, self%nz_loc) - - end subroutine trans_z2y_cuda - - subroutine trans_y2x_cuda(self, u_x, u_y) - implicit none - - class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u_x - class(field_t), intent(in) :: u_y - - real(dp), device, pointer, dimension(:, :, :) :: u_x_d, u_y_d - type(dim3) :: blocks, threads - - select type(u_x); type is (cuda_field_t); u_x_d => u_x%data_d; end select - select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - - call trans_y2x_k<<>>(u_x_d, u_y_d, self%nz_loc) + select type(u_o); type is (cuda_field_t); u_o_d => u_o%data_d; end select + select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select + + select case (direction) + case (12) ! x2y + blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) + threads = dim3(SZ, SZ, 1) + call trans_x2y_k<<>>(u_o_d, u_i_d, self%nz_loc) + case (13) ! x2z + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call trans_x2z_k<<>>(u_o_d, u_i_d, self%nz_loc) + case (21) ! y2x + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call trans_y2x_k<<>>(u_o_d, u_i_d, self%nz_loc) + case (23) ! y2z + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call trans_y2z_k<<>>(u_o_d, u_i_d, & + self%nx_loc, self%nz_loc) + case (32) ! z2y + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + + call trans_z2y_k<<>>(u_o_d, u_i_d, & + self%nx_loc, self%nz_loc) + case default + print *, 'Transpose direction is undefined.' + stop + end select - end subroutine trans_y2x_cuda + end subroutine trans_d2d_cuda subroutine sum_yzintox_cuda(self, du, dv, dw, & du_y, dv_y, dw_y, du_z, dv_z, dw_z) diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 00f9a563f..8d9ed3e71 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -24,11 +24,7 @@ module m_omp_backend procedure :: transeq_y => transeq_y_omp procedure :: transeq_z => transeq_z_omp procedure :: tds_solve => tds_solve_omp - procedure :: trans_x2y => trans_x2y_omp - procedure :: trans_x2z => trans_x2z_omp - procedure :: trans_y2z => trans_y2z_omp - procedure :: trans_z2y => trans_z2y_omp - procedure :: trans_y2x => trans_y2x_omp + procedure :: trans_d2d => trans_d2d_omp procedure :: sum_yzintox => sum_yzintox_omp procedure :: vecadd => vecadd_omp procedure :: set_fields => set_fields_omp @@ -164,50 +160,15 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) end subroutine tds_solve_omp - subroutine trans_x2y_omp(self, u_, u) + subroutine trans_d2d_omp(self, u_, u, direction) implicit none class(omp_backend_t) :: self class(field_t), intent(inout) :: u_ class(field_t), intent(in) :: u + integer, intent(in) :: direction - end subroutine trans_x2y_omp - - subroutine trans_x2z_omp(self, u_, u) - implicit none - - class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - - end subroutine trans_x2z_omp - - subroutine trans_y2z_omp(self, u_, u) - implicit none - - class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - - end subroutine trans_y2z_omp - - subroutine trans_z2y_omp(self, u_, u) - implicit none - - class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - - end subroutine trans_z2y_omp - - subroutine trans_y2x_omp(self, u_, u) - implicit none - - class(omp_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - - end subroutine trans_y2x_omp + end subroutine trans_d2d_omp subroutine sum_yzintox_omp(self, du, dv, dw, & du_y, dv_y, dw_y, du_z, dv_z, dw_z) diff --git a/src/solver.f90 b/src/solver.f90 index e7d42bb76..92e7e17f3 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_x2y(u_y, u) - call self%backend%trans_x2y(v_y, v) - call self%backend%trans_x2y(w_y, w) + call self%backend%trans_d2d(u_y, u, 12) + call self%backend%trans_d2d(v_y, v, 12) + call self%backend%trans_d2d(w_y, w, 12) ! similar to the x direction, obtain derivatives in y. call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps) @@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_z => self%backend%allocator%get_block() ! reorder from x to z - call self%backend%trans_x2z(u_z, u) - call self%backend%trans_x2z(v_z, v) - call self%backend%trans_x2z(w_z, w) + call self%backend%trans_d2d(u_z, u, 13) + call self%backend%trans_d2d(v_z, v, 13) + call self%backend%trans_d2d(w_z, w, 13) ! get the derivatives in z call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps) @@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w) w_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_x2y(u_y, du_x) - call self%backend%trans_x2y(v_y, dv_x) - call self%backend%trans_x2y(w_y, dw_x) + call self%backend%trans_d2d(u_y, du_x, 12) + call self%backend%trans_d2d(v_y, dv_x, 12) + call self%backend%trans_d2d(w_y, dw_x, 12) call self%backend%allocator%release_block(du_x) call self%backend%allocator%release_block(dv_x) @@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w) call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y) ! reorder from y to z - call self%backend%trans_y2z(u_z, du_y) - call self%backend%trans_y2z(w_z, dw_y) + call self%backend%trans_d2d(u_z, du_y, 23) + call self%backend%trans_d2d(w_z, dw_y, 23) ! release all the unnecessary blocks. call self%backend%allocator%release_block(du_y) @@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sxy_y => self%backend%allocator%get_block() ! reorder data from z orientation to y orientation - call self%backend%trans_z2y(p_sxy_y, p_sxy_z) - call self%backend%trans_z2y(dpdz_sxy_y, dpdz_sxy_z) + call self%backend%trans_d2d(p_sxy_y, p_sxy_z, 32) + call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, 32) call self%backend%allocator%release_block(p_sxy_z) call self%backend%allocator%release_block(dpdz_sxy_z) @@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sx_x => self%backend%allocator%get_block() ! reorder from y to x - call self%backend%trans_y2x(p_sx_x, p_sx_y) - call self%backend%trans_y2x(dpdy_sx_x, dpdy_sx_y) - call self%backend%trans_y2x(dpdz_sx_x, dpdz_sx_y) + call self%backend%trans_d2d(p_sx_x, p_sx_y, 21) + call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, 21) + call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, 21) ! release all the y directional fields. call self%backend%allocator%release_block(p_sx_y) From 2f22c9f620fece82ddc81864b919e6b740663f19 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 30 Jan 2024 16:41:27 +0000 Subject: [PATCH 5/8] feat: Use an integer parameter for transpose direction. --- src/common.f90 | 3 +++ src/cuda/backend.f90 | 12 ++++++------ src/solver.f90 | 34 +++++++++++++++++----------------- 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 4aad6f7c5..809c7bf2d 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -4,6 +4,9 @@ module m_common integer, parameter :: dp=kind(0.0d0) real(dp), parameter :: pi = 4*atan(1.0_dp) + integer, parameter :: TRP_X2Y = 12, TRP_X2Z = 13, TRP_Y2X = 21, & + TRP_Y2Z = 23, TRP_Z2Y = 32 + type :: globs_t integer :: nx, ny, nz integer :: nx_loc, ny_loc, nz_loc diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 8cd1dcd72..d0651f70d 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -3,7 +3,7 @@ module m_cuda_backend use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t + use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t @@ -432,24 +432,24 @@ subroutine trans_d2d_cuda(self, u_o, u_i, direction) select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select select case (direction) - case (12) ! x2y + case (TRP_X2Y) ! x2y blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) threads = dim3(SZ, SZ, 1) call trans_x2y_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (13) ! x2z + case (TRP_X2Z) ! x2z blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) threads = dim3(SZ, 1, 1) call trans_x2z_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (21) ! y2x + case (TRP_Y2X) ! y2x blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call trans_y2x_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (23) ! y2z + case (TRP_Y2Z) ! y2z blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call trans_y2z_k<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) - case (32) ! z2y + case (TRP_Z2Y) ! z2y blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) diff --git a/src/solver.f90 b/src/solver.f90 index 92e7e17f3..67e520453 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,7 +1,7 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t + use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_d2d(u_y, u, 12) - call self%backend%trans_d2d(v_y, v, 12) - call self%backend%trans_d2d(w_y, w, 12) + call self%backend%trans_d2d(u_y, u, TRP_X2Y) + call self%backend%trans_d2d(v_y, v, TRP_X2Y) + call self%backend%trans_d2d(w_y, w, TRP_X2Y) ! similar to the x direction, obtain derivatives in y. call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps) @@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_z => self%backend%allocator%get_block() ! reorder from x to z - call self%backend%trans_d2d(u_z, u, 13) - call self%backend%trans_d2d(v_z, v, 13) - call self%backend%trans_d2d(w_z, w, 13) + call self%backend%trans_d2d(u_z, u, TRP_X2Z) + call self%backend%trans_d2d(v_z, v, TRP_X2Z) + call self%backend%trans_d2d(w_z, w, TRP_X2Z) ! get the derivatives in z call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps) @@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w) w_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_d2d(u_y, du_x, 12) - call self%backend%trans_d2d(v_y, dv_x, 12) - call self%backend%trans_d2d(w_y, dw_x, 12) + call self%backend%trans_d2d(u_y, du_x, TRP_X2Y) + call self%backend%trans_d2d(v_y, dv_x, TRP_X2Y) + call self%backend%trans_d2d(w_y, dw_x, TRP_X2Y) call self%backend%allocator%release_block(du_x) call self%backend%allocator%release_block(dv_x) @@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w) call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y) ! reorder from y to z - call self%backend%trans_d2d(u_z, du_y, 23) - call self%backend%trans_d2d(w_z, dw_y, 23) + call self%backend%trans_d2d(u_z, du_y, TRP_Y2Z) + call self%backend%trans_d2d(w_z, dw_y, TRP_Y2Z) ! release all the unnecessary blocks. call self%backend%allocator%release_block(du_y) @@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sxy_y => self%backend%allocator%get_block() ! reorder data from z orientation to y orientation - call self%backend%trans_d2d(p_sxy_y, p_sxy_z, 32) - call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, 32) + call self%backend%trans_d2d(p_sxy_y, p_sxy_z, TRP_Z2Y) + call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, TRP_Z2Y) call self%backend%allocator%release_block(p_sxy_z) call self%backend%allocator%release_block(dpdz_sxy_z) @@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sx_x => self%backend%allocator%get_block() ! reorder from y to x - call self%backend%trans_d2d(p_sx_x, p_sx_y, 21) - call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, 21) - call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, 21) + call self%backend%trans_d2d(p_sx_x, p_sx_y, TRP_Y2X) + call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, TRP_Y2X) + call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, TRP_Y2X) ! release all the y directional fields. call self%backend%allocator%release_block(p_sx_y) From 5f5f1293e2c254bef77b00f8e705f5e9edc2e1cf Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 30 Jan 2024 17:02:15 +0000 Subject: [PATCH 6/8] refactor: Rename trans/transpose -> reorder. --- src/CMakeLists.txt | 2 +- src/backend.f90 | 8 ++--- src/common.f90 | 4 +-- src/cuda/backend.f90 | 32 ++++++++--------- ...{kernels_trans.f90 => kernels_reorder.f90} | 24 ++++++------- src/omp/backend.f90 | 6 ++-- src/solver.f90 | 34 +++++++++---------- 7 files changed, 55 insertions(+), 55 deletions(-) rename src/cuda/{kernels_trans.f90 => kernels_reorder.f90} (84%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 93281d59a..88a9480b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,7 +17,7 @@ set(CUDASRC cuda/allocator.f90 cuda/exec_dist.f90 cuda/kernels_dist.f90 - cuda/kernels_trans.f90 + cuda/kernels_reorder.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) diff --git a/src/backend.f90 b/src/backend.f90 index 01b449276..bf7c240cc 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -29,7 +29,7 @@ module m_base_backend procedure(transeq_ders), deferred :: transeq_y procedure(transeq_ders), deferred :: transeq_z procedure(tds_solve), deferred :: tds_solve - procedure(trans_d2d), deferred :: trans_d2d + procedure(reorder), deferred :: reorder procedure(sum9into3), deferred :: sum_yzintox procedure(vecadd), deferred :: vecadd procedure(get_fields), deferred :: get_fields @@ -78,8 +78,8 @@ end subroutine tds_solve end interface abstract interface - subroutine trans_d2d(self, u_, u, direction) - !! transposer subroutines are straightforward, they rearrange + subroutine reorder(self, u_, u, direction) + !! reorder subroutines are straightforward, they rearrange !! data into our specialist data structure so that regardless !! of the direction tridiagonal systems are solved efficiently !! and fast. @@ -91,7 +91,7 @@ subroutine trans_d2d(self, u_, u, direction) class(field_t), intent(inout) :: u_ class(field_t), intent(in) :: u integer, intent(in) :: direction - end subroutine trans_d2d + end subroutine reorder end interface abstract interface diff --git a/src/common.f90 b/src/common.f90 index 809c7bf2d..6712b356c 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -4,8 +4,8 @@ module m_common integer, parameter :: dp=kind(0.0d0) real(dp), parameter :: pi = 4*atan(1.0_dp) - integer, parameter :: TRP_X2Y = 12, TRP_X2Z = 13, TRP_Y2X = 21, & - TRP_Y2Z = 23, TRP_Z2Y = 32 + integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & + RDR_Y2Z = 23, RDR_Z2Y = 32 type :: globs_t integer :: nx, ny, nz diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index d0651f70d..9eb7bcea2 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -3,7 +3,7 @@ module m_cuda_backend use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y + use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t @@ -12,8 +12,8 @@ module m_cuda_backend use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs - use m_cuda_kernels_trans, only: trans_x2y_k, trans_x2z_k, trans_y2x_k, & - trans_y2z_k, trans_z2y_k + use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & + reorder_y2z, reorder_z2y implicit none @@ -34,7 +34,7 @@ module m_cuda_backend procedure :: transeq_y => transeq_y_cuda procedure :: transeq_z => transeq_z_cuda procedure :: tds_solve => tds_solve_cuda - procedure :: trans_d2d => trans_d2d_cuda + procedure :: reorder => reorder_cuda procedure :: sum_yzintox => sum_yzintox_cuda procedure :: vecadd => vecadd_cuda procedure :: set_fields => set_fields_cuda @@ -417,7 +417,7 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) end subroutine tds_solve_dist - subroutine trans_d2d_cuda(self, u_o, u_i, direction) + subroutine reorder_cuda(self, u_o, u_i, direction) implicit none class(cuda_backend_t) :: self @@ -432,35 +432,35 @@ subroutine trans_d2d_cuda(self, u_o, u_i, direction) select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select select case (direction) - case (TRP_X2Y) ! x2y + case (RDR_X2Y) ! x2y blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) threads = dim3(SZ, SZ, 1) - call trans_x2y_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (TRP_X2Z) ! x2z + call reorder_x2y<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_X2Z) ! x2z blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) threads = dim3(SZ, 1, 1) - call trans_x2z_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (TRP_Y2X) ! y2x + call reorder_x2z<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2X) ! y2x blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) - call trans_y2x_k<<>>(u_o_d, u_i_d, self%nz_loc) - case (TRP_Y2Z) ! y2z + call reorder_y2x<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2Z) ! y2z blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) - call trans_y2z_k<<>>(u_o_d, u_i_d, & + call reorder_y2z<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) - case (TRP_Z2Y) ! z2y + case (RDR_Z2Y) ! z2y blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) - call trans_z2y_k<<>>(u_o_d, u_i_d, & + call reorder_z2y<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) case default print *, 'Transpose direction is undefined.' stop end select - end subroutine trans_d2d_cuda + end subroutine reorder_cuda subroutine sum_yzintox_cuda(self, du, dv, dw, & du_y, dv_y, dw_y, du_z, dv_z, dw_z) diff --git a/src/cuda/kernels_trans.f90 b/src/cuda/kernels_reorder.f90 similarity index 84% rename from src/cuda/kernels_trans.f90 rename to src/cuda/kernels_reorder.f90 index 489dc0d2a..f5f923216 100644 --- a/src/cuda/kernels_trans.f90 +++ b/src/cuda/kernels_reorder.f90 @@ -1,4 +1,4 @@ -module m_cuda_kernels_trans +module m_cuda_kernels_reorder use cudafor use m_common, only: dp @@ -6,7 +6,7 @@ module m_cuda_kernels_trans contains - attributes(global) subroutine trans_x2y_k(u_y, u_x, nz) + attributes(global) subroutine reorder_x2y(u_y, u_x, nz) implicit none real(dp), device, intent(out), dimension(:, :, :) :: u_y @@ -27,9 +27,9 @@ attributes(global) subroutine trans_x2y_k(u_y, u_x, nz) ! copy into output array from shared u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i) - end subroutine trans_x2y_k + end subroutine reorder_x2y - attributes(global) subroutine trans_x2z_k(u_z, u_x, nz) + attributes(global) subroutine reorder_x2z(u_z, u_x, nz) implicit none real(dp), device, intent(out), dimension(:, :, :) :: u_z @@ -45,9 +45,9 @@ attributes(global) subroutine trans_x2z_k(u_z, u_x, nz) u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) end do - end subroutine trans_x2z_k + end subroutine reorder_x2z - attributes(global) subroutine trans_y2x_k(u_x, u_y, nz) + attributes(global) subroutine reorder_y2x(u_x, u_y, nz) implicit none real(dp), device, intent(out), dimension(:, :, :) :: u_x @@ -68,9 +68,9 @@ attributes(global) subroutine trans_y2x_k(u_x, u_y, nz) ! copy into output array from shared u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i) - end subroutine trans_y2x_k + end subroutine reorder_y2x - attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz) + attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) implicit none real(dp), device, intent(out), dimension(:, :, :) :: u_z @@ -91,9 +91,9 @@ attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz) ! copy into output array from shared u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i) - end subroutine trans_y2z_k + end subroutine reorder_y2z - attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz) + attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) implicit none real(dp), device, intent(out), dimension(:, :, :) :: u_y @@ -114,6 +114,6 @@ attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz) ! copy into output array from shared u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i) - end subroutine trans_z2y_k + end subroutine reorder_z2y -end module m_cuda_kernels_trans +end module m_cuda_kernels_reorder diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 8d9ed3e71..297ba9196 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -24,7 +24,7 @@ module m_omp_backend procedure :: transeq_y => transeq_y_omp procedure :: transeq_z => transeq_z_omp procedure :: tds_solve => tds_solve_omp - procedure :: trans_d2d => trans_d2d_omp + procedure :: reorder => reorder_omp procedure :: sum_yzintox => sum_yzintox_omp procedure :: vecadd => vecadd_omp procedure :: set_fields => set_fields_omp @@ -160,7 +160,7 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) end subroutine tds_solve_omp - subroutine trans_d2d_omp(self, u_, u, direction) + subroutine reorder_omp(self, u_, u, direction) implicit none class(omp_backend_t) :: self @@ -168,7 +168,7 @@ subroutine trans_d2d_omp(self, u_, u, direction) class(field_t), intent(in) :: u integer, intent(in) :: direction - end subroutine trans_d2d_omp + end subroutine reorder_omp subroutine sum_yzintox_omp(self, du, dv, dw, & du_y, dv_y, dw_y, du_z, dv_z, dw_z) diff --git a/src/solver.f90 b/src/solver.f90 index 67e520453..1ecea22c8 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,7 +1,7 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, TRP_X2Y, TRP_X2Z, TRP_Y2X, TRP_Y2Z, TRP_Z2Y + use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -185,9 +185,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_d2d(u_y, u, TRP_X2Y) - call self%backend%trans_d2d(v_y, v, TRP_X2Y) - call self%backend%trans_d2d(w_y, w, TRP_X2Y) + call self%backend%reorder(u_y, u, RDR_X2Y) + call self%backend%reorder(v_y, v, RDR_X2Y) + call self%backend%reorder(w_y, w, RDR_X2Y) ! similar to the x direction, obtain derivatives in y. call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps) @@ -209,9 +209,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) dw_z => self%backend%allocator%get_block() ! reorder from x to z - call self%backend%trans_d2d(u_z, u, TRP_X2Z) - call self%backend%trans_d2d(v_z, v, TRP_X2Z) - call self%backend%trans_d2d(w_z, w, TRP_X2Z) + call self%backend%reorder(u_z, u, RDR_X2Z) + call self%backend%reorder(v_z, v, RDR_X2Z) + call self%backend%reorder(w_z, w, RDR_X2Z) ! get the derivatives in z call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps) @@ -267,9 +267,9 @@ subroutine divergence(self, div_u, u, v, w) w_y => self%backend%allocator%get_block() ! reorder data from x orientation to y orientation - call self%backend%trans_d2d(u_y, du_x, TRP_X2Y) - call self%backend%trans_d2d(v_y, dv_x, TRP_X2Y) - call self%backend%trans_d2d(w_y, dw_x, TRP_X2Y) + call self%backend%reorder(u_y, du_x, RDR_X2Y) + call self%backend%reorder(v_y, dv_x, RDR_X2Y) + call self%backend%reorder(w_y, dw_x, RDR_X2Y) call self%backend%allocator%release_block(du_x) call self%backend%allocator%release_block(dv_x) @@ -303,8 +303,8 @@ subroutine divergence(self, div_u, u, v, w) call self%backend%vecadd(1._dp, dw_y, 1._dp, dv_y) ! reorder from y to z - call self%backend%trans_d2d(u_z, du_y, TRP_Y2Z) - call self%backend%trans_d2d(w_z, dw_y, TRP_Y2Z) + call self%backend%reorder(u_z, du_y, RDR_Y2Z) + call self%backend%reorder(w_z, dw_y, RDR_Y2Z) ! release all the unnecessary blocks. call self%backend%allocator%release_block(du_y) @@ -358,8 +358,8 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sxy_y => self%backend%allocator%get_block() ! reorder data from z orientation to y orientation - call self%backend%trans_d2d(p_sxy_y, p_sxy_z, TRP_Z2Y) - call self%backend%trans_d2d(dpdz_sxy_y, dpdz_sxy_z, TRP_Z2Y) + call self%backend%reorder(p_sxy_y, p_sxy_z, RDR_Z2Y) + call self%backend%reorder(dpdz_sxy_y, dpdz_sxy_z, RDR_Z2Y) call self%backend%allocator%release_block(p_sxy_z) call self%backend%allocator%release_block(dpdz_sxy_z) @@ -386,9 +386,9 @@ subroutine gradient(self, dpdx, dpdy, dpdz, pressure) dpdz_sx_x => self%backend%allocator%get_block() ! reorder from y to x - call self%backend%trans_d2d(p_sx_x, p_sx_y, TRP_Y2X) - call self%backend%trans_d2d(dpdy_sx_x, dpdy_sx_y, TRP_Y2X) - call self%backend%trans_d2d(dpdz_sx_x, dpdz_sx_y, TRP_Y2X) + call self%backend%reorder(p_sx_x, p_sx_y, RDR_Y2X) + call self%backend%reorder(dpdy_sx_x, dpdy_sx_y, RDR_Y2X) + call self%backend%reorder(dpdz_sx_x, dpdz_sx_y, RDR_Y2X) ! release all the y directional fields. call self%backend%allocator%release_block(p_sx_y) From 33ef9035077784ff997a35cc412bfd136b00cd80 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 5 Feb 2024 17:32:20 +0000 Subject: [PATCH 7/8] feat(cuda/tests): Add tests for all reordering kernels. --- tests/CMakeLists.txt | 1 + tests/cuda/test_cuda_reorder.f90 | 228 +++++++++++++++++++++++++++++++ 2 files changed, 229 insertions(+) create mode 100644 tests/cuda/test_cuda_reorder.f90 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 968c33c74..808ada660 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -4,6 +4,7 @@ set(TESTSRC ) set(CUDATESTSRC cuda/test_cuda_allocator.f90 + cuda/test_cuda_reorder.f90 cuda/test_cuda_tridiag.f90 cuda/test_cuda_transeq.f90 ) diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 new file mode 100644 index 000000000..833b0774f --- /dev/null +++ b/tests/cuda/test_cuda_reorder.f90 @@ -0,0 +1,228 @@ +program test_cuda_reorder + use iso_fortran_env, only: stderr => error_unit + use cudafor + use mpi + + use m_common, only: dp + use m_cuda_common, only: SZ + use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & + reorder_y2z, reorder_z2y + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp + real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d + + integer :: n_block, i, n_iters + integer :: nx, ny, nz, ndof + integer :: nrank, nproc, pprev, pnext + integer :: ierr, ndevs, devnum + + type(dim3) :: blocks, threads + real(dp) :: norm_u, tol = 1d-8, tstart, tend + + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + + ierr = cudaGetDeviceCount(ndevs) + ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin + ierr = cudaGetDevice(devnum) + + !print*, 'I am rank', nrank, 'I am running on device', devnum + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) + + nx = 512; ny = 512; nz = 512 + n_block = ny*nz/SZ + ndof = nx*ny*nz + n_iters = 100 + + allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block)) + allocate (u_temp(SZ, nx, n_block)) + allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block)) + allocate (u_temp_d(SZ, nx, n_block)) + + ! set a random field + call random_number(u_i) + + ! move data to device + u_i_d = u_i + + ! do a x to y reordering first and then a y to x + blocks = dim3(nx/SZ, nz, ny/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_temp_d, u_i_d, nz) + + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_o_d, u_temp_d, nz) + + ! move the result back to host + u_o = u_o_d + + ! and check whether it matches the initial random field + norm_u = norm2(u_o - u_i) + if (nrank == 0) then + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2y and y2x... failed' + else + write(stderr, '(a)') 'Check reorder x2y and y2x... passed' + end if + end if + + ! we reuse u_o_d so zeroize in any case + u_o_d = 0 + + ! u_temp_d is in y orientation, use y2z to reorder it into z direction + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2z<<>>(u_o_d, u_temp_d, nx, nz) + + ! store this in host + u_o = u_o_d + + ! and zeroize u_o_d again in any case + u_o_d = 0 + + ! reorder initial random field into z orientation + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, nz) + u_temp = u_o_d + + ! compare two z oriented fields + norm_u = norm2(u_o - u_temp) + if (nrank == 0) then + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2z and y2z... failed' + else + write(stderr, '(a)') 'Check reorder x2z and y2z... passed' + end if + end if + + ! z oriented field into y + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) + + ! zeroize just in case for reusing + u_o_d = 0 + + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_o_d, u_temp_d, nz) + u_o = u_o_d + + ! and check whether it matches the initial random field + norm_u = norm2(u_o - u_i) + if (nrank == 0) then + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder z2y and y2x... failed' + else + write(stderr, '(a)') 'Check reorder z2y and y2x... passed' + end if + end if + + ! Now the performance checks + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx/SZ, nz, ny/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_o_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_o_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2z<<>>(u_o_d, u_i_d, nx, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + call cpu_time(tstart) + do i = 1, n_iters + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_o_d, u_i_d, nx, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + if (allpass) then + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + + call MPI_Finalize(ierr) + +contains + + subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) + implicit none + + real(dp), intent(in) :: t_tot, consumed_bw + integer, intent(in) :: n_iters, ndof + + real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin + integer :: ierr, memClockRt, memBusWidth + + ! BW utilisation and performance checks + achievedBW = consumed_bw*n_iters*ndof*dp/t_tot + call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & + MPI_MAX, MPI_COMM_WORLD, ierr) + call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & + MPI_MIN, MPI_COMM_WORLD, ierr) + + if (nrank == 0) then + print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s' + print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s' + end if + + ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) + ierr = cudaDeviceGetAttribute(memBusWidth, & + cudaDevAttrGlobalMemoryBusWidth, 0) + devBW = 2*memBusWidth/8._dp*memClockRt*1000 + + if (nrank == 0) then + print'(a, f8.3, a)', 'Device BW: ', devBW/2**30, ' GiB/s' + print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100 + print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100 + end if + end subroutine checkperf + +end program test_cuda_reorder + From 932df134af0b1ce66590f69043d517ec61dbadb5 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 6 Feb 2024 13:22:49 +0000 Subject: [PATCH 8/8] fix: Minor changes. --- src/cuda/backend.f90 | 5 ++--- src/cuda/kernels_reorder.f90 | 4 +++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 9eb7bcea2..07a68b00d 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,4 +1,5 @@ module m_cuda_backend + use iso_fortran_env, only: stderr => error_unit use cudafor use m_allocator, only: allocator_t, field_t @@ -452,12 +453,10 @@ subroutine reorder_cuda(self, u_o, u_i, direction) case (RDR_Z2Y) ! z2y blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) - call reorder_z2y<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) case default - print *, 'Transpose direction is undefined.' - stop + error stop 'Reorder direction is undefined.' end select end subroutine reorder_cuda diff --git a/src/cuda/kernels_reorder.f90 b/src/cuda/kernels_reorder.f90 index f5f923216..76cd9369c 100644 --- a/src/cuda/kernels_reorder.f90 +++ b/src/cuda/kernels_reorder.f90 @@ -36,11 +36,13 @@ attributes(global) subroutine reorder_x2z(u_z, u_x, nz) real(dp), device, intent(in), dimension(:, :, :) :: u_x integer, value, intent(in) :: nz - integer :: i, j, b_i, b_j, nx!, nz + integer :: i, j, b_i, b_j, nx i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y nx = gridDim%x + ! Data access pattern for reordering between x and z is quite nice + ! thus we don't need to use shared memory for this operation. do j = 1, nz u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) end do