Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add transposer kernels for reordering field data. #29

Merged
merged 8 commits into from
Feb 6, 2024
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ set(CUDASRC
cuda/allocator.f90
cuda/exec_dist.f90
cuda/kernels_dist.f90
cuda/kernels_reorder.f90
cuda/sendrecv.f90
cuda/tdsops.f90
)
Expand Down
28 changes: 6 additions & 22 deletions src/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,15 @@ module m_base_backend
!! architecture.

real(dp) :: nu
integer :: nx_loc, ny_loc, nz_loc
class(allocator_t), pointer :: allocator
class(dirps_t), pointer :: xdirps, ydirps, zdirps
contains
procedure(transeq_ders), deferred :: transeq_x
procedure(transeq_ders), deferred :: transeq_y
procedure(transeq_ders), deferred :: transeq_z
procedure(tds_solve), deferred :: tds_solve
procedure(transposer), deferred :: trans_x2y
procedure(transposer), deferred :: trans_x2z
procedure(trans_d2d), deferred :: trans_y2z
procedure(trans_d2d), deferred :: trans_z2y
procedure(trans_d2d), deferred :: trans_y2x
procedure(reorder), deferred :: reorder
procedure(sum9into3), deferred :: sum_yzintox
procedure(vecadd), deferred :: vecadd
procedure(get_fields), deferred :: get_fields
Expand Down Expand Up @@ -81,22 +78,8 @@ end subroutine tds_solve
end interface

abstract interface
subroutine transposer(self, u_, v_, w_, u, v, w)
!! transposer subroutines are straightforward, they rearrange
!! data into our specialist data structure so that regardless
!! of the direction tridiagonal systems are solved efficiently
!! and fast.
import :: base_backend_t
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w
end subroutine transposer

subroutine trans_d2d(self, u_, u)
!! transposer subroutines are straightforward, they rearrange
subroutine reorder(self, u_, u, direction)
!! reorder subroutines are straightforward, they rearrange
!! data into our specialist data structure so that regardless
!! of the direction tridiagonal systems are solved efficiently
!! and fast.
Expand All @@ -107,7 +90,8 @@ subroutine trans_d2d(self, u_, u)
class(base_backend_t) :: self
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u
end subroutine trans_d2d
integer, intent(in) :: direction
end subroutine reorder
end interface

abstract interface
Expand Down
3 changes: 3 additions & 0 deletions src/common.f90
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ module m_common
integer, parameter :: dp=kind(0.0d0)
real(dp), parameter :: pi = 4*atan(1.0_dp)

integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
RDR_Y2Z = 23, RDR_Z2Y = 32
semi-h marked this conversation as resolved.
Show resolved Hide resolved

type :: globs_t
integer :: nx, ny, nz
integer :: nx_loc, ny_loc, nz_loc
Expand Down
90 changes: 46 additions & 44 deletions src/cuda/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module m_cuda_backend

use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, globs_t
use m_common, only: dp, globs_t, RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2Y
use m_tdsops, only: dirps_t, tdsops_t

use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t
Expand All @@ -12,6 +12,8 @@ module m_cuda_backend
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
use m_cuda_tdsops, only: cuda_tdsops_t
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
reorder_y2z, reorder_z2y

implicit none

Expand All @@ -32,11 +34,7 @@ module m_cuda_backend
procedure :: transeq_y => transeq_y_cuda
procedure :: transeq_z => transeq_z_cuda
procedure :: tds_solve => tds_solve_cuda
procedure :: trans_x2y => trans_x2y_cuda
procedure :: trans_x2z => trans_x2z_cuda
procedure :: trans_y2z => trans_y2z_cuda
procedure :: trans_z2y => trans_z2y_cuda
procedure :: trans_y2x => trans_y2x_cuda
procedure :: reorder => reorder_cuda
procedure :: sum_yzintox => sum_yzintox_cuda
procedure :: vecadd => vecadd_cuda
procedure :: set_fields => set_fields_cuda
Expand Down Expand Up @@ -74,6 +72,10 @@ function init(globs, allocator) result(backend)
backend%zthreads = dim3(SZ, 1, 1)
backend%zblocks = dim3(globs%n_groups_z, 1, 1)

backend%nx_loc = globs%nx_loc
backend%ny_loc = globs%ny_loc
backend%nz_loc = globs%nz_loc

n_halo = 4
n_block = globs%n_groups_x

Expand Down Expand Up @@ -415,50 +417,50 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)

end subroutine tds_solve_dist

subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_y, v_y, w_y
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2y_cuda

subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
subroutine reorder_cuda(self, u_o, u_i, direction)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_z, v_z, w_z
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2z_cuda
class(field_t), intent(inout) :: u_o
class(field_t), intent(in) :: u_i
integer, intent(in) :: direction

subroutine trans_y2z_cuda(self, u_z, u_y)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_z
class(field_t), intent(in) :: u_y

end subroutine trans_y2z_cuda

subroutine trans_z2y_cuda(self, u_y, u_z)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_y
class(field_t), intent(in) :: u_z

end subroutine trans_z2y_cuda

subroutine trans_y2x_cuda(self, u_x, u_y)
implicit none
real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
type(dim3) :: blocks, threads

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_x
class(field_t), intent(in) :: u_y
select type(u_o); type is (cuda_field_t); u_o_d => u_o%data_d; end select
select type(u_i); type is (cuda_field_t); u_i_d => u_i%data_d; end select

select case (direction)
case (RDR_X2Y) ! x2y
blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
threads = dim3(SZ, SZ, 1)
call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_X2Z) ! x2z
blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Y2X) ! y2x
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Y2Z) ! y2z
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
self%nx_loc, self%nz_loc)
case (RDR_Z2Y) ! z2y
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)

call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
self%nx_loc, self%nz_loc)
case default
print *, 'Transpose direction is undefined.'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be useful to report errors to an appropriate output stream, e.g. stderr via something like https://stackoverflow.com/a/8508757. Maybe we want to do this with a global logging system though. Thoughts?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes please for a more appropriate output stream!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We make use of this in the tests already. Changed the print here accordingly.

Could you give more details about a global logging system you have in mind? I haven't really used something like that. Maybe we can start an issue on this for further discussions.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see #30

stop
end select

end subroutine trans_y2x_cuda
end subroutine reorder_cuda

subroutine sum_yzintox_cuda(self, du, dv, dw, &
du_y, dv_y, dw_y, du_z, dv_z, dw_z)
Expand Down
119 changes: 119 additions & 0 deletions src/cuda/kernels_reorder.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
module m_cuda_kernels_reorder
use cudafor

use m_common, only: dp
use m_cuda_common, only: SZ

contains

attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_y
real(dp), device, intent(in), dimension(:, :, :) :: u_x
integer, value, intent(in) :: nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)

call syncthreads()

! copy into output array from shared
u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)

end subroutine reorder_x2y

attributes(global) subroutine reorder_x2z(u_z, u_x, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_z
real(dp), device, intent(in), dimension(:, :, :) :: u_x
integer, value, intent(in) :: nz

integer :: i, j, b_i, b_j, nx!, nz
semi-h marked this conversation as resolved.
Show resolved Hide resolved

i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
nx = gridDim%x

semi-h marked this conversation as resolved.
Show resolved Hide resolved
do j = 1, nz
u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
end do

end subroutine reorder_x2z

attributes(global) subroutine reorder_y2x(u_x, u_y, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_x
real(dp), device, intent(in), dimension(:, :, :) :: u_y
integer, value, intent(in) :: nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

call syncthreads()

! copy into output array from shared
u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)

end subroutine reorder_y2x

attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_z
real(dp), device, intent(in), dimension(:, :, :) :: u_y
integer, value, intent(in) :: nx, nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

call syncthreads()

! copy into output array from shared
u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)

end subroutine reorder_y2z

attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_y
real(dp), device, intent(in), dimension(:, :, :) :: u_z
integer, value, intent(in) :: nx, nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)

call syncthreads()

! copy into output array from shared
u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)

end subroutine reorder_z2y

end module m_cuda_kernels_reorder
47 changes: 4 additions & 43 deletions src/omp/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@ module m_omp_backend
procedure :: transeq_y => transeq_y_omp
procedure :: transeq_z => transeq_z_omp
procedure :: tds_solve => tds_solve_omp
procedure :: trans_x2y => trans_x2y_omp
procedure :: trans_x2z => trans_x2z_omp
procedure :: trans_y2z => trans_y2z_omp
procedure :: trans_z2y => trans_z2y_omp
procedure :: trans_y2x => trans_y2x_omp
procedure :: reorder => reorder_omp
procedure :: sum_yzintox => sum_yzintox_omp
procedure :: vecadd => vecadd_omp
procedure :: set_fields => set_fields_omp
Expand Down Expand Up @@ -164,50 +160,15 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)

end subroutine tds_solve_omp

subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2y_omp

subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w

end subroutine trans_x2z_omp

subroutine trans_y2z_omp(self, u_, u)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u

end subroutine trans_y2z_omp

subroutine trans_z2y_omp(self, u_, u)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u

end subroutine trans_z2y_omp

subroutine trans_y2x_omp(self, u_, u)
subroutine reorder_omp(self, u_, u, direction)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u
integer, intent(in) :: direction

end subroutine trans_y2x_omp
end subroutine reorder_omp

subroutine sum_yzintox_omp(self, du, dv, dw, &
du_y, dv_y, dw_y, du_z, dv_z, dw_z)
Expand Down
Loading