Add transposer kernels for reordering field data. #29

Merged: 8 commits, Feb 6, 2024
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -17,6 +17,7 @@ set(CUDASRC
cuda/allocator.f90
cuda/exec_dist.f90
cuda/kernels_dist.f90
cuda/kernels_trans.f90
cuda/sendrecv.f90
cuda/tdsops.f90
)
19 changes: 3 additions & 16 deletions src/backend.f90
@@ -21,15 +21,16 @@ module m_base_backend
!! architecture.

real(dp) :: nu
integer :: nx_loc, ny_loc, nz_loc
class(allocator_t), pointer :: allocator
class(dirps_t), pointer :: xdirps, ydirps, zdirps
contains
procedure(transeq_ders), deferred :: transeq_x
procedure(transeq_ders), deferred :: transeq_y
procedure(transeq_ders), deferred :: transeq_z
procedure(tds_solve), deferred :: tds_solve
procedure(transposer), deferred :: trans_x2y
procedure(transposer), deferred :: trans_x2z
procedure(trans_d2d), deferred :: trans_x2y
procedure(trans_d2d), deferred :: trans_x2z
procedure(trans_d2d), deferred :: trans_y2z
procedure(trans_d2d), deferred :: trans_z2y
procedure(trans_d2d), deferred :: trans_y2x
@@ -81,20 +82,6 @@ end subroutine tds_solve
end interface

abstract interface
subroutine transposer(self, u_, v_, w_, u, v, w)
!! transposer subroutines are straightforward, they rearrange
!! data into our specialist data structure so that regardless
!! of the direction tridiagonal systems are solved efficiently
!! and fast.
import :: base_backend_t
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w
end subroutine transposer

subroutine trans_d2d(self, u_, u)
!! transposer subroutines are straightforward, they rearrange
!! data into our specialist data structure so that regardless
75 changes: 69 additions & 6 deletions src/cuda/backend.f90
@@ -12,6 +12,8 @@ module m_cuda_backend
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
use m_cuda_tdsops, only: cuda_tdsops_t
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_trans, only: trans_x2y_k, trans_x2z_k, trans_y2x_k, &
trans_y2z_k, trans_z2y_k

implicit none

@@ -74,6 +76,10 @@ function init(globs, allocator) result(backend)
backend%zthreads = dim3(SZ, 1, 1)
backend%zblocks = dim3(globs%n_groups_z, 1, 1)

backend%nx_loc = globs%nx_loc
backend%ny_loc = globs%ny_loc
backend%nz_loc = globs%nz_loc

n_halo = 4
n_block = globs%n_groups_x

@@ -415,21 +421,43 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads)

end subroutine tds_solve_dist

subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
subroutine trans_x2y_cuda(self, u_y, u)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_y, v_y, w_y
class(field_t), intent(in) :: u, v, w
class(field_t), intent(inout) :: u_y
class(field_t), intent(in) :: u

real(dp), device, pointer, dimension(:, :, :) :: u_d, u_y_d
type(dim3) :: blocks, threads

select type(u); type is (cuda_field_t); u_d => u%data_d; end select
Member:
WRT the comment that "select type stuff is annoying" - I agree. One possible way to make it slightly better would be

subroutine set_data_pointer(u, u_d)
  class(field_t), intent(in) :: u
  real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_d
  select type(u)
  type is (cuda_field_t)
    u_d => u%data_d
  end select
end subroutine

and then the code in the main subroutine should be

call set_data_pointer(u, u_d)
call set_data_pointer(u_y, u_y_d)

which is perhaps a little better.

The other option would be to rewrite this as a wrapper subroutine accepting class(field_t) arguments that then calls the actual implementation with type(cuda_field_t) arguments. The wrapper would still need to do the select type dance (and should probably use class default for some error checking), but the actual implementation would be free of this complication, which may or may not be an improvement.
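
For illustration, a rough sketch of that second, wrapper-based option might look like the following (the trans_x2y_cuda_impl name is hypothetical and not part of this PR); the public routine does the select type dance once, with class default as the error check, and delegates to a typed implementation:

subroutine trans_x2y_cuda(self, u_y, u)
  class(cuda_backend_t) :: self
  class(field_t), intent(inout) :: u_y
  class(field_t), intent(in) :: u

  select type (u_y)
  type is (cuda_field_t)
    select type (u)
    type is (cuda_field_t)
      ! both fields are CUDA fields, hand over to the typed implementation
      call trans_x2y_cuda_impl(self, u_y, u)
    class default
      error stop 'trans_x2y_cuda: u is not a cuda_field_t'
    end select
  class default
    error stop 'trans_x2y_cuda: u_y is not a cuda_field_t'
  end select

end subroutine trans_x2y_cuda

subroutine trans_x2y_cuda_impl(self, u_y, u)
  class(cuda_backend_t) :: self
  type(cuda_field_t), intent(inout) :: u_y
  type(cuda_field_t), intent(in) :: u

  ! works directly with u%data_d and u_y%data_d, no select type needed here

end subroutine trans_x2y_cuda_impl

This keeps the select type in one place per entry point instead of once per field pointer.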

Member Author:
I think we need something like this for sure, but I was planning to deal with it later on when fixing the allocator; the issue is mentioned in #24. I think we can also pass the size we need into this set_data_pointer function and handle the bounds remapping there.
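
A rough sketch of that extension (the dims argument and the remapping assignment are assumptions, and rely on pointer bounds remapping being supported for device arrays):

subroutine set_data_pointer(u, u_d, dims)
  class(field_t), intent(in) :: u
  real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_d
  integer, intent(in) :: dims(3)

  select type (u)
  type is (cuda_field_t)
    ! remap the raw allocation to the shape the caller actually needs
    u_d(1:dims(1), 1:dims(2), 1:dims(3)) => u%data_d
  class default
    error stop 'set_data_pointer: not a cuda_field_t'
  end select
end subroutine set_data_pointer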

Member:
I agree, better not to clutter the change.

select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select

blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
threads = dim3(SZ, SZ, 1)

call trans_x2y_k<<<blocks, threads>>>(u_y_d, u_d, self%nz_loc)

end subroutine trans_x2y_cuda

subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
subroutine trans_x2z_cuda(self, u_z, u)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u_z, v_z, w_z
class(field_t), intent(in) :: u, v, w
class(field_t), intent(inout) :: u_z
class(field_t), intent(in) :: u

real(dp), device, pointer, dimension(:, :, :) :: u_d, u_z_d
type(dim3) :: blocks, threads

select type(u); type is (cuda_field_t); u_d => u%data_d; end select
select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select

blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)

call trans_x2z_k<<<blocks, threads>>>(u_z_d, u_d, self%nz_loc)

end subroutine trans_x2z_cuda

@@ -440,6 +468,18 @@ subroutine trans_y2z_cuda(self, u_z, u_y)
class(field_t), intent(inout) :: u_z
class(field_t), intent(in) :: u_y

real(dp), device, pointer, dimension(:, :, :) :: u_z_d, u_y_d
type(dim3) :: blocks, threads

select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select
select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)

call trans_y2z_k<<<blocks, threads>>>(u_z_d, u_y_d, &
self%nx_loc, self%nz_loc)

end subroutine trans_y2z_cuda

subroutine trans_z2y_cuda(self, u_y, u_z)
@@ -449,6 +489,18 @@ subroutine trans_z2y_cuda(self, u_y, u_z)
class(field_t), intent(inout) :: u_y
class(field_t), intent(in) :: u_z

real(dp), device, pointer, dimension(:, :, :) :: u_y_d, u_z_d
type(dim3) :: blocks, threads

select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
select type(u_z); type is (cuda_field_t); u_z_d => u_z%data_d; end select

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)

call trans_z2y_k<<<blocks, threads>>>(u_y_d, u_z_d, &
self%nx_loc, self%nz_loc)

end subroutine trans_z2y_cuda

subroutine trans_y2x_cuda(self, u_x, u_y)
@@ -458,6 +510,17 @@ subroutine trans_y2x_cuda(self, u_x, u_y)
class(field_t), intent(inout) :: u_x
class(field_t), intent(in) :: u_y

real(dp), device, pointer, dimension(:, :, :) :: u_x_d, u_y_d
type(dim3) :: blocks, threads

select type(u_x); type is (cuda_field_t); u_x_d => u_x%data_d; end select
select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)

call trans_y2x_k<<<blocks, threads>>>(u_x_d, u_y_d, self%nz_loc)

end subroutine trans_y2x_cuda

subroutine sum_yzintox_cuda(self, du, dv, dw, &
119 changes: 119 additions & 0 deletions src/cuda/kernels_trans.f90
@@ -0,0 +1,119 @@
module m_cuda_kernels_trans
use cudafor

use m_common, only: dp
use m_cuda_common, only: SZ

contains

attributes(global) subroutine trans_x2y_k(u_y, u_x, nz)
Collaborator:
Similarly, I feel like these can be refactored into a single subroutine

Member Author:
I'm not sure we can merge all these kernels into one while keeping the performance good. This is specific to the CUDA backend though; on the OpenMP backend I think it is possible. What you have in a CUDA kernel is essentially the body of some number of nested loops, and any conditional checks in there can hurt performance.

Collaborator:
yes ok, that's the part I wasn't sure about.
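
To make the concern concrete, a merged kernel would need a runtime flag to select the index mapping, roughly as in the sketch below (the trans_any_k kernel is hypothetical and not part of this PR; only two of the five mappings are shown). The conditional check runs in every thread, on both the read and the write side, which is the overhead that keeping the kernels separate avoids:

attributes(global) subroutine trans_any_k(u_out, u_in, nz, dir)
  implicit none

  real(dp), device, intent(out), dimension(:, :, :) :: u_out
  real(dp), device, intent(in), dimension(:, :, :) :: u_in
  integer, value, intent(in) :: nz
  integer, value, intent(in) :: dir ! 1: x2y, 2: y2x

  real(dp), shared :: tile(SZ, SZ)
  integer :: i, j, b_i, b_j, b_k

  i = threadIdx%x; j = threadIdx%y
  b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

  ! global-to-shared index mapping now depends on a runtime flag
  select case (dir)
  case (1)
    tile(i, j) = u_in(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)
  case (2)
    tile(i, j) = u_in(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)
  end select

  call syncthreads()

  ! and so does the shared-to-global mapping on the write side
  select case (dir)
  case (1)
    u_out(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)
  case (2)
    u_out(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)
  end select

end subroutine trans_any_k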

implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_y
real(dp), device, intent(in), dimension(:, :, :) :: u_x
integer, value, intent(in) :: nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz)

call syncthreads()

! copy into output array from shared
u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i)

end subroutine trans_x2y_k

attributes(global) subroutine trans_x2z_k(u_z, u_x, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_z
real(dp), device, intent(in), dimension(:, :, :) :: u_x
integer, value, intent(in) :: nz

integer :: i, j, b_i, b_j, nx!, nz

i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y
nx = gridDim%x

do j = 1, nz
u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz)
end do

end subroutine trans_x2z_k

attributes(global) subroutine trans_y2x_k(u_x, u_y, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_x
real(dp), device, intent(in), dimension(:, :, :) :: u_y
integer, value, intent(in) :: nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

call syncthreads()

! copy into output array from shared
u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i)

end subroutine trans_y2x_k

attributes(global) subroutine trans_y2z_k(u_z, u_y, nx, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_z
real(dp), device, intent(in), dimension(:, :, :) :: u_y
integer, value, intent(in) :: nx, nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k)

call syncthreads()

! copy into output array from shared
u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i)

end subroutine trans_y2z_k

attributes(global) subroutine trans_z2y_k(u_y, u_z, nx, nz)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_y
real(dp), device, intent(in), dimension(:, :, :) :: u_z
integer, value, intent(in) :: nx, nz

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z

! copy into shared
tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx)

call syncthreads()

! copy into output array from shared
u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i)

end subroutine trans_z2y_k

end module m_cuda_kernels_trans
12 changes: 6 additions & 6 deletions src/omp/backend.f90
@@ -164,21 +164,21 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops)

end subroutine tds_solve_omp

subroutine trans_x2y_omp(self, u_, v_, w_, u, v, w)
subroutine trans_x2y_omp(self, u_, u)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u

end subroutine trans_x2y_omp

subroutine trans_x2z_omp(self, u_, v_, w_, u, v, w)
subroutine trans_x2z_omp(self, u_, u)
implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u_, v_, w_
class(field_t), intent(in) :: u, v, w
class(field_t), intent(inout) :: u_
class(field_t), intent(in) :: u

end subroutine trans_x2z_omp

14 changes: 11 additions & 3 deletions src/solver.f90
@@ -185,7 +185,10 @@ subroutine transeq(self, du, dv, dw, u, v, w)
dw_y => self%backend%allocator%get_block()

! reorder data from x orientation to y orientation
call self%backend%trans_x2y(u_y, v_y, w_y, u, v, w)
call self%backend%trans_x2y(u_y, u)
call self%backend%trans_x2y(v_y, v)
call self%backend%trans_x2y(w_y, w)

! similar to the x direction, obtain derivatives in y.
call self%backend%transeq_y(du_y, dv_y, dw_y, u_y, v_y, w_y, self%ydirps)

Expand All @@ -206,7 +209,10 @@ subroutine transeq(self, du, dv, dw, u, v, w)
dw_z => self%backend%allocator%get_block()

! reorder from x to z
call self%backend%trans_x2z(u_z, v_z, w_z, u, v, w)
call self%backend%trans_x2z(u_z, u)
call self%backend%trans_x2z(v_z, v)
call self%backend%trans_x2z(w_z, w)

! get the derivatives in z
call self%backend%transeq_z(du_z, dv_z, dw_z, u_z, v_z, w_z, self%zdirps)

@@ -261,7 +267,9 @@ subroutine divergence(self, div_u, u, v, w)
w_y => self%backend%allocator%get_block()

! reorder data from x orientation to y orientation
call self%backend%trans_x2y(u_y, v_y, w_y, du_x, dv_x, dw_x)
call self%backend%trans_x2y(u_y, du_x)
call self%backend%trans_x2y(v_y, dv_x)
call self%backend%trans_x2y(w_y, dw_x)

call self%backend%allocator%release_block(du_x)
call self%backend%allocator%release_block(dv_x)