Skip to content

Commit

Permalink
feat(cuda): Add transposer kernels for reordering field data.
Browse files Browse the repository at this point in the history
  • Loading branch information
semi-h committed Jan 24, 2024
1 parent 26d4d4c commit 1aa737c
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ set(CUDASRC
cuda/allocator.f90
cuda/exec_dist.f90
cuda/kernels_dist.f90
cuda/kernels_trans.f90
cuda/sendrecv.f90
cuda/tdsops.f90
)
Expand Down
15 changes: 15 additions & 0 deletions src/cuda/backend.f90
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module m_cuda_backend
use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields
use m_cuda_tdsops, only: cuda_tdsops_t
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_trans, only: trans_x2y_k

implicit none

Expand Down Expand Up @@ -422,6 +423,20 @@ subroutine trans_x2y_cuda(self, u_y, v_y, w_y, u, v, w)
class(field_t), intent(inout) :: u_y, v_y, w_y
class(field_t), intent(in) :: u, v, w

real(dp), device, pointer, dimension(:, :, :) :: u_d, v_d, w_d, &
u_y_d, v_y_d, w_y_d

select type(u); type is (cuda_field_t); u_d => u%data_d; end select
select type(v); type is (cuda_field_t); v_d => v%data_d; end select
select type(w); type is (cuda_field_t); w_d => w%data_d; end select
select type(u_y); type is (cuda_field_t); u_y_d => u_y%data_d; end select
select type(v_y); type is (cuda_field_t); v_y_d => v_y%data_d; end select
select type(w_y); type is (cuda_field_t); w_y_d => w_y%data_d; end select

call trans_x2y_k<<<self%xblocks, self%xthreads>>>(u_y_d, u_d)
call trans_x2y_k<<<self%xblocks, self%xthreads>>>(v_y_d, v_d)
call trans_x2y_k<<<self%xblocks, self%xthreads>>>(w_y_d, w_d)

end subroutine trans_x2y_cuda

subroutine trans_x2z_cuda(self, u_z, v_z, w_z, u, v, w)
Expand Down
32 changes: 32 additions & 0 deletions src/cuda/kernels_trans.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module m_cuda_kernels_trans
use cudafor

use m_common, only: dp
use m_cuda_common, only: SZ

contains

attributes(global) subroutine trans_x2y_k(u_y, u_x)
implicit none

real(dp), device, intent(out), dimension(:, :, :) :: u_y
real(dp), device, intent(in), dimension(:, :, :) :: u_x

real(dp), shared :: tile(SZ, SZ)
integer :: i, j, b_i, b_j, b_k, nz

i = threadIdx%x; j = threadIdx%y
b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
nz = gridDim%z*blockDim%z

! copy into shared
tile(i, j) = u_x(i, j+(b_i-1)*SZ, b_j+(b_k-1)*nz)

call syncthreads()

! copy into output array from shared
u_y(i, j+(b_k-1)*SZ, b_j+(b_i-1)*nz) = tile(j, i)

end subroutine trans_x2y_k

end module m_cuda_kernels_trans

0 comments on commit 1aa737c

Please sign in to comment.