Skip to content

Commit

Permalink
Merge pull request #45 from awnawab/naan-port-propags2
Browse files Browse the repository at this point in the history
Enable GPU offload of depth and current refraction solver
  • Loading branch information
wdeconinck authored Dec 5, 2024
2 parents 1f274f3 + 3dd78a4 commit 682893c
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 28 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ translation toolchain Loki. Currently, three Loki transformations are supported:
The scc-hoist and scc-stack transformations offer superior performance to the scc transformation. Currently, only the
OpenACC programming model on Nvidia GPUs is supported.

NB: GPU offload is not yet supported for ecWAM 1.4.x.
NB: GPU offload is not supported for ecWAM 1.4.0.

Building
--------
Expand Down
6 changes: 5 additions & 1 deletion share/ecwam/scripts/ecwam_run_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,14 @@ nproma=$(read_config nproma --default=24)
# Model options taken from the run configuration via read_config; each one
# falls back to the value given with --default when read_config yields nothing.
iphys=$(read_config iphys --default=1)
llgcbz0=$(read_config llgcbz0 --default=F)
llnormagam=$(read_config llnormagam --default=F)
# Refraction option; written to the IREFRA entry of wam_namelist further down.
irefra=$(read_config irefra --default=0)

# read timesteps (requested from read_config with --format=seconds)
phys_tstp=$(read_config physics.timestep --format=seconds --default=900)
adv_base_tstp=$(read_config advection.timestep --format=seconds --default=900)
# The fast-wave advection timestep defaults to the base advection timestep.
adv_fast_tstp=$(read_config advection.fast_waves.timestep --format=seconds --default=$adv_base_tstp)
ifrelfmax=$(read_config advection.fast_waves.max_frequency --default=0)
# Written to the IDELCUR entry of wam_namelist further down.
# NOTE(review): read without --format=seconds, unlike the timesteps above;
# presumably 86400 is already in seconds (one day) - confirm.
idelcur=$(read_config currents.input_step --default=86400)

# verify timesteps
if [ $(( $adv_base_tstp%$adv_fast_tstp )) -ne 0 ] ; then
Expand Down Expand Up @@ -211,11 +213,13 @@ cat > wam_namelist << EOF
CBPLTDT = "${begofrn}",
CEPLTDT = "${endofrn}",
CDATEF = "${begoffo}",
CDATECURA = "${begofrn}",
DELPRO_LF = ${adv_fast_tstp},
IFRELFMAX = ${ifrelfmax},
IDELPRO = ${adv_base_tstp},
IDELT = ${phys_tstp},
IDELINT = ${ppfreq},
IDELCUR = ${idelcur},
IREST = 1,
LFDBIOOUT = F,
LFDB = F,
Expand All @@ -228,7 +232,7 @@ cat > wam_namelist << EOF
LLNORMAGAM = ${llnormagam},
IPROPAGS = 2,
LSUBGRID = F,
IREFRA = 0,
IREFRA = ${irefra},
LICERUN = ${licerun},
LMASKICE = T,
LWAMRSETCI = T,
Expand Down
13 changes: 13 additions & 0 deletions src/ecwam/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,19 @@ if( HAVE_LOKI )
target_compile_definitions( ${ecwam} PRIVATE WAM_GPU )
endif()

# Extra Loki pass dedicated to propags2: run in "idem" mode with the OpenACC
# directive and its own configuration file (ecwam_propags2_loki.config).
# It is skipped when LOKI_MODE is exactly "idem" or "idem-stack". The regex is
# anchored so that only those two modes are excluded, rather than any mode
# string that merely contains "idem".
if( NOT LOKI_MODE MATCHES "^idem(-stack)?$" )
  # Preprocess propags2.F90 for GPU enabled runs using Loki
  loki_transform_target( TARGET ${ecwam}
    MODE "idem"
    DIRECTIVE openacc
    FRONTEND ${LOKI_FRONTEND}
    CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/ecwam_propags2_loki.config
    PLAN ${CMAKE_CURRENT_BINARY_DIR}/loki_propags2_plan_ecwam.cmake
    SOURCES
      ${CMAKE_CURRENT_SOURCE_DIR}/
  )
endif()

# Apply Loki source file transformation to lib target
loki_transform_target( TARGET ${ecwam}
MODE ${LOKI_MODE}
Expand Down
19 changes: 4 additions & 15 deletions src/ecwam/ctuwupdt.F90
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,13 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
IF (IREFRA == 2 .OR. IREFRA == 3) THEN
IF (.NOT. ALLOCATED(WMPMN)) ALLOCATE(WMPMN(IJS:IJL,NANG,NFRE_RED,-1:1))

#ifndef _OPENACC
IF (.NOT. ALLOCATED(LLWLATN)) ALLOCATE(LLWLATN(NANG,NFRE_RED,2,2))
IF (.NOT. ALLOCATED(LLWLONN)) ALLOCATE(LLWLONN(NANG,NFRE_RED,2))
IF (.NOT. ALLOCATED(LLWCORN)) ALLOCATE(LLWCORN(NANG,NFRE_RED,4,2))
IF (.NOT. ALLOCATED(LLWKPMN)) ALLOCATE(LLWKPMN(NANG,NFRE_RED,-1:1))
IF (.NOT. ALLOCATED(LLWMPMN)) ALLOCATE(LLWMPMN(NANG,NFRE_RED,-1:1))
#endif
ENDIF


Expand Down Expand Up @@ -255,15 +257,14 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
! FIND THE LOGICAL FLAGS THAT WILL LIMIT THE EXTENT OF THE CALCULATION IN PROPAGS2
! IN CASE REFRACTION IS USED

#ifndef _OPENACC
IF (IREFRA == 2 .OR. IREFRA == 3) THEN

!$acc parallel loop independent collapse(4)
DO ICL=1,2
DO IC=1,2
DO K=1,NANG
DO M=1,NFRE_RED
LLWLATN(K,M,IC,ICL)=.FALSE.
!$acc loop
DO IJ=IJS,IJL
IF (WLATN(IJ,K,M,IC,ICL) > 0.0_JWRB) THEN
LLWLATN(K,M,IC,ICL)=.TRUE.
Expand All @@ -274,14 +275,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
ENDDO
ENDDO
ENDDO
!$acc end parallel

!$acc parallel loop independent collapse(3)
DO IC=1,2
DO M=1,NFRE_RED
DO K=1,NANG
LLWLONN(K,M,IC)=.FALSE.
!$acc loop
DO IJ=IJS,IJL
IF (WLONN(IJ,K,M,IC) > 0.0_JWRB) THEN
LLWLONN(K,M,IC)=.TRUE.
Expand All @@ -291,15 +289,12 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
ENDDO
ENDDO
ENDDO
!$acc end parallel

!$acc parallel loop independent collapse(4)
DO ICL=1,2
DO ICR=1,4
DO M=1,NFRE_RED
DO K=1,NANG
LLWCORN(K,M,ICR,ICL)=.FALSE.
!$acc loop
DO IJ=IJS,IJL
IF (WCORN(IJ,K,M,ICR,ICL) > 0.0_JWRB) THEN
LLWCORN(K,M,ICR,ICL)=.TRUE.
Expand All @@ -310,14 +305,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
ENDDO
ENDDO
ENDDO
!$acc end parallel

!$acc parallel loop independent collapse(3)
DO IC=-1,1
DO M=1,NFRE_RED
DO K=1,NANG
LLWKPMN(K,M,IC)=.FALSE.
!$acc loop
DO IJ=IJS,IJL
IF (WKPMN(IJ,K,M,IC) > 0.0_JWRB) THEN
LLWKPMN(K,M,IC)=.TRUE.
Expand All @@ -327,14 +319,11 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
ENDDO
ENDDO
ENDDO
!$acc end parallel

!$acc parallel loop independent collapse(3)
DO IC=-1,1
DO M=1,NFRE_RED
DO K=1,NANG
LLWMPMN(K,M,IC)=.FALSE.
!$acc loop
DO IJ=IJS,IJL
IF (WMPMN(IJ,K,M,IC) > 0.0_JWRB) THEN
LLWMPMN(K,M,IC)=.TRUE.
Expand All @@ -344,9 +333,9 @@ SUBROUTINE CTUWUPDT (IJS, IJL, NINF, NSUP, &
ENDDO
ENDDO
ENDDO
!$acc end parallel

ENDIF
#endif

IF (ALLOCATED(THDD)) DEALLOCATE(THDD)
IF (ALLOCATED(THDC)) DEALLOCATE(THDC)
Expand Down
57 changes: 57 additions & 0 deletions src/ecwam/ecwam_propags2_loki.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Loki configuration for the dedicated propags2 preprocessing pass.
# The only routine declared in this file is propags2 (see [routines.propags2]).

# Settings under [default].
# NOTE(review): presumably these apply to every routine that does not override
# them in its own [routines.*] table - confirm against the Loki scheduler docs.
[default]
mode = "idem"
role = "kernel"
expand = true
strict = true
replicate = true

# Utility calls and IO statements to remove
# (dead-code removal is switched on as well; kernel_only = true).
[transformations.RemoveCodeTransformation]
module = "loki.transformations"
[transformations.RemoveCodeTransformation.options]
remove_dead_code = true
kernel_only = true

# Loop transformations
# Both entries below use the same class, TransformLoopsTransformation, and
# differ only in the option they enable: unrolling for the first, fusion for
# the second.
[transformations.LoopUnrollTransformation]
module = "loki.transformations"
classname = "TransformLoopsTransformation"
[transformations.LoopUnrollTransformation.options]
loop_unroll = true

[transformations.LoopFuseTransformation]
module = "loki.transformations"
classname = "TransformLoopsTransformation"
[transformations.LoopFuseTransformation.options]
loop_fusion = true

# SubstituteExpressionTransformation
# The expression_map pairs each LLW* logical-flag reference with ".true.".
# NOTE(review): presumably every occurrence of a left-hand expression is
# replaced by the right-hand one, so the flag tests are treated as always
# true in the transformed code - confirm against the Loki documentation.
[transformations.SubstituteExpressionTransformation]
module = "loki.transformations"
[transformations.SubstituteExpressionTransformation.options]
substitute_expressions = true
substitute_body = true
[transformations.SubstituteExpressionTransformation.options.expression_map]
"llwlonn(k,m,ic)" = ".true."
"llwlatn(k,m,ic,icl)" = ".true."
"llwcorn(k,m,icr,icl)" = ".true."
"llwkpmn(k,m,ic)" = ".true."
"llwmpmn(k,m,ic)" = ".true."

# Idem transformation
[transformations.IdemTransformation]
module = "loki.transformations"

# loki pipelines
# The "idem" pipeline lists the transformations declared above.
# NOTE(review): presumably they are applied in the order listed - confirm.
[pipelines.idem]
transformations = [
'SubstituteExpressionTransformation', 'RemoveCodeTransformation', 'LoopUnrollTransformation',
'LoopFuseTransformation', 'IdemTransformation'
]

# Define entry point for call-tree transformation
# propags2 keeps role = "kernel" but sets expand and replicate to false,
# unlike the [default] table.
[routines.propags2]
role = "kernel"
expand = false
replicate = false

14 changes: 14 additions & 0 deletions src/ecwam/gradi.F90
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
!* 1. INITIALISE.
! -----------

!$acc data present(KLAT,WLAT,DPTHEXT) copyin(DELLAM)

NLAND=NSUP+1
ONEO2DELPHI = 0.5_JWRB/DELPHI

Expand All @@ -116,6 +118,7 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
! --------------------------

IF (IREFRA == 1 .OR. IREFRA == 3) THEN
!$acc kernels
DO IJ=KIJS,KIJL
IPP = KLAT(IJ,2,1)
IPM = KLAT(IJ,1,1)
Expand Down Expand Up @@ -146,11 +149,14 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
DDLAM(IJ) = 0.0_JWRB
ENDIF
ENDDO
!$acc end kernels
ELSE
!$acc kernels
DO IJ=KIJS,KIJL
DDPHI(IJ) = 0.0_JWRB
DDLAM(IJ) = 0.0_JWRB
ENDDO
!$acc end kernels
ENDIF

! ----------------------------------------------------------------------
Expand All @@ -159,6 +165,7 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
! -------------------------------------

IF (IREFRA == 2 .OR. IREFRA == 3) THEN
!$acc kernels
DO IJ=KIJS,KIJL
IPP = KLAT(IJ,2,1)
! exact 0 means that the current field was not defined, hence
Expand Down Expand Up @@ -206,7 +213,9 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
DVLAM(IJ) = 0.0_JWRB
ENDIF
ENDDO
!$acc end kernels

!$acc kernels
DO IJ=KIJS,KIJL
KX = BLK2GLO%KXLT(IJ)
CGMAX = CURRENT_GRADIENT_MAX*COSPH(KX)
Expand All @@ -215,16 +224,21 @@ SUBROUTINE GRADI (KIJS, KIJL, NINF, NSUP, IREFRA, &
DULAM(IJ) = SIGN(MIN(ABS(DULAM(IJ)),CGMAX),DULAM(IJ))
DVLAM(IJ) = SIGN(MIN(ABS(DVLAM(IJ)),CGMAX),DVLAM(IJ))
ENDDO
!$acc end kernels

ELSE
!$acc kernels
DO IJ=KIJS,KIJL
DUPHI(IJ) = 0.0_JWRB
DVPHI(IJ) = 0.0_JWRB
DULAM(IJ) = 0.0_JWRB
DVLAM(IJ) = 0.0_JWRB
ENDDO
!$acc end kernels
ENDIF

!$acc end data

IF (LHOOK) CALL DR_HOOK('GRADI',1,ZHOOK_HANDLE)

END SUBROUTINE GRADI
15 changes: 10 additions & 5 deletions src/ecwam/propag_wam.F90
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, &
!!! the advection schemes are still written in block structure
!!! mapping chunks to block ONLY for actual grid points !!!!
#ifdef _OPENACC
!$acc kernels loop independent private(KIJS, IJSB, KIJL, IJLB)
!$acc parallel loop private(KIJS, IJSB, KIJL, IJLB)
#else
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(ICHNK, KIJS, IJSB, KIJL, IJLB, M, K)
#endif /*_OPENACC*/
Expand All @@ -134,7 +134,7 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, &
ENDDO
ENDDO
#ifdef _OPENACC
!$acc end kernels
!$acc end parallel loop
#else
!$OMP END PARALLEL DO
#endif /*_OPENACC*/
Expand Down Expand Up @@ -171,9 +171,6 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, &
! ---------------------

IF (LLUPDTTD) THEN
#ifdef _OPENACC
CALL WAM_ABORT("PROPAG_WAM: BRANCH NOT YET PORTED FOR GPU EXECUTION")
#endif
IF (.NOT.ALLOCATED(THDC)) ALLOCATE(THDC(IJSG:IJLG, NANG))
IF (.NOT.ALLOCATED(THDD)) ALLOCATE(THDD(IJSG:IJLG, NANG))
IF (.NOT.ALLOCATED(SDOT)) ALLOCATE(SDOT(IJSG:IJLG, NANG, NFRE_RED))
Expand All @@ -187,7 +184,11 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, &

! DOT THETA TERM:

#ifdef _OPENACC
!$acc data create(THDC,THDD,SDOT) present(BUFFER_EXT,BLK2GLO)
#else
!$OMP PARALLEL DO SCHEDULE(DYNAMIC,1) PRIVATE(JKGLO, KIJS, KIJL)
#endif
DO JKGLO = IJSG, IJLG, NPROMA
KIJS=JKGLO
KIJL=MIN(KIJS+NPROMA-1, IJLG)
Expand All @@ -199,7 +200,11 @@ SUBROUTINE PROPAG_WAM (BLK2GLO, WAVNUM, CGROUP, OMOSNH2KD, FL1, &
& BUFFER_EXT(:,3*NFRE_RED+4), BUFFER_EXT(:,3*NFRE_RED+5), &
& THDC(KIJS:KIJL,:), THDD(KIJS:KIJL,:), SDOT(KIJS:KIJL,:,:))
ENDDO
#ifdef _OPENACC
!$acc end data
#else
!$OMP END PARALLEL DO
#endif

LLUPDTTD = .FALSE.
ENDIF
Expand Down
Loading

0 comments on commit 682893c

Please sign in to comment.