From d4abfd7b8539800ee2a37c4e23eb1ed934cd453d Mon Sep 17 00:00:00 2001 From: Asim YarKhan Date: Thu, 19 Oct 2023 16:19:45 +0000 Subject: [PATCH] For CUDA fro-norm kernels, move syncthreads out of loop. Problem discovered under dpct-generated-sycl kernels. --- src/cuda/device_genorm.cu | 2 +- src/cuda/device_henorm.cu | 2 +- src/cuda/device_synorm.cu | 2 +- src/cuda/device_trnorm.cu | 2 +- src/hip/device_genorm.hip.cc | 2 +- src/hip/device_genorm.hip.cc.dep | 2 +- src/hip/device_henorm.hip.cc | 2 +- src/hip/device_henorm.hip.cc.dep | 2 +- src/hip/device_synorm.hip.cc | 2 +- src/hip/device_synorm.hip.cc.dep | 2 +- src/hip/device_trnorm.hip.cc | 2 +- src/hip/device_trnorm.hip.cc.dep | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/cuda/device_genorm.cu b/src/cuda/device_genorm.cu index 651a65dc3..a116ffb53 100644 --- a/src/cuda/device_genorm.cu +++ b/src/cuda/device_genorm.cu @@ -262,11 +262,11 @@ __global__ void genorm_fro_kernel( // Save partial results in shared memory. combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { tile_scale = row_scale[0]; tile_sumsq = row_sumsq[0]; diff --git a/src/cuda/device_henorm.cu b/src/cuda/device_henorm.cu index bb802a374..d19219c5a 100644 --- a/src/cuda/device_henorm.cu +++ b/src/cuda/device_henorm.cu @@ -227,11 +227,11 @@ __global__ void henorm_fro_kernel( row_sumsq[chunk] = 1; } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/cuda/device_synorm.cu b/src/cuda/device_synorm.cu index 229f21098..9ab1e74a5 100644 --- a/src/cuda/device_synorm.cu +++ b/src/cuda/device_synorm.cu @@ -219,11 +219,11 @@ __global__ void synorm_fro_kernel( row_sumsq[chunk] = 1; } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/cuda/device_trnorm.cu b/src/cuda/device_trnorm.cu index 82acf947a..6f740c701 100644 --- a/src/cuda/device_trnorm.cu +++ b/src/cuda/device_trnorm.cu @@ -335,11 +335,11 @@ __global__ void trnorm_fro_kernel( } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/hip/device_genorm.hip.cc b/src/hip/device_genorm.hip.cc index fea850194..145ea0b1e 100644 --- a/src/hip/device_genorm.hip.cc +++ b/src/hip/device_genorm.hip.cc @@ -263,11 +263,11 @@ __global__ void genorm_fro_kernel( // Save partial results in shared memory. combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { tile_scale = row_scale[0]; tile_sumsq = row_sumsq[0]; diff --git a/src/hip/device_genorm.hip.cc.dep b/src/hip/device_genorm.hip.cc.dep index 3df42daa2..8032f5562 100644 --- a/src/hip/device_genorm.hip.cc.dep +++ b/src/hip/device_genorm.hip.cc.dep @@ -1 +1 @@ -0e0a5d56961c18926d854137b793f41e src/cuda/device_genorm.cu +7e00edc231757cb85aff3874a2df26fe src/cuda/device_genorm.cu diff --git a/src/hip/device_henorm.hip.cc b/src/hip/device_henorm.hip.cc index 695a5ce99..b19df4ca0 100644 --- a/src/hip/device_henorm.hip.cc +++ b/src/hip/device_henorm.hip.cc @@ -228,11 +228,11 @@ __global__ void henorm_fro_kernel( row_sumsq[chunk] = 1; } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/hip/device_henorm.hip.cc.dep b/src/hip/device_henorm.hip.cc.dep index fd4fd64d4..2e2afad73 100644 --- a/src/hip/device_henorm.hip.cc.dep +++ b/src/hip/device_henorm.hip.cc.dep @@ -1 +1 @@ -edb4c4922d8baff7e67b5aa4cd53cd4a src/cuda/device_henorm.cu +59ba88d555511c8042a44b64f5a10658 src/cuda/device_henorm.cu diff --git a/src/hip/device_synorm.hip.cc b/src/hip/device_synorm.hip.cc index dff67ad62..05772502d 100644 --- a/src/hip/device_synorm.hip.cc +++ b/src/hip/device_synorm.hip.cc @@ -220,11 +220,11 @@ __global__ void synorm_fro_kernel( row_sumsq[chunk] = 1; } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/hip/device_synorm.hip.cc.dep b/src/hip/device_synorm.hip.cc.dep index 3e753b908..c4d74adbd 100644 --- a/src/hip/device_synorm.hip.cc.dep +++ b/src/hip/device_synorm.hip.cc.dep @@ -1 +1 @@ -bb1a7390496864beb7a2999f514f2583 src/cuda/device_synorm.cu +1379a5e2fc495c10f43521cb14c3302b src/cuda/device_synorm.cu diff --git a/src/hip/device_trnorm.hip.cc b/src/hip/device_trnorm.hip.cc index c6c3ee5b9..5ae971f92 100644 --- a/src/hip/device_trnorm.hip.cc +++ b/src/hip/device_trnorm.hip.cc @@ -336,11 +336,11 @@ __global__ void trnorm_fro_kernel( } combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq); - __syncthreads(); } // Reduction to find sum-of-squares of tile. // todo: parallel reduction. + __syncthreads(); if (threadIdx.x == 0) { real_t tile_scale = row_scale[0]; real_t tile_sumsq = row_sumsq[0]; diff --git a/src/hip/device_trnorm.hip.cc.dep b/src/hip/device_trnorm.hip.cc.dep index 20e099a61..310264d05 100644 --- a/src/hip/device_trnorm.hip.cc.dep +++ b/src/hip/device_trnorm.hip.cc.dep @@ -1 +1 @@ -ca0b95ee7b4f23a6edb8e608e9061207 src/cuda/device_trnorm.cu +e10e6af7419166f911576cd5d18653ad src/cuda/device_trnorm.cu