Skip to content

Commit

Permalink
Merge pull request #130 from ayarkhan/cuda-norm-syncthreads-fix
Browse files Browse the repository at this point in the history
For CUDA fro-norm kernels, move syncthreads out of loop.  Problem discovered under dpct-generated-sycl kernels.
  • Loading branch information
mgates3 authored Oct 23, 2023
2 parents 8354553 + d4abfd7 commit fbcb9df
Show file tree
Hide file tree
Showing 12 changed files with 12 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/cuda/device_genorm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,11 @@ __global__ void genorm_fro_kernel(

// Save partial results in shared memory.
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
tile_scale = row_scale[0];
tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/device_henorm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,11 @@ __global__ void henorm_fro_kernel(
row_sumsq[chunk] = 1;
}
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/device_synorm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,11 @@ __global__ void synorm_fro_kernel(
row_sumsq[chunk] = 1;
}
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/device_trnorm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,11 @@ __global__ void trnorm_fro_kernel(
}

combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/hip/device_genorm.hip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,11 @@ __global__ void genorm_fro_kernel(

// Save partial results in shared memory.
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
tile_scale = row_scale[0];
tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/hip/device_genorm.hip.cc.dep
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0e0a5d56961c18926d854137b793f41e src/cuda/device_genorm.cu
7e00edc231757cb85aff3874a2df26fe src/cuda/device_genorm.cu
2 changes: 1 addition & 1 deletion src/hip/device_henorm.hip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,11 @@ __global__ void henorm_fro_kernel(
row_sumsq[chunk] = 1;
}
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/hip/device_henorm.hip.cc.dep
Original file line number Diff line number Diff line change
@@ -1 +1 @@
edb4c4922d8baff7e67b5aa4cd53cd4a src/cuda/device_henorm.cu
59ba88d555511c8042a44b64f5a10658 src/cuda/device_henorm.cu
2 changes: 1 addition & 1 deletion src/hip/device_synorm.hip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,11 @@ __global__ void synorm_fro_kernel(
row_sumsq[chunk] = 1;
}
combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/hip/device_synorm.hip.cc.dep
Original file line number Diff line number Diff line change
@@ -1 +1 @@
bb1a7390496864beb7a2999f514f2583 src/cuda/device_synorm.cu
1379a5e2fc495c10f43521cb14c3302b src/cuda/device_synorm.cu
2 changes: 1 addition & 1 deletion src/hip/device_trnorm.hip.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,11 +336,11 @@ __global__ void trnorm_fro_kernel(
}

combine_sumsq(row_scale[chunk], row_sumsq[chunk], scale, sumsq);
__syncthreads();
}

// Reduction to find sum-of-squares of tile.
// todo: parallel reduction.
__syncthreads();
if (threadIdx.x == 0) {
real_t tile_scale = row_scale[0];
real_t tile_sumsq = row_sumsq[0];
Expand Down
2 changes: 1 addition & 1 deletion src/hip/device_trnorm.hip.cc.dep
Original file line number Diff line number Diff line change
@@ -1 +1 @@
ca0b95ee7b4f23a6edb8e608e9061207 src/cuda/device_trnorm.cu
e10e6af7419166f911576cd5d18653ad src/cuda/device_trnorm.cu

0 comments on commit fbcb9df

Please sign in to comment.