Skip to content

Commit

Permalink
Merge TSAN fixes
Browse files Browse the repository at this point in the history
This PR includes fixes for errors reported by the TSAN. The changes include:

- RCM: locking the write mutex when checking the queue size 
- SparsityCsr: using atomic load/store when checking of other threads report that the matrix is unsorted
- Par IC/ILU (t): use atomic load/stores in the same way as the CUDA/HIP implementations
- MG: 'fix' a write-after-write race in the kcycle stopping criteria. Honestly, I don't think this is necessary, but it stops TSAN from complaining.
- Batch Solvers: fixed the generation of the 3pt stencil matrix for tests, which caused data races.
- PGM: The unified implementation of the `match_edge`kernel has been split up into separate implementations for each backend, and atomic load/stores are added.
- NOT FIXED:  [PGM] There is a data race in the non-deterministic part of `assign_to_exist_agg`, but I ignored that, since it is marked as non-deterministic anyway.

Related PR: #1743
  • Loading branch information
MarcelKoch authored Dec 8, 2024
2 parents f95fc48 + 6ad5853 commit 059823f
Show file tree
Hide file tree
Showing 15 changed files with 261 additions and 58 deletions.
40 changes: 40 additions & 0 deletions common/cuda_hip/multigrid/pgm_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/memory.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"


namespace gko {
Expand All @@ -28,6 +30,44 @@ namespace GKO_DEVICE_NAMESPACE {
* @ingroup pgm
*/
namespace pgm {
namespace kernels {


template <typename IndexType>
__global__ void match_edge(size_type size,
const IndexType* __restrict__ strongest_neighbor,
IndexType* __restrict__ agg)
{
auto tidx = static_cast<IndexType>(thread::get_thread_id_flat<int64>());
if (tidx >= size || load_relaxed(agg + tidx) != -1) {
return;
}
auto neighbor = strongest_neighbor[tidx];
if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
tidx <= neighbor) {
store_relaxed(agg + tidx, tidx);
store_relaxed(agg + neighbor, tidx);
}
}


} // namespace kernels


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
constexpr int default_block_size = 512;
auto num_blocks = ceildiv(agg.get_size(), default_block_size);
kernels::
match_edge<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
agg.get_size(), strongest_neighbor.get_const_data(),
agg.get_data());
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
Expand Down
25 changes: 0 additions & 25 deletions common/unified/multigrid/pgm_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,6 @@ namespace GKO_DEVICE_NAMESPACE {
namespace pgm {


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
run_kernel(
exec,
[] GKO_KERNEL(auto tidx, auto strongest_neighbor_vals, auto agg_vals) {
if (agg_vals[tidx] != -1) {
return;
}
auto neighbor = strongest_neighbor_vals[tidx];
if (neighbor != -1 && strongest_neighbor_vals[neighbor] == tidx &&
tidx <= neighbor) {
// Use the smaller index as agg point
agg_vals[tidx] = tidx;
agg_vals[neighbor] = tidx;
}
},
agg.get_size(), strongest_neighbor.get_const_data(), agg.get_data());
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
void count_unagg(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& agg, IndexType* num_unagg)
Expand Down
2 changes: 1 addition & 1 deletion core/test/utils/batch_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ std::unique_ptr<MatrixType> generate_3pt_stencil_batch_matrix(
{}};
for (int row = 0; row < num_rows; ++row) {
if (row > 0) {
data.nonzeros.emplace_back(row - 1, row, value_type{-1.0});
data.nonzeros.emplace_back(row, row - 1, value_type{-1.0});
}
data.nonzeros.emplace_back(row, row, value_type{6.0});
if (row < num_rows - 1) {
Expand Down
25 changes: 25 additions & 0 deletions dpcpp/components/atomic.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,31 @@ __dpct_inline__ T atomic_max(T* __restrict__ addr, T val)
}


template <sycl::access::address_space addressSpace = atomic::global_space,
typename T>
__dpct_inline__ void store(
T* __restrict__ addr, T val,
sycl::memory_order memoryOrder = sycl::memory_order::relaxed)
{
sycl::atomic_ref<T, sycl::memory_order::relaxed,
atomic::memory_scope_v<addressSpace>, addressSpace>
obj(*addr);
obj.store(val, memoryOrder);
}


template <sycl::access::address_space addressSpace = atomic::global_space,
typename T>
__dpct_inline__ T load(T* __restrict__ addr, sycl::memory_order memoryOrder =
sycl::memory_order::relaxed)
{
sycl::atomic_ref<T, sycl::memory_order::relaxed,
atomic::memory_scope_v<addressSpace>, addressSpace>
obj(*addr);
return obj.load(memoryOrder);
}


} // namespace dpcpp
} // namespace kernels
} // namespace gko
Expand Down
30 changes: 30 additions & 0 deletions dpcpp/multigrid/pgm_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <ginkgo/core/base/math.hpp>

#include "dpcpp/base/onedpl.hpp"
#include "dpcpp/components/atomic.dp.hpp"


namespace gko {
Expand All @@ -25,6 +26,35 @@ namespace dpcpp {
namespace pgm {


template <typename IndexType>
void match_edge(std::shared_ptr<const DefaultExecutor> exec,
const array<IndexType>& strongest_neighbor,
array<IndexType>& agg)
{
exec->get_queue()->submit([size = agg.get_size(), agg = agg.get_data(),
strongest_neighbor =
strongest_neighbor.get_const_data()](
sycl::handler& cgh) {
cgh.parallel_for(
sycl::range<1>{static_cast<std::size_t>(size)},
[=](sycl::id<1> idx_id) {
auto tidx = static_cast<IndexType>(idx_id[0]);
if (load(agg + tidx, sycl::memory_order_relaxed) != -1) {
return;
}
auto neighbor = strongest_neighbor[tidx];
if (neighbor != -1 && strongest_neighbor[neighbor] == tidx &&
tidx <= neighbor) {
store(agg + tidx, tidx, sycl::memory_order_relaxed);
store(agg + neighbor, tidx, sycl::memory_order_relaxed);
}
});
});
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PGM_MATCH_EDGE_KERNEL);


template <typename IndexType>
void sort_agg(std::shared_ptr<const DefaultExecutor> exec, IndexType num,
IndexType* row_idxs, IndexType* col_idxs)
Expand Down
96 changes: 95 additions & 1 deletion omp/components/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ inline ResultType copy_cast(const ValueType& val)


template <>
void atomic_add(half& out, half val)
inline void atomic_add(half& out, half val)
{
#ifdef __NVCOMPILER
// NVC++ uses atomic capture on uint16 leads the following error.
Expand Down Expand Up @@ -85,6 +85,100 @@ void atomic_add(half& out, half val)
}


// There is an error in Clang 17 which prevents us from merging the
// implementation of double and float. The compiler will throw an error if the
// templated version is implemented. GCC doesn't throw an error.
inline void store(double* addr, double val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(float* addr, float val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(int32* addr, int32 val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(int64* addr, int64 val)
{
#pragma omp atomic write
*addr = val;
}

inline void store(half* addr, half val)
{
auto uint_addr = copy_cast<uint16_t*>(addr);
auto uint_val = copy_cast<uint16_t>(val);
#pragma omp atomic write
*uint_addr = uint_val;
}

template <typename T>
inline void store(std::complex<T>* addr, std::complex<T> val)
{
auto values = reinterpret_cast<T*>(addr);
store(values + 0, real(val));
store(values + 1, imag(val));
}


// Same issue as with the store_helper
inline float load(float* addr)
{
float val;
#pragma omp atomic read
val = *addr;
return val;
}

inline double load(double* addr)
{
double val;
#pragma omp atomic read
val = *addr;
return val;
}

inline int32 load(int32* addr)
{
float val;
#pragma omp atomic read
val = *addr;
return val;
}

inline int64 load(int64* addr)
{
float val;
#pragma omp atomic read
val = *addr;
return val;
}

inline half load(half* addr)
{
uint16_t uint_val;
auto uint_addr = copy_cast<uint16_t*>(addr);
#pragma omp atomic read
uint_val = *uint_addr;
return copy_cast<half>(uint_val);
}

template <typename T>
inline std::complex<T> load(std::complex<T>* addr)
{
auto values = reinterpret_cast<T*>(addr);
return {load(values + 0), load(values + 1)};
}


} // namespace omp
} // namespace kernels
} // namespace gko
Expand Down
8 changes: 5 additions & 3 deletions omp/factorization/par_ic_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <ginkgo/core/matrix/csr.hpp>

#include "core/base/utils.hpp"
#include "omp/components/atomic.hpp"


namespace gko {
Expand Down Expand Up @@ -76,7 +77,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
auto l_col = l_col_idxs[l_begin];
auto lh_row = l_col_idxs[lh_begin];
if (l_col == lh_row && l_col < col) {
sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
sum += load(l_vals + l_begin) *
conj(load(l_vals + lh_begin));
}
l_begin += (l_col <= lh_row);
lh_begin += (lh_row <= l_col);
Expand All @@ -85,11 +87,11 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
if (row == col) {
new_val = sqrt(new_val);
} else {
auto diag = l_vals[l_row_ptrs[col + 1] - 1];
auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
new_val = new_val / diag;
}
if (is_finite(new_val)) {
l_vals[l_nz] = new_val;
store(l_vals + l_nz, new_val);
}
}
}
Expand Down
8 changes: 5 additions & 3 deletions omp/factorization/par_ict_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "core/base/utils.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/csr_builder.hpp"
#include "omp/components/atomic.hpp"
#include "omp/components/csr_spgeam.hpp"


Expand Down Expand Up @@ -69,7 +70,8 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
auto l_col = l_col_idxs[l_begin];
auto lh_row = l_col_idxs[lh_begin];
if (l_col == lh_row && l_col < col) {
sum += l_vals[l_begin] * conj(l_vals[lh_begin]);
sum +=
load(l_vals + l_begin) * conj(load(l_vals + lh_begin));
}
if (lh_row == row) {
lh_nz = lh_begin;
Expand All @@ -81,11 +83,11 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
if (row == col) {
new_val = sqrt(new_val);
} else {
auto diag = l_vals[l_row_ptrs[col + 1] - 1];
auto diag = load(l_vals + l_row_ptrs[col + 1] - 1);
new_val = new_val / diag;
}
if (is_finite(new_val)) {
l_vals[l_nz] = new_val;
store(l_vals + l_nz, new_val);
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions omp/factorization/par_ilu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
#include <ginkgo/core/matrix/coo.hpp>
#include <ginkgo/core/matrix/csr.hpp>

#include "omp/components/atomic.hpp"


namespace gko {
namespace kernels {
Expand Down Expand Up @@ -57,7 +59,8 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
auto col_l = col_idxs_l[row_l];
auto col_u = col_idxs_u[row_u];
if (col_l == col_u) {
last_operation = vals_l[row_l] * vals_u[row_u];
last_operation =
load(vals_l + row_l) * load(vals_u + row_u);
sum -= last_operation;
} else {
last_operation = zero<ValueType>();
Expand All @@ -74,14 +77,14 @@ void compute_l_u_factors(std::shared_ptr<const OmpExecutor> exec,
sum += last_operation; // undo the last operation

if (row > col) { // modify entry in L
auto to_write = sum / vals_u[row_ptrs_u[col + 1] - 1];
auto to_write = sum / load(vals_u + row_ptrs_u[col + 1] - 1);
if (is_finite(to_write)) {
vals_l[row_l - 1] = to_write;
store(vals_l + row_l - 1, to_write);
}
} else { // modify entry in U
auto to_write = sum;
if (is_finite(to_write)) {
vals_u[row_u - 1] = to_write;
store(vals_u + row_u - 1, to_write);
}
}
}
Expand Down
Loading

0 comments on commit 059823f

Please sign in to comment.