From 4ed36a8abeb723a296688e9cce07f276f2b16e95 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 29 Nov 2022 08:09:00 -0500 Subject: [PATCH 01/62] only can compile cuda/omp --- .../base/device_matrix_data_kernels.cpp | 13 +- common/cuda_hip/components/reduction.hpp | 2 +- .../jacobi_generate_kernels.instantiate.cpp | 2 +- common/unified/multigrid/pgm_kernels.cpp | 2 + core/base/extended_float.hpp | 210 ++++++++++++++++-- core/preconditioner/jacobi_utils.hpp | 12 +- cuda/base/types.hpp | 47 +++- cuda/solver/common_trs_kernels.cuh | 4 +- include/ginkgo/core/base/math.hpp | 56 ++++- include/ginkgo/core/base/types.hpp | 50 ++++- include/ginkgo/core/matrix/dense.hpp | 1 + omp/components/atomic.hpp | 30 +++ reference/matrix/diagonal_kernels.cpp | 1 + 13 files changed, 387 insertions(+), 43 deletions(-) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp index c5742653a93..c8dabf63660 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.cpp +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -31,9 +31,13 @@ void remove_zeros(std::shared_ptr exec, auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_size(); // count nonzeros - auto nnz = thrust::count_if( - thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { return is_nonzero(value); }); + // __half != is only device, can not call __device__ from a __host__ + // __device__ (is_nonzero) + auto nnz = + thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { + return value != zero(value); + }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -49,7 +53,8 @@ void remove_zeros(std::shared_ptr exec, as_device_type(new_values.get_data()))); thrust::copy_if(thrust_policy(exec), it, it + size, out_it, [] __device__(tuple_type entry) { - return is_nonzero(thrust::get<2>(entry)); + return thrust::get<2>(entry) != 
+ zero(thrust::get<2>(entry)); }); // swap out storage values = std::move(new_values); diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index 7c66befa6bd..b2f74fd8598 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -73,7 +73,7 @@ __device__ __forceinline__ int choose_pivot(const Group& group, bool is_pivoted) { using real = remove_complex; - real lmag = is_pivoted ? -one() : abs(local_data); + real lmag = real(is_pivoted ? -one() : abs(local_data)); const auto pivot = reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index d004309c622..ca0c480c08e 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -68,7 +68,7 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } } - return succeeded && block_cond >= 1.0 && + return succeeded && block_cond >= remove_complex{1.0} && block_cond * static_cast>( float_traits>::eps) < remove_complex{1e-3}; diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index 9ba144cba2e..af6d8f198d8 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -235,6 +235,7 @@ void assign_to_exist_agg(std::shared_ptr exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_const_val, auto agg_val) { + using value_type = device_type; if (agg_val[row] != -1) { return; } @@ -273,6 +274,7 @@ void assign_to_exist_agg(std::shared_ptr exec, exec, [] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals, auto diag, auto agg_val) { + using value_type = device_type; if 
(agg_val[row] != -1) { return; } diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index c14b5d1bd39..86620c9c01d 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -71,6 +71,17 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +#endif + template <> struct basic_float_traits { using type = float32; @@ -280,22 +291,30 @@ struct precision_converter { */ class half { public: - half() noexcept = default; + GKO_ATTRIBUTES half() noexcept = default; + + GKO_ATTRIBUTES half& operator=(const half& val) = default; + GKO_ATTRIBUTES half(const half& val) = default; + // GKO_ATTRIBUTES half(half const&) = default; + // complex() = default; - GKO_ATTRIBUTES half(float32 val) noexcept + // complex(const complex& z) = default; + + explicit GKO_ATTRIBUTES half(float32 val) noexcept { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + this->float2half(val); } - GKO_ATTRIBUTES half(float64 val) noexcept : half(static_cast(val)) + explicit GKO_ATTRIBUTES half(float64 val) noexcept + : half(static_cast(val)) {} - GKO_ATTRIBUTES operator float32() const noexcept + explicit GKO_ATTRIBUTES half(int val) noexcept + : half(static_cast(val)) { + + } + + GKO_ATTRIBUTES operator float() const noexcept { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) return __half2float(reinterpret_cast(data_)); @@ -305,23 +324,159 @@ 
class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - GKO_ATTRIBUTES operator float64() const noexcept +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// GKO_ATTRIBUTES operator __half() noexcept +// { +// return reinterpret_cast(*this); +// } +// #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + + + GKO_ATTRIBUTES half& operator+=(const float& rhs) { - return static_cast(static_cast(*this)); + auto val = *this + rhs; + this->float2half(val); + return *this; } - GKO_ATTRIBUTES half operator-() const noexcept + GKO_ATTRIBUTES half& operator/=(const float& rhs) { - auto res = *this; - // flip sign bit - res.data_ ^= f16_traits::sign_mask; - return res; + auto val = *this / rhs; + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator*=(const float& rhs) + { + auto val = *this * rhs; + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator-=(const float& rhs) + { + auto val = *this - rhs; + this->float2half(val); + return *this; + } + + // half& operator+=(const half& rhs) + // { + // auto val = *this + float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator/=(const half& rhs) + // { + // auto val = *this / float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator*=(const half& rhs) + // { + // auto val = *this * float(rhs); + // this->float2half(val); + // return *this; + // } + + // half& operator-=(const half& rhs) + // { + // auto val = *this - float(rhs); + // this->float2half(val); + // return *this; + // } + + GKO_ATTRIBUTES friend half operator+(half lhs, const half& rhs) + { + float flhs = lhs; + flhs += rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator-(half lhs, const half& rhs) + { + float flhs = lhs; + flhs -= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator*(half lhs, const half& rhs) 
+ { + float flhs = lhs; + flhs *= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator/(half lhs, const half& rhs) + { + float flhs = lhs; + flhs /= rhs; // reuse compound assignment + return half(flhs); + } + + + GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) + { + float flhs = lhs; + flhs += rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) + { + float flhs = lhs; + flhs -= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) + { + float flhs = lhs; + flhs *= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + { + float flhs = lhs; + flhs /= rhs; // reuse compound assignment + return half(flhs); + } + + GKO_ATTRIBUTES half& operator=(int val) + { + this->float2half(float(val)); + return *this; + } + + GKO_ATTRIBUTES half& operator=(float val) + { + this->float2half(val); + return *this; + } + + GKO_ATTRIBUTES half& operator=(double val) + { + this->float2half(static_cast(val)); + return *this; } private: using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; + GKO_ATTRIBUTES void float2half(float val) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto tmp = __float2half_rn(val); + data_ = reinterpret_cast(tmp); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + data_ = float2half(reinterpret_cast(val)); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + static uint16 float2half(uint32 data_) noexcept { using conv = detail::precision_converter; @@ -463,9 +618,14 @@ class complex { public: using value_type = gko::half; - complex(const value_type& real = 0.f, const value_type& imag = 0.f) + complex(const value_type& real = value_type(0.f), + const value_type& imag = 
value_type(0.f)) : real_(real), imag_(imag) {} + template + explicit complex(const T& real, const U& imag) + : complex(static_cast(real), static_cast(imag)) + {} template explicit complex(const complex& other) @@ -544,6 +704,20 @@ struct numeric_limits { { return gko::detail::float_traits::eps; } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } }; } // namespace std diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index e159fd15776..b0aa8b5f38a 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -115,21 +115,23 @@ GKO_ATTRIBUTES GKO_INLINE uint32 get_supported_storage_reductions( auto supported = static_cast(prd::p0n0); // the following code uses short-circuiting to avoid calling possibly // expensive verificatiors multiple times - if (accurate(float_traits>>::eps)) { + if (accurate(type(float_traits>>::eps))) { supported |= prd::p2n0; } - if (accurate(float_traits>>::eps) && + if (accurate( + type(float_traits>>::eps)) && (is_verified1 = verificator1())) { supported |= prd::p1n1; } - if (accurate(float_traits>>::eps) && + if (accurate(type( + float_traits>>::eps)) && is_verified1 != 0 && verificator2()) { supported |= prd::p0n2; } - if (accurate(float_traits>::eps)) { + if (accurate(type(float_traits>::eps))) { supported |= prd::p1n0; } - if (accurate(float_traits>::eps) && + if (accurate(type(float_traits>::eps)) && (is_verified1 == 1 || (is_verified1 == 2 && (is_verified1 = verificator1())))) { supported |= prd::p0n1; diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 7252f7d673d..8c4f0a93d0c 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -18,16 +18,33 @@ #include +namespace std { + +template <> +struct 
is_scalar<__half> : std::true_type {}; + +} // namespace std + namespace gko { +#if defined(__CUDA_ARCH__) +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return is_nan(float(val)); +} +#endif namespace kernels { namespace cuda { +#if defined(__CUDA_ARCH__) +// template <> +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } - +__device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } +#endif namespace detail { - /** * @internal * @@ -124,6 +141,17 @@ struct culibs_type_impl> { using type = cuDoubleComplex; }; + +template <> +struct culibs_type_impl { + using type = __half; +}; + +template <> +struct culibs_type_impl> { + using type = __half2; +}; + template struct culibs_type_impl> { using type = typename culibs_type_impl>::type; @@ -154,6 +182,11 @@ struct cuda_type_impl { using type = volatile typename cuda_type_impl::type; }; +template <> +struct cuda_type_impl { + using type = __half; +}; + template struct cuda_type_impl> { using type = thrust::complex; @@ -169,6 +202,11 @@ struct cuda_type_impl { using type = thrust::complex; }; +template <> +struct cuda_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + template struct cuda_struct_member_type_impl { using type = T; @@ -179,6 +217,11 @@ struct cuda_struct_member_type_impl> { using type = fake_complex; }; +template <> +struct cuda_struct_member_type_impl { + using type = __half; +}; + template struct cuda_type_impl> { using type = matrix_data_entry< diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 291c842325f..686d8026a64 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -503,7 +503,7 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? 
-1 : 1; - ValueType sum = 0.0; + ValueType sum = ValueType{0.0}; auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { @@ -557,7 +557,7 @@ void sptrsv_naive_caching(std::shared_ptr exec, const auto nrhs = b->get_size()[1]; // Initialize x to all NaNs. - dense::fill(exec, x, nan()); + dense::fill(exec, x, ValueType(nan())); array nan_produced(exec, 1); array atomic_counter(exec, 1); diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index cd5e489b95d..fbefbbe20b6 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -33,6 +33,13 @@ using std::abs; using std::sqrt; +inline half abs(half a) { return half((a > 0) ? a : -a); } +inline half abs(std::complex a) +{ + return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} +inline half sqrt(half a) { return half(sqrt(float(a))); } + } // namespace reference } // namespace kernels @@ -47,6 +54,14 @@ using std::abs; using std::sqrt; +inline half abs(half a) { return half((a > 0) ? a : -a); } +inline half abs(std::complex a) +{ + return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} +inline half sqrt(half a) { return half(sqrt(float(a))); } + + } // namespace omp } // namespace kernels @@ -360,6 +375,11 @@ namespace detail { template struct next_precision_impl {}; +template <> +struct next_precision_impl { + using type = float; +}; + template <> struct next_precision_impl { using type = double; @@ -367,7 +387,7 @@ struct next_precision_impl { template <> struct next_precision_impl { - using type = float; + using type = half; }; template @@ -418,11 +438,22 @@ struct increase_precision_impl { }; +template +struct arth_type { + using type = T; +}; + +template <> +struct arth_type { + using type = float; +}; + template struct infinity_impl { // CUDA doesn't allow us to call std::numeric_limits functions // so we need to store the value instead. 
- static constexpr auto value = std::numeric_limits::infinity(); + static constexpr auto value = + std::numeric_limits::type>::infinity(); }; @@ -623,7 +654,7 @@ GKO_INLINE constexpr int64 ceildiv(int64 num, int64 den) template GKO_INLINE constexpr T zero() { - return T{}; + return T(0.0); } @@ -651,7 +682,7 @@ GKO_INLINE constexpr T zero(const T&) template GKO_INLINE constexpr T one() { - return T(1); + return T(1.0); } @@ -841,7 +872,7 @@ template GKO_ATTRIBUTES GKO_INLINE constexpr std::enable_if_t::value, T> imag_impl(const T&) { - return T{}; + return T(0.0); } template @@ -945,7 +976,7 @@ template GKO_INLINE constexpr std::enable_if_t::value, T> abs( const T& x) { - return x >= zero() ? x : -x; + return x >= zero() ? x : static_cast(-x); } @@ -1039,7 +1070,8 @@ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_finite(const T& value) { - constexpr T infinity{detail::infinity_impl::value}; + constexpr typename detail::arth_type::type infinity{ + detail::infinity_impl::value}; return abs(value) < infinity; } @@ -1130,7 +1162,15 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( * @return NaN. 
*/ template -GKO_INLINE constexpr std::enable_if_t::value, T> nan() +GKO_INLINE constexpr std::enable_if_t< + !is_complex_s::value && !std::is_same::value, T> +nan() +{ + return std::numeric_limits::quiet_NaN(); +} + +template +GKO_INLINE constexpr std::enable_if_t::value, float> nan() { return std::numeric_limits::quiet_NaN(); } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index e375da15f9c..56a8604632e 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -410,11 +410,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template _macro(double) #endif @@ -431,12 +433,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + template _macro(std::complex); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + template _macro(std::complex); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -454,21 +458,27 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, 
std::complex) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -498,16 +508,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(half, int32); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ + template _macro(half, int64); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(half, int32); \ template _macro(float, int32); \ template _macro(double, int32); \ + template _macro(half, int64); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -540,17 +554,21 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ 
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ + template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -568,6 +586,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + template _macro(half, int32, int32); \ + template _macro(half, int32, int64); \ + template _macro(half, int64, int64); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -580,6 +601,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ + template _macro(half, int32, int32); \ + template _macro(half, int32, int64); \ + template _macro(half, int64, int64); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -601,6 +625,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -614,6 +641,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + template 
_macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -656,8 +686,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ template _macro(float, double); \ template _macro(double, float); \ + template _macro(half, double); \ + template _macro(double, half); \ + template _macro(float, half); \ + template _macro(half, float); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex) /** @@ -671,8 +709,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -687,10 +727,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. 
*/ #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ + template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -710,9 +753,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(int64, int64); \ template _macro(unsigned int, unsigned int); \ template _macro(unsigned long, unsigned long); \ + template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -725,6 +770,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. 
*/ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ + template _macro(half); \ template _macro(float); \ template _macro(double); \ template _macro(std::complex); \ @@ -803,5 +849,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko - +#include "core/base/extended_float.hpp" #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index d7e9b1a10e0..0f8bc2235a0 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -80,6 +80,7 @@ template class Dense : public EnableLinOp>, public ConvertibleTo>>, + public ConvertibleTo>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index c3580cd36bb..44cef8a6e1a 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -39,6 +39,36 @@ void atomic_add(ValueType& out, ValueType val) } +template +inline ResultType reinterpret(ValueType val) +{ + static_assert(sizeof(ValueType) == sizeof(ResultType), + "The type to reinterpret to must be of the same size as the " + "original type."); + return reinterpret_cast(val); +} + + +template <> +void atomic_add(half& out, half val) +{ + // UB? + uint16_t* address_as_converter = reinterpret_cast(&out); + uint16_t old = *address_as_converter; + uint16_t assumed; + do { + assumed = old; + auto answer = reinterpret(reinterpret(assumed) + val); +#pragma omp atomic capture +{ + old = *address_as_converter; + *address_as_converter = (old == assumed) ? 
answer : old; +} + } while (assumed != old); + +} // namespace omp + + } // namespace omp } // namespace kernels } // namespace gko diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 028b7685c2b..1193c81ecdb 100644 --- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -6,6 +6,7 @@ #include #include +#include "core/base/extended_float.hpp" namespace gko { From ddc7c16a751def9df8eef71970103abf382fd6dd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 5 Jan 2023 15:35:28 -0600 Subject: [PATCH 02/62] next_precision to itself when complex only float, double add empty conditional --- include/ginkgo/core/base/math.hpp | 16 +++++++++++++--- include/ginkgo/core/base/types.hpp | 26 +------------------------- include/ginkgo/core/matrix/dense.hpp | 8 +++++++- 3 files changed, 21 insertions(+), 29 deletions(-) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index fbefbbe20b6..ff9cb72b017 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -390,11 +390,21 @@ struct next_precision_impl { using type = half; }; -template -struct next_precision_impl> { - using type = std::complex::type>; +template <> +struct next_precision_impl> { + using type = std::complex; +}; + +template <> +struct next_precision_impl> { + using type = std::complex; }; +// template +// struct next_precision_impl> { +// using type = std::complex::type>; +// }; + template struct reduce_precision_impl { diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 56a8604632e..1845b565a8e 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -433,14 +433,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template 
_macro(std::complex); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -462,11 +460,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; @@ -475,10 +471,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -554,21 +548,17 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ 
template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ - template _macro(std::complex, int64); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -625,9 +615,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - template _macro(std::complex, int32, int32); \ - template _macro(std::complex, int32, int64); \ - template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -641,9 +628,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - template _macro(std::complex, int32, int32); \ - template _macro(std::complex, int32, int64); \ - template _macro(std::complex, int64, int64); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -691,11 +675,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, half); \ template _macro(half, float); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) + template _macro(std::complex, std::complex) /** @@ -712,7 +692,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, std::complex); \ 
template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -730,10 +709,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(half, half); \ template _macro(float, float); \ template _macro(double, double); \ - template _macro(std::complex, half); \ template _macro(std::complex, float); \ template _macro(std::complex, double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -757,7 +734,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ - template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 0f8bc2235a0..45390ee7316 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -61,6 +61,8 @@ template class SparsityCsr; +class Empty {}; + /** * Dense is a matrix format which explicitly stores all values of the matrix. * @@ -80,7 +82,11 @@ template class Dense : public EnableLinOp>, public ConvertibleTo>>, - public ConvertibleTo>>>, + public std::conditional< + std::is_same>, + ValueType>::value, + Empty, + ConvertibleTo>>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, From 501e6c7e922a514026a27d5b2f1c08c3fb98b89f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Sat, 7 Jan 2023 21:14:29 -0600 Subject: [PATCH 03/62] can compile with cuda/omp/ref (without test) --- accessor/cuda_helper.hpp | 7 +- .../matrix/csr_kernels.instantiate.cpp | 24 +++ common/cuda_hip/solver/cb_gmres_kernels.cpp | 3 +- common/cuda_hip/solver/idr_kernels.cpp | 8 +- .../unified/components/fill_array_kernels.cpp | 2 +- core/base/extended_float.hpp | 147 ++++++++++++++---- core/base/mixed_precision_types.hpp | 93 ++++++++++- core/base/mtx_io.cpp | 17 +- core/matrix/coo.cpp | 19 +++ core/matrix/csr.cpp | 20 +++ core/matrix/dense.cpp | 26 +++- core/matrix/diagonal.cpp | 16 ++ core/matrix/ell.cpp | 20 +++ core/matrix/fbcsr.cpp | 21 +++ core/matrix/hybrid.cpp | 21 +++ core/matrix/row_gatherer.cpp | 10 +- core/matrix/sellp.cpp | 21 +++ core/multigrid/pgm.cpp | 2 +- core/preconditioner/jacobi.cpp | 2 +- core/solver/cb_gmres.cpp | 4 +- core/solver/multigrid.cpp | 14 +- core/stop/residual_norm.cpp | 4 +- cuda/CMakeLists.txt | 1 + cuda/base/types.hpp | 2 +- cuda/matrix/fft_kernels.cu | 6 +- dpcpp/solver/cb_gmres_kernels.dp.cpp | 3 +- include/ginkgo/core/base/math.hpp | 27 ++-- include/ginkgo/core/base/matrix_data.hpp | 2 +- .../ginkgo/core/base/precision_dispatch.hpp | 23 ++- include/ginkgo/core/base/types.hpp | 91 +++++++---- include/ginkgo/core/matrix/coo.hpp | 10 +- include/ginkgo/core/matrix/csr.hpp | 13 +- include/ginkgo/core/matrix/dense.hpp | 23 ++- include/ginkgo/core/matrix/diagonal.hpp | 9 +- include/ginkgo/core/matrix/ell.hpp | 9 +- include/ginkgo/core/matrix/fbcsr.hpp | 9 +- include/ginkgo/core/matrix/hybrid.hpp | 10 +- include/ginkgo/core/matrix/sellp.hpp | 9 +- omp/CMakeLists.txt | 1 + omp/components/atomic.hpp | 17 +- omp/matrix/fft_kernels.cpp | 6 +- omp/solver/cb_gmres_kernels.cpp | 2 +- omp/solver/idr_kernels.cpp | 11 +- reference/CMakeLists.txt | 1 + reference/matrix/fft_kernels.cpp | 6 +- reference/solver/cb_gmres_kernels.cpp | 2 +- reference/solver/idr_kernels.cpp | 10 +- 47 files changed, 655 insertions(+), 149 deletions(-) diff 
--git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 31d3599516d..1a5404b1738 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -27,6 +27,11 @@ struct cuda_type { using type = T; }; +template <> +struct cuda_type { + using type = __half; +}; + // Unpack cv and reference / pointer qualifiers template struct cuda_type { @@ -57,7 +62,7 @@ struct cuda_type { // Transform std::complex to thrust::complex template struct cuda_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp index f62ca1c1815..151a7a43ded 100644 --- a/common/cuda_hip/matrix/csr_kernels.instantiate.cpp +++ b/common/cuda_hip/matrix/csr_kernels.instantiate.cpp @@ -34,6 +34,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(GKO_DECLARE_CSR_SPMV_KERNEL, int64); // split @@ -45,6 +51,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(GKO_DECLARE_CSR_SPMV_KERNEL, // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(GKO_DECLARE_CSR_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(GKO_DECLARE_CSR_SPMV_KERNEL, + int64); // split @@ -60,6 +72,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); // split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split 
+GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int32); +// split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split @@ -71,6 +89,12 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3( // split GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4( GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); +// split +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6( + GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL, int64); // split diff --git a/common/cuda_hip/solver/cb_gmres_kernels.cpp b/common/cuda_hip/solver/cb_gmres_kernels.cpp index 02d45a8d31e..eb03fd7d28e 100644 --- a/common/cuda_hip/solver/cb_gmres_kernels.cpp +++ b/common/cuda_hip/solver/cb_gmres_kernels.cpp @@ -633,7 +633,8 @@ void initialize(std::shared_ptr exec, as_device_type(stop_status->get_data())); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp index a0f605134eb..3aef25b0a48 100644 --- a/common/cuda_hip/solver/idr_kernels.cpp +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -383,10 +383,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = randlib::rand_generator(std::random_device{}(), RANDLIB_RNG_PSEUDO_DEFAULT, exec->get_stream()); - randlib::rand_vector( - gen, - subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - 0.0, 1.0, subspace_vectors->get_values()); + // randlib::rand_vector( + // gen, + // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + // 0.0, 1.0, subspace_vectors->get_values()); randlib::destroy(gen); } } diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 
d78a6e9f346..5481be27e32 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -32,7 +32,7 @@ void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { run_kernel( - exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = idx; }, n, + exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = static_cast(idx); }, n, array); } diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 86620c9c01d..bf93a40a6b6 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -310,9 +310,8 @@ class half { {} explicit GKO_ATTRIBUTES half(int val) noexcept - : half(static_cast(val)) { - - } + : half(static_cast(val)) + {} GKO_ATTRIBUTES operator float() const noexcept { @@ -324,12 +323,12 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } -// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -// GKO_ATTRIBUTES operator __half() noexcept -// { -// return reinterpret_cast(*this); -// } -// #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // GKO_ATTRIBUTES operator __half() noexcept + // { + // return reinterpret_cast(*this); + // } + // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) GKO_ATTRIBUTES half& operator+=(const float& rhs) @@ -417,32 +416,38 @@ class half { } - GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) - { - float flhs = lhs; - flhs += rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs += rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) - { - float flhs = lhs; - flhs -= rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator-(half 
lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs -= rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) - { - float flhs = lhs; - flhs *= rhs; // reuse compound assignment - return half(flhs); - } + // GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs *= rhs; // reuse compound assignment + // return half(flhs); + // } - GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + // GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) + // { + // float flhs = lhs; + // flhs /= rhs; // reuse compound assignment + // return half(flhs); + // } + + GKO_ATTRIBUTES half& operator=(long long int val) { - float flhs = lhs; - flhs /= rhs; // reuse compound assignment - return half(flhs); + this->float2half(float(val)); + return *this; } GKO_ATTRIBUTES half& operator=(int val) @@ -463,6 +468,12 @@ class half { return *this; } + GKO_ATTRIBUTES half operator-() const + { + auto val = 0.0f - *this; + return half(val); + } + private: using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; @@ -627,6 +638,10 @@ class complex { : complex(static_cast(real), static_cast(imag)) {} + template + explicit complex(const T& real) : complex(static_cast(real)) + {} + template explicit complex(const complex& other) : complex(static_cast(other.real()), @@ -644,6 +659,76 @@ class complex { static_cast(imag_)); } + complex& operator=(const int& __re) + { + real_ = __re; + imag_ = value_type(); + return *this; + } + + complex& operator=(const value_type& __re) + { + real_ = __re; + imag_ = value_type(); + return *this; + } + complex& operator+=(const value_type& __re) + { + real_ += __re; + return *this; + } + complex& operator-=(const value_type& __re) + { + real_ -= __re; + return *this; + } + complex& operator*=(const value_type& __re) + { + real_ *= __re; + imag_ *= __re; + return *this; + } + complex& 
operator/=(const value_type& __re) + { + real_ /= __re; + imag_ /= __re; + return *this; + } + + template + complex& operator=(const complex<_Xp>& __c) + { + real_ = __c.real(); + imag_ = __c.imag(); + return *this; + } + template + complex& operator+=(const complex<_Xp>& __c) + { + real_ += __c.real(); + imag_ += __c.imag(); + return *this; + } + template + complex& operator-=(const complex<_Xp>& __c) + { + real_ -= __c.real(); + imag_ -= __c.imag(); + return *this; + } + template + complex& operator*=(const complex<_Xp>& __c) + { + *this = *this * complex(__c.real(), __c.imag()); + return *this; + } + template + complex& operator/=(const complex<_Xp>& __c) + { + *this = *this / complex(__c.real(), __c.imag()); + return *this; + } + private: value_type real_; value_type imag_; diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index d9747e5cad8..7a2d2463672 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -12,40 +12,103 @@ #ifdef GINKGO_MIXED_PRECISION + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ + template _macro(float, half, half, __VA_ARGS__); \ + template _macro(float, half, float, __VA_ARGS__); \ + template _macro(float, half, double, __VA_ARGS__); \ + template _macro(float, float, half, __VA_ARGS__); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ + template _macro(float, double, half, __VA_ARGS__); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) 
\ + template _macro(double, half, half, __VA_ARGS__); \ + template _macro(double, half, float, __VA_ARGS__); \ + template _macro(double, half, double, __VA_ARGS__); \ + template _macro(double, float, half, __VA_ARGS__); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ + template _macro(double, double, half, __VA_ARGS__); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ + template _macro(half, half, half, __VA_ARGS__); \ + template _macro(half, half, float, __VA_ARGS__); \ + template _macro(half, half, double, __VA_ARGS__); \ + template _macro(half, float, half, __VA_ARGS__); \ + template _macro(half, float, float, __VA_ARGS__); \ + template _macro(half, float, double, __VA_ARGS__); \ + template _macro(half, double, half, __VA_ARGS__); \ + template _macro(half, double, float, __VA_ARGS__); \ + template _macro(half, double, double, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + #else + #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ template _macro(float, float, float, __VA_ARGS__) @@ -60,6 +123,14 @@ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ + template _macro(half, half, half, __VA_ARGS__) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) + + #endif @@ -67,7 +138,9 @@ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, __VA_ARGS__); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, __VA_ARGS__); \ - GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__) + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, __VA_ARGS__); \ + GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(_macro) \ @@ -77,18 +150,36 @@ #ifdef GINKGO_MIXED_PRECISION #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(half, half, __VA_ARGS__); \ + template _macro(half, float, __VA_ARGS__); \ + template _macro(half, double, __VA_ARGS__); \ + template _macro(float, half, __VA_ARGS__); \ template _macro(float, float, __VA_ARGS__); \ template _macro(float, double, __VA_ARGS__); \ + template _macro(double, half, __VA_ARGS__); \ template _macro(double, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ + template _macro(half, half, __VA_ARGS__); \ template _macro(float, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_CPHF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 33c3b07d487..3ca4f7e9d3a 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -757,19 +757,28 @@ static constexpr uint64 binary_format_magic() { constexpr auto is_int = std::is_same::value; constexpr auto is_long = std::is_same::value; + constexpr auto is_half = std::is_same::value; constexpr auto is_double = std::is_same::value; constexpr auto is_float = std::is_same::value; constexpr auto is_complex_double = std::is_same>::value; constexpr auto is_complex_float = std::is_same>::value; + constexpr auto is_complex_half = + std::is_same>::value; static_assert(is_int || is_long, "invalid storage index type"); - static_assert( - is_double || is_float || is_complex_double || is_complex_float, - "invalid storage value type"); + static_assert(is_half || is_complex_half || is_double || is_float || + is_complex_double || is_complex_float, + "invalid storage value type"); constexpr auto index_bit = is_int ? 'I' : 'L'; constexpr auto value_bit = - is_double ? 'D' : (is_float ? 'S' : (is_complex_double ? 'Z' : 'C')); + is_double + ? 'D' + : (is_float + ? 'S' + : (is_complex_double + ? 'Z' + : (is_complex_float ? 'C' : (is_half ? 
'H' : 'X')))); constexpr uint64 shift = 256; constexpr uint64 type_bits = index_bit * shift + value_bit; return 'G' + diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 1368dc261c3..8834ca60b12 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -231,6 +231,25 @@ void Coo::move_to( } +template +void Coo::convert_to( + Coo>, IndexType>* result) const +{ + result->values_ = this->values_; + result->row_idxs_ = this->row_idxs_; + result->col_idxs_ = this->col_idxs_; + result->set_size(this->get_size()); +} + + +template +void Coo::move_to( + Coo>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Coo::convert_to( Csr* result) const diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index e50732a3be9..0c89394021c 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -322,6 +322,26 @@ void Csr::move_to( } +template +void Csr::convert_to( + Csr>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + convert_strategy_helper(result); +} + + +template +void Csr::move_to( + Csr>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Csr::convert_to( Coo* result) const diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 171ff007b4a..f94547a687a 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -603,6 +603,30 @@ void Dense::move_to(Dense>* result) } +template +void Dense::convert_to( + Dense>>* result) const +{ + if (result->get_size() != this->get_size()) { + result->set_size(this->get_size()); + result->stride_ = stride_; + result->values_.resize_and_reset(result->get_size()[0] * + result->stride_); + } + auto exec = this->get_executor(); + exec->run(dense::make_copy( + this, make_temporary_output_clone(exec, result).get())); +} + + +template +void Dense::move_to( + Dense>>* result) +{ + this->convert_to(result); +} + + template template void 
Dense::convert_impl(Coo* result) const @@ -1519,7 +1543,7 @@ template void gather_mixed_real_complex(Function fn, LinOp* out) { #ifdef GINKGO_MIXED_PRECISION - run>(out, fn); + run, next_precision>>(out, fn); #else precision_dispatch(fn, out); #endif diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 1a442ffc789..921087c1a96 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -162,6 +162,22 @@ void Diagonal::move_to(Diagonal>* result) this->convert_to(result); } +template +void Diagonal::convert_to( + Diagonal>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Diagonal::move_to( + Diagonal>>* result) +{ + this->convert_to(result); +} + template void Diagonal::convert_to(Csr* result) const diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 87b74c7f417..b8bdf4b8e4a 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -172,6 +172,26 @@ void Ell::move_to( } +template +void Ell::convert_to( + Ell>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_stored_elements_per_row_ = this->num_stored_elements_per_row_; + result->stride_ = this->stride_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Ell::convert_to(Dense* result) const { diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index a48e32be088..e6c00a93180 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -167,6 +167,27 @@ void Fbcsr::move_to( } +template +void Fbcsr::convert_to( + Fbcsr>, IndexType>* const result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); + // block sizes are immutable except for assignment/conversion + result->bs_ = this->bs_; +} + + +template +void Fbcsr::move_to( + 
Fbcsr>, IndexType>* const result) +{ + this->convert_to(result); +} + + template void Fbcsr::convert_to( Dense* const result) const diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index d450a0dfc35..56dc7dd290b 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -222,6 +222,27 @@ void Hybrid::move_to( } +template +void Hybrid::convert_to( + Hybrid>, IndexType>* result) const +{ + this->ell_->convert_to(result->ell_.get()); + this->coo_->convert_to(result->coo_.get()); + // TODO set strategy correctly + // There is no way to correctly clone the strategy like in + // Csr::convert_to + result->set_size(this->get_size()); +} + + +template +void Hybrid::move_to( + Hybrid>, IndexType>* result) +{ + this->convert_to(result); +} + + template void Hybrid::convert_to(Dense* result) const { diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index fecc60a0ca9..836855b89a9 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -64,7 +64,8 @@ RowGatherer::create_const( template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run, std::complex>( + run, + std::complex, std::complex>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -72,9 +73,10 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run, std::complex>( - in, - [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); + run, + std::complex, std::complex>(in, [&](auto gather) { + gather->row_gather(alpha, &row_idxs_, beta, out); + }); } diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index a4787e758bf..bbbabe6c36b 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -195,6 +195,27 @@ void Sellp::move_to( this->convert_to(result); } +template +void Sellp::convert_to( + Sellp>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->slice_lengths_ 
= this->slice_lengths_; + result->slice_sets_ = this->slice_sets_; + result->slice_size_ = this->slice_size_; + result->stride_factor_ = this->stride_factor_; + result->set_size(this->get_size()); +} + + +template +void Sellp::move_to( + Sellp>, IndexType>* result) +{ + this->convert_to(result); +} + template void Sellp::convert_to(Dense* result) const diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 9f1f5b50ba6..16cacb1fa09 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -189,7 +189,7 @@ Pgm::generate_local( auto abs_mtx = local_matrix->compute_absolute(); // abs_mtx is already real valuetype, so transpose is enough auto weight_mtx = gko::as(abs_mtx->transpose()); - auto half_scalar = initialize>({0.5}, exec); + auto half_scalar = initialize>({half(0.5)}, exec); auto identity = matrix::Identity::create(exec, num_rows); // W = (abs_mtx + transpose(abs_mtx))/2 abs_mtx->apply(half_scalar, identity, half_scalar, weight_mtx); diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index f6d5b042a23..f3fbf799f59 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -330,7 +330,7 @@ void Jacobi::generate(const LinOp* system_matrix, ->extract_diagonal_linop()); auto diag_vt = ::gko::detail::temporary_conversion>:: - template create>>( + template create>>( diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index 274948531ab..4ba329d7252 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -518,8 +518,8 @@ void CbGmres::apply_impl(const LinOp* alpha, const LinOp* b, #define GKO_DECLARE_CB_GMRES(_type1) class CbGmres<_type1> #define GKO_DECLARE_CB_GMRES_TRAITS(_type1) \ struct workspace_traits> -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_TRAITS); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES); 
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_TRAITS); } // namespace solver diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 35ad7b5d1fe..967a861e339 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -316,7 +316,7 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -454,7 +454,7 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, std::complex>( mg_level, [&, this](auto mg_level) { #if GINKGO_BUILD_MPI @@ -703,7 +703,7 @@ void Multigrid::generate() break; } - run, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -741,7 +741,7 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -858,7 +858,7 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, b, x); } @@ -897,7 +897,7 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -962,7 +962,7 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, std::complex>(first_mg_level, lambda, b, x); } diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index 4e73cc8d56a..672d273db65 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -97,8 +97,8 @@ 
ResidualNormBase::ResidualNormBase( baseline_{baseline}, system_matrix_{args.system_matrix}, b_{args.b}, - one_{gko::initialize({1}, exec)}, - neg_one_{gko::initialize({-1}, exec)}, + one_{gko::initialize({one()}, exec)}, + neg_one_{gko::initialize({-one()}, exec)}, reduction_tmp_{exec} { switch (baseline_) { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 000cb7b215f..fcf9ac4b885 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -61,6 +61,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") endif() endif() +target_compile_definitions(ginkgo_cuda PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 8c4f0a93d0c..99cb1835b7c 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -189,7 +189,7 @@ struct cuda_type_impl { template struct cuda_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu index 80e938fbbff..ba84c8a8d3c 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -120,7 +120,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -136,7 +136,7 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -152,7 +152,7 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); 
+GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 7ab010ba29f..8747dcb60a7 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -946,7 +946,8 @@ void initialize(std::shared_ptr exec, stop_status->get_data()); } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index ff9cb72b017..644ea72aaad 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -40,6 +40,11 @@ inline half abs(std::complex a) } inline half sqrt(half a) { return half(sqrt(float(a))); } +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + } // namespace reference } // namespace kernels @@ -61,6 +66,11 @@ inline half abs(std::complex a) } inline half sqrt(half a) { return half(sqrt(float(a))); } +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + } // namespace omp } // namespace kernels @@ -390,21 +400,12 @@ struct next_precision_impl { using type = half; }; -template <> -struct next_precision_impl> { - using type = std::complex; -}; -template <> -struct next_precision_impl> { - using type = std::complex; +template +struct next_precision_impl> { + using type = std::complex::type>; }; -// template -// struct next_precision_impl> { -// using type = std::complex::type>; -// }; - template struct reduce_precision_impl { @@ -509,7 +510,7 @@ using next_precision = typename detail::next_precision_impl::type; * next_precision. 
*/ template -using previous_precision = next_precision; +using previous_precision = next_precision>; /** diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 0edb39a9c6d..218c79a6fea 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -38,7 +38,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return dist(gen); + return ValueType(dist(gen)); } diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index d3b52949b76..7697d014c61 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -54,7 +54,13 @@ make_temporary_conversion(Ptr&& matrix) auto result = detail::temporary_conversion< MaybeConstDense>::template create(matrix); if (!result) { - GKO_NOT_SUPPORTED(*matrix); + result = detail::temporary_conversion>:: + template create< + matrix::Dense>>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } @@ -227,11 +233,14 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) #ifdef GINKGO_MIXED_PRECISION using fst_type = matrix::Dense; using snd_type = matrix::Dense>; + using trd_type = matrix::Dense>>; if (auto dense_in = dynamic_cast(in)) { if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } @@ -240,6 +249,18 @@ void mixed_precision_dispatch(Function fn, const LinOp* in, LinOp* out) fn(dense_in, dense_out); } else if (auto dense_out = dynamic_cast(out)) { fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else { + GKO_NOT_SUPPORTED(out); + } + } else if (auto dense_in = 
dynamic_cast(in)) { + if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); + } else if (auto dense_out = dynamic_cast(out)) { + fn(dense_in, dense_out); } else { GKO_NOT_SUPPORTED(out); } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 1845b565a8e..0c81361080a 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -400,6 +400,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +#if GINKGO_COMPILE_KERNEL +#define GKO_ADAPT_CPHF(_macro) \ + template <> \ + _macro GKO_NOT_IMPLEMENTED +#else +#define GKO_ADAPT_CPHF(_macro) template _macro +#endif + + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -421,6 +430,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(double) #endif +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro) \ + template _macro(float); \ + template _macro(double) + /** * Instantiates a template for each value type compiled by Ginkgo. @@ -439,10 +452,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_CPHF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(_macro); \ + template _macro(std::complex); \ + template _macro(std::complex) + /** * Instantiates a template for each value and scalar type compiled by Ginkgo. 
@@ -467,13 +486,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_CPHF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -557,8 +578,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ + GKO_ADAPT_CPHF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -628,6 +651,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_CPHF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_CPHF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -667,14 +693,18 @@ 
GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(half, double); \ - template _macro(double, half); \ - template _macro(float, half); \ - template _macro(half, float); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + template _macro(half, double); \ + template _macro(double, half); \ + template _macro(float, half); \ + template _macro(half, float); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -687,12 +717,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -705,13 +736,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + template _macro(half, half); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -734,6 +767,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ + GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -749,6 +783,7 @@ GKO_ATTRIBUTES constexpr bool 
operator!=(precision_reduction x, template _macro(half); \ template _macro(float); \ template _macro(double); \ + GKO_ADAPT_CPHF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 9373107df69..8d8797ef9ed 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -49,6 +49,7 @@ class Hybrid; template class Coo : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -80,13 +81,20 @@ class Coo : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Coo, IndexType>; + friend class Coo, IndexType>; + + friend class Coo>, IndexType>; void convert_to( Coo, IndexType>* result) const override; void move_to(Coo, IndexType>* result) override; + void convert_to( + Coo>, IndexType>* result) const override; + + void move_to(Coo>, IndexType>* result) override; + void convert_to(Csr* other) const override; void move_to(Csr* other) override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index f27fe12a934..bd8c8c0f21b 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -100,6 +100,8 @@ void strategy_rebuild_helper(Csr* result); template class Csr : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo< + Csr>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -688,13 +690,22 @@ class Csr : public EnableLinOp>, index_type max_length_per_row_; }; - friend class Csr, IndexType>; + friend class Csr, IndexType>; + + friend class Csr>, + IndexType>; void convert_to( Csr, IndexType>* result) const override; void move_to(Csr, IndexType>* result) override; + void convert_to(Csr>, 
IndexType>* + result) const override; + + void move_to(Csr>, IndexType>* + result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 45390ee7316..efe69328f08 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -63,6 +63,15 @@ class SparsityCsr; class Empty {}; +template +using next2_type = next_precision>; + + +// template +// using conditional_type = typename std::conditional< +// std::is_same>::value, Empty, +// Dense>>::type; + /** * Dense is a matrix format which explicitly stores all values of the matrix. * @@ -82,11 +91,7 @@ template class Dense : public EnableLinOp>, public ConvertibleTo>>, - public std::conditional< - std::is_same>, - ValueType>::value, - Empty, - ConvertibleTo>>>>, + public ConvertibleTo>>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -276,6 +281,14 @@ class Dense friend class Dense>; + friend class Dense>>; + + void convert_to(Dense>>* result) + const override; + + void move_to( + Dense>>* result) override; + void convert_to(Dense>* result) const override; void move_to(Dense>* result) override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 56906a4d96f..a3c92aca6b0 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -42,6 +42,7 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, + public ConvertibleTo>>>, public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -71,7 +72,9 @@ class Diagonal using device_mat_data32 = device_matrix_data; using absolute_type = remove_complex; - friend class Diagonal>; + friend class Diagonal>; + + friend class Diagonal>>; std::unique_ptr transpose() const override; @@ -81,6 +84,10 @@ class Diagonal void move_to(Diagonal>* result) 
override; + void convert_to(Diagonal>>* result) const override; + + void move_to(Diagonal>>* result) override; + void convert_to(Csr* result) const override; void move_to(Csr* result) override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 37f4c0e7f55..a2b13f0b8e3 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -51,6 +51,7 @@ class Hybrid; template class Ell : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -63,7 +64,8 @@ class Ell : public EnableLinOp>, friend class Coo; friend class Csr; friend class Ell, IndexType>; - friend class Ell, IndexType>; + friend class Ell, IndexType>; + friend class Ell>, IndexType>; friend class Hybrid; public: @@ -88,6 +90,11 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; + void convert_to( + Ell>, IndexType>* result) const override; + + void move_to(Ell>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index ce327e7e8a0..58ac3afe307 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -98,6 +98,7 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) template class Fbcsr : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -145,13 +146,19 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::convert_to; using ConvertibleTo>::move_to; - friend class Fbcsr, IndexType>; + friend class Fbcsr, IndexType>; + friend class Fbcsr>, IndexType>; void convert_to( Fbcsr, IndexType>* result) const override; void move_to(Fbcsr, IndexType>* result) override; + void 
convert_to( + Fbcsr>, IndexType>* result) const override; + + void move_to(Fbcsr>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 5e995cb0ba0..9e030b5fc44 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -42,6 +42,7 @@ template class Hybrid : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -355,13 +356,20 @@ class Hybrid imbalance_bounded_limit strategy_; }; - friend class Hybrid, IndexType>; + friend class Hybrid, IndexType>; + + friend class Hybrid>, IndexType>; void convert_to( Hybrid, IndexType>* result) const override; void move_to(Hybrid, IndexType>* result) override; + void convert_to( + Hybrid>, IndexType>* result) const override; + + void move_to(Hybrid>, IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index e6520324030..ae7db46a081 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -42,6 +42,7 @@ class Csr; template class Sellp : public EnableLinOp>, public ConvertibleTo, IndexType>>, + public ConvertibleTo>, IndexType>>, public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -72,13 +73,19 @@ class Sellp : public EnableLinOp>, using device_mat_data = device_matrix_data; using absolute_type = remove_complex; - friend class Sellp, IndexType>; + friend class Sellp, IndexType>; + friend class Sellp>, IndexType>; void convert_to( Sellp, IndexType>* result) const override; void move_to(Sellp, IndexType>* result) override; + void convert_to( + Sellp>, IndexType>* result) const override; + + void move_to(Sellp>, 
IndexType>* result) override; + void convert_to(Dense* other) const override; void move_to(Dense* other) override; diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 41bec80673f..e8379a77535 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -81,6 +81,7 @@ target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") # and the compiler is unhappy with the quotation marks. separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") +target_compile_definitions(ginkgo_omp PRIVATE GINKGO_COMPILE_KERNEL=1) # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda) diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 44cef8a6e1a..d45eb1a68cd 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -32,10 +32,8 @@ void atomic_add(ValueType& out, ValueType val) // The C++ standard explicitly allows casting complex* to double* // [complex.numbers.general] auto values = reinterpret_cast*>(&out); -#pragma omp atomic - values[0] += real(val); -#pragma omp atomic - values[1] += imag(val); + atomic_add(values[0], real(val)); + atomic_add(values[1], imag(val)); } @@ -60,13 +58,12 @@ void atomic_add(half& out, half val) assumed = old; auto answer = reinterpret(reinterpret(assumed) + val); #pragma omp atomic capture -{ - old = *address_as_converter; - *address_as_converter = (old == assumed) ? answer : old; -} + { + old = *address_as_converter; + *address_as_converter = (old == assumed) ? 
answer : old; + } } while (assumed != old); - -} // namespace omp +} } // namespace omp diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index 0301b9093ff..ca1f21c36b1 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -119,7 +119,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -190,7 +190,7 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -295,7 +295,7 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index a53294b9fbe..c60e848d501 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -330,7 +330,7 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index a93002e4833..388e10342ad 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -135,15 +135,16 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + // auto dist = + // std::normal_distribution>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; 
row++) { if (!deterministic) { - for (size_type col = 0; col < num_cols; col++) { - subspace_vectors->at(row, col) = - get_rand_value(dist, gen); - } + // for (size_type col = 0; col < num_cols; col++) { + // subspace_vectors->at(row, col) = + // get_rand_value(dist, gen); + // } } for (size_type i = 0; i < row; i++) { diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 85b8f33e38b..7a02998e927 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -80,6 +80,7 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") set_source_files_properties(preconditioner/jacobi_kernels.cpp PROPERTIES COMPILE_FLAGS "-O1") endif() +target_compile_definitions(ginkgo_reference PRIVATE GINKGO_COMPILE_KERNEL=1) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_reference "") endif() diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index 00af068803c..e8617592265 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -116,7 +116,7 @@ void fft(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -183,7 +183,7 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); template @@ -283,7 +283,7 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 5d41a0d0e00..e0c5ea22b1c 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -294,7 +294,7 @@ void initialize(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index 606def8a18b..df8f67075fb 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -122,15 +122,15 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - auto dist = std::normal_distribution>(0.0, 1.0); + // auto dist = std::normal_distribution>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - for (size_type col = 0; col < num_cols; col++) { - subspace_vectors->at(row, col) = - get_rand_value(dist, gen); - } + // for (size_type col = 0; col < num_cols; col++) { + // subspace_vectors->at(row, col) = + // // get_rand_value(dist, gen); + // } } for (size_type i = 0; i < row; i++) { From 957d29c7a62b58e50825d793d69e1263fd71f125 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 11 Jan 2023 16:35:58 -0600 Subject: [PATCH 04/62] compile for cuda/sycl/test/mpi (hip needs trick) --- accessor/hip_helper.hpp | 9 +- common/cuda_hip/components/atomic.hpp | 50 ++- common/cuda_hip/components/warp_blas.hpp | 2 +- .../factorization/par_ilut_select_kernels.hpp | 2 +- .../jacobi_generate_kernels.instantiate.cpp | 12 + common/unified/multigrid/pgm_kernels.cpp | 10 +- core/base/extended_float.hpp | 318 +++++++----------- core/distributed/matrix.cpp | 39 +++ core/distributed/vector.cpp | 19 ++ core/test/utils.hpp | 9 +- dpcpp/CMakeLists.txt | 1 + dpcpp/components/atomic.dp.hpp | 30 +- dpcpp/components/cooperative_groups.dp.hpp | 6 + .../par_ilut_select_kernels.hpp.inc | 2 +- dpcpp/matrix/csr_kernels.dp.cpp | 8 + dpcpp/matrix/dense_kernels.dp.cpp | 67 ++-- dpcpp/solver/idr_kernels.dp.cpp | 8 +- hip/CMakeLists.txt | 1 + hip/base/types.hip.hpp | 72 +++- hip/components/cooperative_groups.hip.hpp | 10 +- hip/matrix/fft_kernels.hip.cpp | 8 +- include/ginkgo/core/base/math.hpp | 59 ++-- include/ginkgo/core/base/mpi.hpp | 5 + .../ginkgo/core/base/precision_dispatch.hpp | 16 +- include/ginkgo/core/distributed/matrix.hpp | 14 +- include/ginkgo/core/distributed/vector.hpp | 19 +- test/matrix/matrix.cpp | 10 +- test/mpi/matrix.cpp | 12 +- test/mpi/solver/solver.cpp | 10 +- test/solver/solver.cpp | 10 +- 30 files changed, 522 insertions(+), 316 deletions(-) diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index 6b76b726c10..cd2f4f67a13 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -17,6 +17,9 @@ #include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { @@ -53,11 +56,15 @@ struct hip_type { using type = typename hip_type::type&&; }; +template <> +struct hip_type { + using type = __half; +}; // Transform std::complex to thrust::complex template struct hip_type> { - using type = thrust::complex; + using type = thrust::complex::type>; }; diff --git 
a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index 2fbb1664165..4798ef48129 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -95,15 +95,63 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) } \ }; + +#define GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(CONVERTER_TYPE) \ + template \ + struct atomic_helper< \ + ValueType, \ + std::enable_if_t<(sizeof(ValueType) == sizeof(CONVERTER_TYPE))>> { \ + __forceinline__ __device__ static ValueType atomic_add( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + old = *c_addr; \ + *c_addr = reinterpret( \ + val + reinterpret(assumed)); \ + }); \ + } \ + __forceinline__ __device__ static ValueType atomic_max( \ + ValueType* __restrict__ addr, ValueType val) \ + { \ + using c_type = CONVERTER_TYPE; \ + return atomic_wrapper( \ + addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ + if (reinterpret(assumed) < val) { \ + old = *c_addr; \ + *c_addr = reinterpret(assumed); \ + } \ + }); \ + } \ + \ + private: \ + template \ + __forceinline__ __device__ static ValueType atomic_wrapper( \ + ValueType* __restrict__ addr, Callable set_old) \ + { \ + CONVERTER_TYPE* address_as_converter = \ + reinterpret_cast(addr); \ + CONVERTER_TYPE old = *address_as_converter; \ + CONVERTER_TYPE assumed = old; \ + set_old(old, assumed, address_as_converter); \ + return reinterpret(old); \ + } \ + }; + // Support 64-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD and ATOMIC_MAX GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); -#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) +#if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) && \ + !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) // CUDA 10.1 starts supporting 16-bit unsigned short int 
atomicCAS +// required the CC>70 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); +#else +GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) #endif // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE diff --git a/common/cuda_hip/components/warp_blas.hpp b/common/cuda_hip/components/warp_blas.hpp index 116b963ad11..0df0612152c 100644 --- a/common/cuda_hip/components/warp_blas.hpp +++ b/common/cuda_hip/components/warp_blas.hpp @@ -425,7 +425,7 @@ __device__ __forceinline__ remove_complex compute_infinity_norm( } } return reduce(group, sum, - [](result_type x, result_type y) { return max(x, y); }); + [](result_type x, result_type y) { return gko::max(x, y); }); } diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp index 6f5940c2b14..86f58717963 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp @@ -259,7 +259,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < basecase_local_size; ++i) { auto idx = threadIdx.x + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? 
input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local); if (threadIdx.x == rank / basecase_local_size) { diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index ca0c480c08e..e58a1ed4cf6 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -23,6 +23,18 @@ namespace gko { +namespace detail { +#if !defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +#endif +} // namespace detail namespace kernels { namespace GKO_DEVICE_NAMESPACE { namespace jacobi { diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp index af6d8f198d8..a16ce8b8adc 100644 --- a/common/unified/multigrid/pgm_kernels.cpp +++ b/common/unified/multigrid/pgm_kernels.cpp @@ -183,7 +183,7 @@ void find_strongest_neighbor( continue; } auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + weight_vals[idx] / gko::max(abs(diag[row]), abs(diag[col])); if (agg[col] == -1 && device_std::tie(weight, col) > device_std::tie(max_weight_unagg, strongest_unagg)) { @@ -247,8 +247,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_const_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { @@ -286,8 +286,8 @@ void assign_to_exist_agg(std::shared_ptr exec, if (col == row) { continue; } - auto weight = - weight_vals[idx] / max(abs(diag[row]), abs(diag[col])); + auto weight = 
weight_vals[idx] / + gko::max(abs(diag[row]), abs(diag[col])); if (agg_val[col] != -1 && device_std::tie(weight, col) > device_std::tie(max_weight_agg, strongest_agg)) { diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index bf93a40a6b6..b08c443d3d0 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -293,25 +293,20 @@ class half { public: GKO_ATTRIBUTES half() noexcept = default; - GKO_ATTRIBUTES half& operator=(const half& val) = default; - GKO_ATTRIBUTES half(const half& val) = default; - // GKO_ATTRIBUTES half(half const&) = default; - // complex() = default; - - // complex(const complex& z) = default; - - explicit GKO_ATTRIBUTES half(float32 val) noexcept + template ::value>> + GKO_ATTRIBUTES half(const T val) { - this->float2half(val); + this->float2half(static_cast(val)); } - explicit GKO_ATTRIBUTES half(float64 val) noexcept - : half(static_cast(val)) - {} + GKO_ATTRIBUTES half(const half& val) = default; - explicit GKO_ATTRIBUTES half(int val) noexcept - : half(static_cast(val)) - {} + template + GKO_ATTRIBUTES half& operator=(const V val) + { + this->float2half(static_cast(val)); + return *this; + } GKO_ATTRIBUTES operator float() const noexcept { @@ -323,151 +318,65 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - // GKO_ATTRIBUTES operator __half() noexcept - // { - // return reinterpret_cast(*this); - // } - // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - - - GKO_ATTRIBUTES half& operator+=(const float& rhs) - { - auto val = *this + rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator/=(const float& rhs) - { - auto val = *this / rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator*=(const float& rhs) - { - auto val = *this * rhs; - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& 
operator-=(const float& rhs) - { - auto val = *this - rhs; - this->float2half(val); - return *this; - } - - // half& operator+=(const half& rhs) - // { - // auto val = *this + float(rhs); - // this->float2half(val); - // return *this; - // } - - // half& operator/=(const half& rhs) - // { - // auto val = *this / float(rhs); - // this->float2half(val); - // return *this; - // } - - // half& operator*=(const half& rhs) - // { - // auto val = *this * float(rhs); - // this->float2half(val); - // return *this; - // } - - // half& operator-=(const half& rhs) - // { - // auto val = *this - float(rhs); - // this->float2half(val); - // return *this; - // } - - GKO_ATTRIBUTES friend half operator+(half lhs, const half& rhs) - { - float flhs = lhs; - flhs += rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator-(half lhs, const half& rhs) - { - float flhs = lhs; - flhs -= rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator*(half lhs, const half& rhs) - { - float flhs = lhs; - flhs *= rhs; // reuse compound assignment - return half(flhs); - } - - GKO_ATTRIBUTES friend half operator/(half lhs, const half& rhs) - { - float flhs = lhs; - flhs /= rhs; // reuse compound assignment - return half(flhs); - } - - - // GKO_ATTRIBUTES friend half operator+(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs += rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator-(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs -= rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator*(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs *= rhs; // reuse compound assignment - // return half(flhs); - // } - - // GKO_ATTRIBUTES friend half operator/(half lhs, const float& rhs) - // { - // float flhs = lhs; - // flhs /= rhs; // reuse compound assignment - // 
return half(flhs); - // } - - GKO_ATTRIBUTES half& operator=(long long int val) - { - this->float2half(float(val)); - return *this; - } - - GKO_ATTRIBUTES half& operator=(int val) - { - this->float2half(float(val)); - return *this; - } - - GKO_ATTRIBUTES half& operator=(float val) - { - this->float2half(val); - return *this; - } - - GKO_ATTRIBUTES half& operator=(double val) - { - this->float2half(static_cast(val)); - return *this; - } - + // can not use half operator _op(const half) for half + half + // operation will cast it to float and then do float operation such that it + // becomes float in the end. +#define HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ + } + HALF_OPERATOR(+, +=) + HALF_OPERATOR(-, -=) + HALF_OPERATOR(*, *=) + HALF_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. 
+ // If it is integer, using half as type +#define HALF_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const half hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const T val, const half hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } + + HALF_FRIEND_OPERATOR(+, +=) + HALF_FRIEND_OPERATOR(-, -=) + HALF_FRIEND_OPERATOR(*, *=) + HALF_FRIEND_OPERATOR(/, /=) + + // the negative GKO_ATTRIBUTES half operator-() const { auto val = 0.0f - *this; @@ -478,6 +387,8 @@ class half { using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
GKO_ATTRIBUTES void float2half(float val) noexcept { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) @@ -488,7 +399,7 @@ class half { #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - static uint16 float2half(uint32 data_) noexcept + static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept { using conv = detail::precision_converter; if (f32_traits::is_inf(data_)) { @@ -510,7 +421,7 @@ class half { } } - static uint32 half2float(uint16 data_) noexcept + static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept { using conv = detail::precision_converter; if (f16_traits::is_inf(data_)) { @@ -639,7 +550,7 @@ class complex { {} template - explicit complex(const T& real) : complex(static_cast(real)) + complex(const T& real) : complex(static_cast(real)) {} template @@ -659,73 +570,69 @@ class complex { static_cast(imag_)); } - complex& operator=(const int& __re) + template + complex& operator=(const V& val) { - real_ = __re; + real_ = val; imag_ = value_type(); return *this; } - complex& operator=(const value_type& __re) + template + complex& operator=(const std::complex& val) { - real_ = __re; - imag_ = value_type(); + real_ = val.real(); + imag_ = val.imag(); return *this; } - complex& operator+=(const value_type& __re) + + complex& operator+=(const value_type& real) { - real_ += __re; + real_ += real; return *this; } - complex& operator-=(const value_type& __re) + complex& operator-=(const value_type& real) { - real_ -= __re; + real_ -= real; return *this; } - complex& operator*=(const value_type& __re) + complex& operator*=(const value_type& real) { - real_ *= __re; - imag_ *= __re; + real_ *= real; + imag_ *= real; return *this; } - complex& operator/=(const value_type& __re) + complex& operator/=(const value_type& real) { - real_ /= __re; - imag_ /= __re; + real_ /= real; + imag_ /= real; return *this; } - template - complex& operator=(const complex<_Xp>& __c) - { - real_ = __c.real(); - imag_ = __c.imag(); - return 
*this; - } - template - complex& operator+=(const complex<_Xp>& __c) + template + complex& operator+=(const complex& val) { - real_ += __c.real(); - imag_ += __c.imag(); + real_ += val.real(); + imag_ += val.imag(); return *this; } - template - complex& operator-=(const complex<_Xp>& __c) + template + complex& operator-=(const complex& val) { - real_ -= __c.real(); - imag_ -= __c.imag(); + real_ -= val.real(); + imag_ -= val.imag(); return *this; } - template - complex& operator*=(const complex<_Xp>& __c) + template + complex& operator*=(const complex& val) { - *this = *this * complex(__c.real(), __c.imag()); + *this = *this * complex(val.real(), val.imag()); return *this; } - template - complex& operator/=(const complex<_Xp>& __c) + template + complex& operator/=(const complex& val) { - *this = *this / complex(__c.real(), __c.imag()); + *this = *this / complex(val.real(), val.imag()); return *this; } @@ -766,10 +673,6 @@ class complex> { }; -template <> -struct is_scalar : std::true_type {}; - - template <> struct numeric_limits { static constexpr bool is_specialized{true}; @@ -805,6 +708,15 @@ struct numeric_limits { } }; +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + } // namespace std diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 63f359cc40a..c9337e6d80b 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -237,6 +237,45 @@ void Matrix::move_to( } +template +void Matrix::convert_to( + Matrix>, local_index_type, + global_index_type>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + 
result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix>, local_index_type, + global_index_type>* result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} + + template void Matrix::read_distributed( const device_matrix_data& data, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index ae7ab182a85..21b7c334af7 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -299,6 +299,25 @@ void Vector::move_to(Vector>* result) } +template +void Vector::convert_to( + Vector>>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to( + Vector>>* result) +{ + this->convert_to(result); +} + + template std::unique_ptr::absolute_type> Vector::compute_absolute() const diff --git a/core/test/utils.hpp b/core/test/utils.hpp index cacc7191bbf..d6d999b1d94 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -365,15 +365,14 @@ template struct reduction_factor { using nc_output = remove_complex; using nc_precision = remove_complex; - static constexpr nc_output value{ - std::numeric_limits::epsilon() * nc_output{10} * 
- (gko::is_complex() ? nc_output{1.4142} : one())}; + static nc_output value; }; template -constexpr remove_complex - reduction_factor::value; +remove_complex reduction_factor::value = + std::numeric_limits::epsilon() * nc_output{10} * + (gko::is_complex() ? nc_output{1.4142} : one()); } // namespace test diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 851ef9a3dc6..8d658bb6994 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -120,6 +120,7 @@ target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() +target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) diff --git a/dpcpp/components/atomic.dp.hpp b/dpcpp/components/atomic.dp.hpp index 8168421a488..e5fa6cae426 100644 --- a/dpcpp/components/atomic.dp.hpp +++ b/dpcpp/components/atomic.dp.hpp @@ -145,6 +145,21 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); +// sycl does not support 16bit +template +struct atomic_helper> { + __dpct_inline__ static ValueType atomic_add(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr += val; + return old; + } +}; + #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE @@ -212,7 +227,20 @@ struct atomic_helper< GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned long long int); // Support 32-bit ATOMIC_ADD GKO_BIND_ATOMIC_MAX_STRUCTURE(unsigned int); - +// not support 16bit +template +struct atomic_max_helper> { + __dpct_inline__ static ValueType atomic_max(ValueType* __restrict__ addr, + ValueType val) + { + // GKO_NOT_IMPLEMENTED; + // wrong implementation because sycl can not use exception in kernel + auto old = *addr; + *addr = std::max(*addr, val); + 
return old; + } +}; #undef GKO_BIND_ATOMIC_MAX_STRUCTURE diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 034bf4baf28..89f5839676e 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -13,6 +13,12 @@ #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" +// namespace sycl { +// namespace detail { +// template <> +// struct is_arithmetic : public std::false_type {}; +// } // namespace detail +// } // namespace sycl namespace gko { namespace kernels { diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 67cc9cdba15..1ebfe6ed320 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -337,7 +337,7 @@ void basecase_select(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < basecase_local_size; ++i) { auto idx = item_ct1.get_local_id(2) + i * basecase_block_size; - local[i] = idx < size ? input[idx] : sentinel; + local[i] = idx < size ? 
input[idx] : static_cast(sentinel); } bitonic_sort(local, sh_local, item_ct1); if (item_ct1.get_local_id(2) == rank / basecase_local_size) { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 7e5d0229c86..4a1382c3bb3 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1384,6 +1384,14 @@ void load_balance_spmv(std::shared_ptr exec, } } +template +struct onemkl_support : std::false_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support : std::true_type {}; template bool try_general_sparselib_spmv(std::shared_ptr exec, diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 04f3229eaed..01c0cc8b3ba 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -209,6 +209,20 @@ void compute_norm2_dispatch(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); +template +struct onemkl_support : std::false_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support : std::true_type {}; + +template <> +struct onemkl_support> : std::true_type {}; + +template <> +struct onemkl_support> : std::true_type {}; template void simple_apply(std::shared_ptr exec, @@ -217,17 +231,21 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - one(), a->get_const_values(), a->get_stride(), - b->get_const_values(), b->get_stride(), zero(), - c->get_values(), c->get_stride()); - } else { - dense::fill(exec, c, zero()); + if constexpr (onemkl_support::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { 
+ oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], one(), a->get_const_values(), + a->get_stride(), b->get_const_values(), b->get_stride(), + zero(), c->get_values(), c->get_stride()); + } else { + dense::fill(exec, c, zero()); + } } + } else { + GKO_NOT_IMPLEMENTED; } } @@ -241,19 +259,24 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if (b->get_stride() != 0 && c->get_stride() != 0) { - if (a->get_size()[1] > 0) { - oneapi::mkl::blas::row_major::gemm( - *exec->get_queue(), transpose::nontrans, transpose::nontrans, - c->get_size()[0], c->get_size()[1], a->get_size()[1], - exec->copy_val_to_host(alpha->get_const_values()), - a->get_const_values(), a->get_stride(), b->get_const_values(), - b->get_stride(), - exec->copy_val_to_host(beta->get_const_values()), - c->get_values(), c->get_stride()); - } else { - dense::scale(exec, beta, c); + if constexpr (onemkl_support::value) { + if (b->get_stride() != 0 && c->get_stride() != 0) { + if (a->get_size()[1] > 0) { + oneapi::mkl::blas::row_major::gemm( + *exec->get_queue(), transpose::nontrans, + transpose::nontrans, c->get_size()[0], c->get_size()[1], + a->get_size()[1], + exec->copy_val_to_host(alpha->get_const_values()), + a->get_const_values(), a->get_stride(), + b->get_const_values(), b->get_stride(), + exec->copy_val_to_host(beta->get_const_values()), + c->get_values(), c->get_stride()); + } else { + dense::scale(exec, beta, c); + } } + } else { + GKO_NOT_IMPLEMENTED; } } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index d59ada362f9..915a3a8478e 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -603,11 +603,11 @@ void initialize_subspace_vectors(std::shared_ptr exec, cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); 
oneapi::dpl::minstd_rand engine(seed, offset); - oneapi::dpl::normal_distribution> - distr(0, 1); - auto res = distr(engine); + // oneapi::dpl::normal_distribution> + // distr(0, 1); + // auto res = distr(engine); - work[idx] = res; + // work[idx] = res; }); }); } diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 7d914d57a81..2c882af99de 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -70,6 +70,7 @@ if (GINKGO_HAVE_ROCTX) endif() target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) +target_compile_definitions(ginkgo_hip PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_hip) diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index bb0d4a2d0c9..0bcfeff14a9 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -26,11 +26,55 @@ #include "common/cuda_hip/base/runtime.hpp" +namespace std { + +template <> +struct is_scalar<__half> : std::true_type {}; + +} // namespace std + + namespace gko { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return is_nan(float(val)); +} +template <> +GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) +{ + return __habs(val); +} +#endif namespace kernels { namespace hip { + +#if defined(__HIPCC__) +// #endif +// __device__ __half sqrt(__half val) { return hsqrt(val); } +// if directly using above, it will lead all double, float goes to half version +__device__ __half sqrt(__half val) { return hsqrt(val); } +__device__ float sqrt(float val) { return sqrtf(val); } +__device__ double sqrt(double val) { return sqrt(val); } +__device__ thrust::complex sqrt(thrust::complex val) +{ + return thrust::sqrt(val); +} +__device__ thrust::complex sqrt(thrust::complex val) +{ + return thrust::sqrt(val); +} +// template +// __device__ __forceinline__ +// std::enable_if_t::value, __half> +// sqrt(const T& val) +// { +// return hsqrt(val); +// } +#endif namespace detail { 
@@ -130,6 +174,17 @@ struct hiplibs_type_impl> { using type = hipDoubleComplex; }; +template <> +struct hiplibs_type_impl { + using type = __half; +}; + +template <> +struct hiplibs_type_impl> { + using type = __half2; +}; + + template struct hiplibs_type_impl> { using type = typename hiplibs_type_impl>::type; @@ -202,9 +257,14 @@ struct hip_type_impl { using type = volatile typename hip_type_impl::type; }; +template <> +struct hip_type_impl { + using type = __half; +}; + template struct hip_type_impl> { - using type = thrust::complex; + using type = thrust::complex::type>; }; template <> @@ -217,6 +277,11 @@ struct hip_type_impl { using type = thrust::complex; }; +template <> +struct hip_type_impl<__half2> { + using type = thrust::complex<__half>; +}; + template struct hip_struct_member_type_impl { using type = T; @@ -227,6 +292,11 @@ struct hip_struct_member_type_impl> { using type = fake_complex; }; +template <> +struct hip_struct_member_type_impl { + using type = __half; +}; + template struct hip_type_impl> { using type = diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp index 36618bb7f3e..87fd6cedacf 100644 --- a/hip/components/cooperative_groups.hip.hpp +++ b/hip/components/cooperative_groups.hip.hpp @@ -306,7 +306,7 @@ class enable_extended_shuffle : public Group { SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -326,12 +326,12 @@ class enable_extended_shuffle : public Group { shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + constexpr auto value_size = sizeof(ValueType) / 
sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/hip/matrix/fft_kernels.hip.cpp b/hip/matrix/fft_kernels.hip.cpp index 6b14aaf067d..d34ba82bbbd 100644 --- a/hip/matrix/fft_kernels.hip.cpp +++ b/hip/matrix/fft_kernels.hip.cpp @@ -163,7 +163,7 @@ void fft(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT_KERNEL); template @@ -179,7 +179,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -195,7 +196,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 644ea72aaad..66cd2eea3e3 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -18,6 +18,34 @@ #include +// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0 +// when using dpcpp compiler without dpcpp module +#if GINKGO_DPCPP_MAJOR_VERSION +#include +#endif + + +namespace std { + + +inline gko::half abs(gko::half a) { return gko::half((a > 0) ? 
a : -a); } + +inline gko::half abs(std::complex a) +{ + return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); +} + +inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } + +inline std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex(a))); +} + + +} // namespace std + + namespace gko { @@ -33,18 +61,6 @@ using std::abs; using std::sqrt; -inline half abs(half a) { return half((a > 0) ? a : -a); } -inline half abs(std::complex a) -{ - return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); -} -inline half sqrt(half a) { return half(sqrt(float(a))); } - -inline std::complex sqrt(std::complex a) -{ - return std::complex(sqrt(std::complex(a))); -} - } // namespace reference } // namespace kernels @@ -59,19 +75,6 @@ using std::abs; using std::sqrt; -inline half abs(half a) { return half((a > 0) ? a : -a); } -inline half abs(std::complex a) -{ - return half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); -} -inline half sqrt(half a) { return half(sqrt(float(a))); } - -inline std::complex sqrt(std::complex a) -{ - return std::complex(sqrt(std::complex(a))); -} - - } // namespace omp } // namespace kernels @@ -176,8 +179,12 @@ struct is_complex_impl> template struct is_complex_or_scalar_impl : std::is_scalar {}; +template <> +struct is_complex_or_scalar_impl : std::true_type {}; + template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; /** diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 64c04e1805a..32d2e5d899a 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -79,6 +79,9 @@ struct type_impl {}; GKO_REGISTER_MPI_TYPE(char, MPI_CHAR); GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR); GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED); +// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 +// TODO: it only works 
on the transferring +GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); @@ -88,6 +91,8 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); +// TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 7697d014c61..e028336f202 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -361,7 +361,13 @@ make_temporary_conversion(LinOp* matrix) experimental::distributed::Vector>>( matrix); if (!result) { - GKO_NOT_SUPPORTED(matrix); + result = detail::temporary_conversion< + experimental::distributed::Vector>:: + template create>>>(matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } @@ -380,7 +386,13 @@ make_temporary_conversion(const LinOp* matrix) experimental::distributed::Vector>>( matrix); if (!result) { - GKO_NOT_SUPPORTED(matrix); + result = detail::temporary_conversion< + const experimental::distributed::Vector>:: + template create>>>(matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } } return result; } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index de719bb9315..a64f8395297 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -250,12 +250,17 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, + public ConvertibleTo>, + LocalIndexType, GlobalIndexType>>, 
public DistributedBase { friend class EnableDistributedPolymorphicObject; - friend class Matrix, LocalIndexType, + friend class Matrix, LocalIndexType, GlobalIndexType>; + friend class Matrix>, + LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; + public: using value_type = ValueType; using index_type = GlobalIndexType; @@ -278,6 +283,13 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; + void convert_to( + Matrix>, local_index_type, + global_index_type>* result) const override; + + void move_to(Matrix>, + local_index_type, global_index_type>* result) override; + /** * Reads a square matrix from the device_matrix_data structure and a global * partition. diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index e068f29ea26..a476f2f2661 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -59,12 +59,14 @@ template class Vector : public EnableDistributedLinOp>, public ConvertibleTo>>, + public ConvertibleTo>>>, public EnableAbsoluteComputation>>, public DistributedBase { friend class EnableDistributedPolymorphicObject; friend class Vector>; friend class Vector>; - friend class Vector>; + friend class Vector>; + friend class Vector>>; public: using EnableDistributedLinOp::convert_to; @@ -163,6 +165,12 @@ class Vector void move_to(Vector>* result) override; + void convert_to(Vector>>* result) + const override; + + void move_to( + Vector>>* result) override; + std::unique_ptr compute_absolute() const override; void compute_absolute_inplace() override; @@ -664,12 +672,21 @@ struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = experimental::distributed::Vector>; + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; static std::unique_ptr create_empty(const source_type* source) { return target_type::create(source->get_executor(), 
source->get_communicator()); } + + static std::unique_ptr create_empty( + const snd_source_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } }; diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index eea1a67ef5f..0b06f76df85 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -586,10 +586,7 @@ class Matrix : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -609,10 +606,7 @@ class Matrix : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index f4b8af2fb19..e1bdf026f58 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -320,12 +320,10 @@ class Matrix : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); beta = gko::test::generate_random_matrix( 1, 1, std::uniform_int_distribution(1, 1), - std::normal_distribution>(), - this->engine, this->exec); + std::normal_distribution<>(), this->engine, this->exec); } void SetUp() override { ASSERT_EQ(comm.size(), 3); } @@ -365,14 +363,12 @@ class Matrix : public CommonMpiTestFixture { num_rows, num_cols, std::uniform_int_distribution(static_cast(num_cols), static_cast(num_cols)), - std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto mat_md = gko::test::generate_random_matrix_data( num_rows, num_rows, std::uniform_int_distribution(0, static_cast(num_rows)), - 
std::normal_distribution>(), - engine); + std::normal_distribution<>(), engine); auto row_mapping = gko::test::generate_random_array< gko::experimental::distributed::comm_index_type>( diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index be9f6865c86..f3d36f0dcfe 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -268,10 +268,7 @@ class Solver : public CommonMpiTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -298,10 +295,7 @@ class Solver : public CommonMpiTestFixture { { return gko::share(gko::initialize( {gko::test::detail::get_rand_value( - std::normal_distribution< - gko::remove_complex>(0.0, - 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, exec)); } diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index 47414f83041..bc2ce343756 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -549,10 +549,7 @@ class Solver : public CommonTestFixture { template gko::matrix_data gen_dense_data(gko::dim<2> size) { - return { - size, - std::normal_distribution>(0.0, 1.0), - rand_engine}; + return {size, std::normal_distribution<>(0.0, 1.0), rand_engine}; } template @@ -573,10 +570,7 @@ class Solver : public CommonTestFixture { return {gko::initialize( {gko::test::detail::get_rand_value< typename VecType::value_type>( - std::normal_distribution< - gko::remove_complex>( - 0.0, 1.0), - rand_engine)}, + std::normal_distribution<>(0.0, 1.0), rand_engine)}, ref), exec}; } From a0c389c686d7fdfb120f1dd2515c8e8679cf2a1f Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 11 Jan 2023 19:17:28 -0600 Subject: [PATCH 05/62] hip finally --- common/cuda_hip/base/math.hpp | 3 +- cuda/base/types.hpp | 15 +++----- hip/base/types.hip.hpp | 63 +++++++++++++++++++------------ include/ginkgo/core/base/math.hpp | 6 +++ 4 files changed, 51 insertions(+), 36 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 8c655174524..01d0910cc97 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -39,7 +39,8 @@ struct is_complex_impl> template -struct is_complex_or_scalar_impl> : std::is_scalar {}; +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; template diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 99cb1835b7c..a8208dd58d2 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -18,31 +18,26 @@ #include -namespace std { - -template <> -struct is_scalar<__half> : std::true_type {}; - -} // namespace std - namespace gko { -#if defined(__CUDA_ARCH__) + template <> __device__ __forceinline__ bool is_nan(const __half& val) { return is_nan(float(val)); } -#endif + namespace kernels { namespace cuda { + +// __habs only defined when CUDA_ARCH #if defined(__CUDA_ARCH__) -// template <> __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } #endif + namespace detail { /** diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 0bcfeff14a9..cc9ebb2e8ea 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -26,14 +26,6 @@ #include "common/cuda_hip/base/runtime.hpp" -namespace std { - -template <> -struct is_scalar<__half> : std::true_type {}; - -} // namespace std - - namespace gko { #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // template <> @@ -47,34 +39,55 @@ GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) { return __habs(val); } -#endif -namespace kernels { -namespace hip { 
+#endif #if defined(__HIPCC__) -// #endif -// __device__ __half sqrt(__half val) { return hsqrt(val); } -// if directly using above, it will lead all double, float goes to half version -__device__ __half sqrt(__half val) { return hsqrt(val); } -__device__ float sqrt(float val) { return sqrtf(val); } -__device__ double sqrt(double val) { return sqrt(val); } -__device__ thrust::complex sqrt(thrust::complex val) +GKO_INLINE +GKO_ATTRIBUTES __half sqrt(__half val) { return hsqrt(val); } +GKO_INLINE +GKO_ATTRIBUTES float sqrt(float val) { return sqrtf(val); } +GKO_INLINE +GKO_ATTRIBUTES double sqrt(double val) { return sqrt(val); } +GKO_INLINE +GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) { return thrust::sqrt(val); } -__device__ thrust::complex sqrt(thrust::complex val) +GKO_INLINE +GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) { return thrust::sqrt(val); } -// template -// __device__ __forceinline__ -// std::enable_if_t::value, __half> -// sqrt(const T& val) +#endif + +// #if defined(__HIPCC__) +// // #endif +// // __device__ __half sqrt(__half val) { return hsqrt(val); } +// // if directly using above, it will lead all double, float goes to half +// version +// __device__ __half sqrt(__half val) { return hsqrt(val); } +// __device__ float sqrt(float val) { return sqrtf(val); } +// __device__ double sqrt(double val) { return sqrt(val); } +// __device__ thrust::complex sqrt(thrust::complex val) // { -// return hsqrt(val); +// return thrust::sqrt(val); // } -#endif +// __device__ thrust::complex sqrt(thrust::complex val) +// { +// return thrust::sqrt(val); +// } +// // template +// // __device__ __forceinline__ +// // std::enable_if_t::value, __half> +// // sqrt(const T& val) +// // { +// // return hsqrt(val); +// // } +// #endif + +namespace kernels { +namespace hip { namespace detail { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 66cd2eea3e3..bde1fe54d94 100644 --- a/include/ginkgo/core/base/math.hpp 
+++ b/include/ginkgo/core/base/math.hpp @@ -25,6 +25,9 @@ #endif +class __half; + + namespace std { @@ -182,6 +185,9 @@ struct is_complex_or_scalar_impl : std::is_scalar {}; template <> struct is_complex_or_scalar_impl : std::true_type {}; +template <> +struct is_complex_or_scalar_impl<__half> : std::true_type {}; + template struct is_complex_or_scalar_impl> : is_complex_or_scalar_impl {}; From fe5e49113bbdf9aafeadf766cbf56f135d6de841 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 11 Jan 2023 20:56:21 -0600 Subject: [PATCH 06/62] fix the narrow issue and atomic support --- common/cuda_hip/components/atomic.hpp | 6 ++-- core/log/papi.cpp | 5 ++- cuda/base/types.hpp | 42 ++++++++++++++++++++-- reference/test/matrix/coo_kernels.cpp | 8 +++-- reference/test/matrix/csr_kernels.cpp | 8 +++-- reference/test/matrix/dense_kernels.cpp | 16 +++++---- reference/test/matrix/diagonal_kernels.cpp | 8 +++-- reference/test/matrix/ell_kernels.cpp | 8 +++-- reference/test/matrix/fbcsr_kernels.cpp | 8 +++-- reference/test/matrix/hybrid_kernels.cpp | 8 +++-- reference/test/matrix/sellp_kernels.cpp | 8 +++-- test/mpi/matrix.cpp | 4 +-- 12 files changed, 99 insertions(+), 30 deletions(-) diff --git a/common/cuda_hip/components/atomic.hpp b/common/cuda_hip/components/atomic.hpp index 4798ef48129..55b4acd2dcc 100644 --- a/common/cuda_hip/components/atomic.hpp +++ b/common/cuda_hip/components/atomic.hpp @@ -104,6 +104,7 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) __forceinline__ __device__ static ValueType atomic_add( \ ValueType* __restrict__ addr, ValueType val) \ { \ + assert(false); \ using c_type = CONVERTER_TYPE; \ return atomic_wrapper( \ addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ @@ -115,6 +116,7 @@ __forceinline__ __device__ ResultType reinterpret(ValueType val) __forceinline__ __device__ static ValueType atomic_max( \ ValueType* __restrict__ addr, ValueType val) \ { \ + assert(false); \ using c_type = CONVERTER_TYPE; \ 
return atomic_wrapper( \ addr, [&val](c_type& old, c_type assumed, c_type* c_addr) { \ @@ -146,9 +148,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); #if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) && \ - !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) + (__CUDA_ARCH__ >= 700) && !(defined(__HIPCC__) && GINKGO_HIP_PLATFORM_HCC) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS -// required the CC>70 +// required the CC>=70 GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); #else GKO_BIND_ATOMIC_HELPER_FAKE_STRUCTURE(unsigned short int) diff --git a/core/log/papi.cpp b/core/log/papi.cpp index 5ced377ca38..e5aa588aee1 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -213,15 +213,14 @@ void Papi::on_criterion_check_completed( double residual_norm_d = 0.0; if (residual_norm != nullptr) { auto dense_r_norm = as(residual_norm); - residual_norm_d = - static_cast(std::real(dense_r_norm->at(0, 0))); + residual_norm_d = static_cast(real(dense_r_norm->at(0, 0))); } else if (residual != nullptr) { detail::vector_dispatch(residual, [&](const auto* dense_r) { auto tmp_res_norm = Vector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); dense_r->compute_norm2(tmp_res_norm); residual_norm_d = - static_cast(std::real(tmp_res_norm->at(0, 0))); + static_cast(real(tmp_res_norm->at(0, 0))); }); } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index a8208dd58d2..58b7f862d42 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -21,23 +21,59 @@ namespace gko { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + + template <> __device__ __forceinline__ bool is_nan(const __half& val) { - return is_nan(float(val)); + return __hisnan(val); } +#else + + +template <> +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return isnan(static_cast(val)); +} + + +#endif + + namespace kernels { namespace cuda { -// __habs only defined when CUDA_ARCH -#if defined(__CUDA_ARCH__) +#if defined(__CUDA_ARCH__) && 
__CUDA_ARCH__ >= 700 + + __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } + __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } + + +#else + + +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} + + +__device__ __forceinline__ __half sqrt(const __half& val) +{ + return sqrt(static_cast(val)); +} + + #endif + namespace detail { /** diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 42b68d1cb4c..4ff7141cd8e 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -87,7 +87,9 @@ TYPED_TEST(Coo, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -108,7 +110,9 @@ TYPED_TEST(Coo, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 2d4c61786ad..07fb526e8cd 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -796,7 +796,9 @@ TYPED_TEST(Csr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->convert_to(tmp); @@ -821,7 +823,9 @@ TYPED_TEST(Csr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; // use mtx2 as mtx's strategy would involve creating a CudaExecutor this->mtx2->move_to(tmp); diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index 41294c89d49..c7e26589dd7 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -751,9 +751,11 @@ TYPED_TEST(Dense, ConvertsToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -771,9 +773,11 @@ TYPED_TEST(Dense, MovesToPrecision) auto tmp = OtherDense::create(this->exec); auto res = Dense::create(this->exec); // If OtherT is more precise: 0, otherwise r - auto residual = r::value < r::value - ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + auto residual = + r::value < r::value + ? gko::remove_complex{0} + : gko::remove_complex{ + static_cast>(r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index 208c9d98639..437ba3a1746 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ b/reference/test/matrix/diagonal_kernels.cpp @@ -93,7 +93,9 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->convert_to(tmp); tmp->convert_to(res); @@ -113,7 +115,9 @@ TYPED_TEST(Diagonal, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->diag1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index c96dcae773a..6d60961663a 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ b/reference/test/matrix/ell_kernels.cpp @@ -451,7 +451,9 @@ TYPED_TEST(Ell, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -472,7 +474,9 @@ TYPED_TEST(Ell, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index cd82bade8b7..dd220a40172 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -279,7 +279,9 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->convert_to(tmp); tmp->convert_to(res); @@ -300,7 +302,9 @@ TYPED_TEST(Fbcsr, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 014b5bb1024..817d188147e 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -241,7 +241,9 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -262,7 +264,9 @@ TYPED_TEST(Hybrid, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index 18cf793c7f3..b5e6a9ce69f 100644 --- a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -197,7 +197,9 @@ TYPED_TEST(Sellp, ConvertsToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->convert_to(tmp); tmp->convert_to(res); @@ -218,7 +220,9 @@ TYPED_TEST(Sellp, MovesToPrecision) // If OtherType is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{ + static_cast>( + r::value)}; this->mtx1->move_to(tmp); tmp->move_to(res); diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index e1bdf026f58..6d6812dea12 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -690,7 +690,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{static_cast>(r::value)}; this->dist_mat->convert_to(tmp); tmp->convert_to(res); @@ -717,7 +717,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : gko::remove_complex{static_cast>(r::value)}; this->dist_mat->move_to(tmp); tmp->convert_to(res); From 6c3c12b4344323b69212810855b3093ae09a5b27 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 11 Jan 2023 23:29:26 -0600 Subject: [PATCH 07/62] fixed more error --- cuda/components/cooperative_groups.cuh | 12 ++-- cuda/solver/common_trs_kernels.cuh | 9 +-- hip/base/types.hip.hpp | 77 +++++++++++++------------- 3 files changed, 48 insertions(+), 50 deletions(-) diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index 983ec32f9ac..73cb2152458 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -302,7 +302,7 @@ public: SelectorType selector) const \ { \ return shuffle_impl( \ - [this](uint32 v, SelectorType s) { \ + [this](uint16 v, SelectorType s) { \ return static_cast(this)->_name(v, s); \ }, \ var, selector); \ @@ -322,12 +322,12 @@ private: shuffle_impl(ShuffleOperator intrinsic_shuffle, const ValueType var, SelectorType selector) { - static_assert(sizeof(ValueType) % sizeof(uint32) == 0, - "Unable to shuffle sizes which are not 4-byte multiples"); - constexpr auto value_size = sizeof(ValueType) / sizeof(uint32); + static_assert(sizeof(ValueType) % sizeof(uint16) == 0, + "Unable to shuffle sizes which are not 2-byte multiples"); + constexpr auto value_size = sizeof(ValueType) / sizeof(uint16); ValueType result; - auto var_array = reinterpret_cast(&var); - auto result_array = reinterpret_cast(&result); + auto var_array = reinterpret_cast(&var); + auto result_array = reinterpret_cast(&result); #pragma unroll for (std::size_t i = 0; i < value_size; ++i) { result_array[i] = intrinsic_shuffle(var_array[i], selector); diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 686d8026a64..90cdb362855 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -212,13 +212,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; + // In nullptr is considered nullptr_t not casted to const ValueType* sparselib::buffer_size_ext( handle, algorithm, 
SPARSELIB_OPERATION_NON_TRANSPOSE, SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - &work_size); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -228,8 +229,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - work.get_data()); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, work.get_data()); } void solve(const matrix::Csr* matrix, diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index cc9ebb2e8ea..fc582488cbb 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -27,64 +27,61 @@ namespace gko { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -// template <> +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 700 __device__ __forceinline__ bool is_nan(const __half& val) { - return is_nan(float(val)); + return __hisnan(val); } -template <> -GKO_INLINE GKO_ATTRIBUTES constexpr __half abs(const __half& val) +__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } +#else +__device__ __forceinline__ bool is_nan(const __half& val) { - return __habs(val); + return is_nan(static_cast(val)); } +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif + +#elif defined(__HIP_DEVICE_COMPILE__) +__device__ __forceinline__ bool is_nan(const __half& val) +{ + return __hisnan(val); +} + +// rocm40 __habs is not constexpr +__device__ __forceinline__ __half abs(const 
__half& val) { return __habs(val); } + #endif #if defined(__HIPCC__) -GKO_INLINE -GKO_ATTRIBUTES __half sqrt(__half val) { return hsqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES float sqrt(float val) { return sqrtf(val); } -GKO_INLINE -GKO_ATTRIBUTES double sqrt(double val) { return sqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) +__device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } +__device__ __forceinline__ double sqrt(double val) { return sqrt(val); } +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) { return thrust::sqrt(val); } -GKO_INLINE -GKO_ATTRIBUTES thrust::complex sqrt(thrust::complex val) +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) { return thrust::sqrt(val); } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half sqrt(__half val) +{ + return sqrt(static_cast(val)); +} +#else +__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +#endif #endif -// #if defined(__HIPCC__) -// // #endif -// // __device__ __half sqrt(__half val) { return hsqrt(val); } -// // if directly using above, it will lead all double, float goes to half -// version -// __device__ __half sqrt(__half val) { return hsqrt(val); } -// __device__ float sqrt(float val) { return sqrtf(val); } -// __device__ double sqrt(double val) { return sqrt(val); } -// __device__ thrust::complex sqrt(thrust::complex val) -// { -// return thrust::sqrt(val); -// } -// __device__ thrust::complex sqrt(thrust::complex val) -// { -// return thrust::sqrt(val); -// } -// // template -// // __device__ __forceinline__ -// // std::enable_if_t::value, __half> -// // sqrt(const T& val) -// // { -// // return hsqrt(val); -// // } -// #endif namespace kernels { namespace hip { From a0ee872c6a1d6021877574add85f16ff00450db0 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 12 Jan 2023 08:34:02 -0600 Subject: [PATCH 08/62] fix the op order and gdb Co-authored-by: Marcel Koch --- core/base/extended_float.hpp | 4 ++-- dev_tools/scripts/gdb-ginkgo.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index b08c443d3d0..f12f53ed296 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -366,8 +366,8 @@ class half { using type = \ typename std::conditional::value, T, \ half>::type; \ - auto result = static_cast(hf); \ - result _opeq static_cast(val); \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ return result; \ } diff --git a/dev_tools/scripts/gdb-ginkgo.py b/dev_tools/scripts/gdb-ginkgo.py index d3de8f09a25..122d177031f 100644 --- a/dev_tools/scripts/gdb-ginkgo.py +++ b/dev_tools/scripts/gdb-ginkgo.py @@ -51,6 +51,7 @@ def next(self): _versioned_namespace = '__8::' + # new version adapted from https://gcc.gnu.org/pipermail/gcc-cvs/2021-November/356230.html # necessary due to empty class optimization def is_specialization_of(x, template_name): @@ -64,6 +65,7 @@ def is_specialization_of(x, template_name): expr = '^std::{}<.*>$'.format(template_name) return re.match(expr, x) is not None + def get_template_arg_list(type_obj): "Return a type's template arguments as a list" n = 0 @@ -75,6 +77,7 @@ def get_template_arg_list(type_obj): return template_args n += 1 + def _tuple_impl_get(val): "Return the tuple element stored in a _Tuple_impl base class." 
bases = val.type.fields() @@ -95,6 +98,7 @@ def _tuple_impl_get(val): else: raise ValueError("Unsupported implementation for std::tuple: %s" % str(val.type)) + def tuple_get(n, val): "Return the result of std::get(val) on a std::tuple" tuple_size = len(get_template_arg_list(val.type)) @@ -108,6 +112,7 @@ def tuple_get(n, val): n -= 1 return _tuple_impl_get(node) + def get_unique_ptr_data_ptr(val): "Return the result of val.get() on a std::unique_ptr" # std::unique_ptr contains a std::tuple, @@ -220,12 +225,28 @@ def display_hint(self): return 'array' +class GkoHalfPrinter: + "Print a gko::half" + + def __init__(self, val): + # GDB doesn't seem to consider the user-defined conversion in its Value.cast, + # so we need to call the conversion operator explicitly + address = hex(val.address) + self.float_val = gdb.parse_and_eval(f"reinterpret_cast({address})->operator float()") + + def to_string(self): + self.float_val.fetch_lazy() + return self.float_val + + def lookup_type(val): if not str(val.type.unqualified()).startswith('gko::'): return None suffix = str(val.type.unqualified())[5:] if suffix.startswith('array<') and val.type.code == gdb.TYPE_CODE_STRUCT: return GkoArrayPrinter(val) + if suffix.startswith("half") and val.type.code == gdb.TYPE_CODE_STRUCT: + return GkoHalfPrinter(val) return None From 5acbf27f35384d10d2f1e45fc19185c69e06db1d Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 12 Jan 2023 09:30:49 -0600 Subject: [PATCH 09/62] add the rand template not_implemented --- common/cuda_hip/solver/idr_kernels.cpp | 8 ++++---- cuda/base/curand_bindings.hpp | 13 +++++++++++++ dpcpp/solver/idr_kernels.dp.cpp | 10 ++++++---- hip/base/hiprand_bindings.hip.hpp | 13 +++++++++++++ 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/common/cuda_hip/solver/idr_kernels.cpp b/common/cuda_hip/solver/idr_kernels.cpp index 3aef25b0a48..a0f605134eb 100644 --- a/common/cuda_hip/solver/idr_kernels.cpp +++ b/common/cuda_hip/solver/idr_kernels.cpp @@ -383,10 +383,10 @@ void initialize_subspace_vectors(std::shared_ptr exec, auto gen = randlib::rand_generator(std::random_device{}(), RANDLIB_RNG_PSEUDO_DEFAULT, exec->get_stream()); - // randlib::rand_vector( - // gen, - // subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), - // 0.0, 1.0, subspace_vectors->get_values()); + randlib::rand_vector( + gen, + subspace_vectors->get_size()[0] * subspace_vectors->get_stride(), + 0.0, 1.0, subspace_vectors->get_values()); randlib::destroy(gen); } } diff --git a/cuda/base/curand_bindings.hpp b/cuda/base/curand_bindings.hpp index eb3dbee6b7b..80ceff2dacd 100644 --- a/cuda/base/curand_bindings.hpp +++ b/cuda/base/curand_bindings.hpp @@ -23,6 +23,17 @@ namespace cuda { * @ingroup curand */ namespace curand { +namespace detail { + + +template +inline int64 not_implemented(Args...) 
+{ + return static_cast(CURAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -77,6 +88,8 @@ GKO_BIND_CURAND_RANDOM_VECTOR(float, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(double, curandGenerateNormalDouble); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormal); GKO_BIND_CURAND_RANDOM_VECTOR(std::complex, curandGenerateNormalDouble); +template +GKO_BIND_CURAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_CURAND_RANDOM_VECTOR diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 915a3a8478e..a5531f2dc40 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -603,11 +603,13 @@ void initialize_subspace_vectors(std::shared_ptr exec, cgh.parallel_for(sycl::range<1>(n), [=](sycl::item<1> idx) { std::uint64_t offset = idx.get_linear_id(); oneapi::dpl::minstd_rand engine(seed, offset); - // oneapi::dpl::normal_distribution> - // distr(0, 1); - // auto res = distr(engine); + oneapi::dpl::normal_distribution< + typename ::gko::detail::arth_type< + remove_complex>::type> + distr(0, 1); + auto res = distr(engine); - // work[idx] = res; + work[idx] = res; }); }); } diff --git a/hip/base/hiprand_bindings.hip.hpp b/hip/base/hiprand_bindings.hip.hpp index 7cd76b9d320..76a7f4e79ce 100644 --- a/hip/base/hiprand_bindings.hip.hpp +++ b/hip/base/hiprand_bindings.hip.hpp @@ -29,6 +29,17 @@ namespace hip { * @ingroup hiprand */ namespace hiprand { +namespace detail { + + +template +inline int64 not_implemented(Args...) 
+{ + return static_cast(HIPRAND_STATUS_TYPE_ERROR); +} + + +} // namespace detail template @@ -83,6 +94,8 @@ GKO_BIND_HIPRAND_RANDOM_VECTOR(double, hiprandGenerateNormalDouble); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormal); GKO_BIND_HIPRAND_RANDOM_VECTOR(std::complex, hiprandGenerateNormalDouble); +template +GKO_BIND_HIPRAND_RANDOM_VECTOR(ValueType, detail::not_implemented); #undef GKO_BIND_HIPRAND_RANDOM_VECTOR From b171312bcc2f44a8a66b7f321e54b616a25cd1a3 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 12 Jan 2023 16:26:23 -0600 Subject: [PATCH 10/62] this version can compile/run complex on cuda114 stack size can not be determined statically issue some tests are failed --- common/cuda_hip/base/math.hpp | 12 ++++---- core/base/extended_float.hpp | 25 ++++++++++++---- core/test/utils/assertions.hpp | 10 +++++-- cuda/base/types.hpp | 47 +++++++++++++++++++++++++++++- include/ginkgo/core/base/math.hpp | 16 ++++++++-- include/ginkgo/core/base/types.hpp | 4 +-- 6 files changed, 94 insertions(+), 20 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 01d0910cc97..0278fbbc711 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -33,14 +33,14 @@ struct remove_complex_impl> { }; -template -struct is_complex_impl> - : public std::integral_constant {}; +// template +// struct is_complex_impl> +// : public std::integral_constant {}; -template -struct is_complex_or_scalar_impl> - : is_complex_or_scalar_impl {}; +// template +// struct is_complex_or_scalar_impl> +// : is_complex_or_scalar_impl {}; template diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index f12f53ed296..851f4e293e5 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -549,16 +549,18 @@ class complex { : complex(static_cast(real), static_cast(imag)) {} - template + template ::value>> complex(const T& real) : complex(static_cast(real)) {} - template - 
explicit complex(const complex& other) + template ::value>> + explicit complex(const complex& other) : complex(static_cast(other.real()), static_cast(other.imag())) {} + // explicit complex(const complex& other) = default; + value_type real() const noexcept { return real_; } value_type imag() const noexcept { return imag_; } @@ -570,6 +572,12 @@ class complex { static_cast(imag_)); } + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + template complex& operator=(const V& val) { @@ -626,13 +634,18 @@ class complex { template complex& operator*=(const complex& val) { - *this = *this * complex(val.real(), val.imag()); + auto tmp = real_; + real_ = real_ * val.real() - imag_ * val.imag(); + imag_ = tmp * val.imag() + imag_ * val.real(); return *this; } template complex& operator/=(const complex& val) { - *this = *this / complex(val.real(), val.imag()); + auto real = val.real(); + auto imag = val.imag(); + (*this) *= complex{val.real(), -val.imag()}; + (*this) /= (real * real + imag * imag); return *this; } @@ -708,6 +721,8 @@ struct numeric_limits { } }; +// complex using a template on operator= for any kind of complex, so we can +// do full specialization for half template <> inline complex& complex::operator=( const std::complex& a) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 7bdc71ea94e..823ed9f0102 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -680,15 +680,19 @@ ::testing::AssertionResult values_near, std::complex>( std::complex val2, double abs_error) { using T = std::complex; - const double diff = abs(T{val1} - T{val2}); + T Tval1; + T Tval2; + Tval1 = val1; + Tval2 = val2; + const double diff = abs(Tval1 - Tval2); if (diff <= abs_error) return ::testing::AssertionSuccess(); return ::testing::AssertionFailure() << "The difference between " << first_expression << " and " << second_expression << " is " << diff << ", which 
exceeds " << tolerance_expression << ", where\n" - << first_expression << " evaluates to " << T{val1} << ",\n" - << second_expression << " evaluates to " << T{val2} << ", and\n" + << first_expression << " evaluates to " << Tval1 << ",\n" + << second_expression << " evaluates to " << Tval2 << ", and\n" << tolerance_expression << " evaluates to " << abs_error << "."; } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 58b7f862d42..a3872dfaeb8 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -17,6 +17,43 @@ #include #include +// namespace std { +GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + +// } // namespace std + +namespace thrust { +template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return hypot(z.real(), z.imag()); +} + +} // namespace thrust + + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + auto result = lhs; \ + result _opeq rhs; \ + return result; \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { @@ -44,6 +81,13 @@ __device__ __forceinline__ bool is_nan(const __half& val) #endif +template <> +__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + + namespace kernels { namespace cuda { @@ -245,7 +289,7 @@ struct cuda_struct_member_type_impl { template struct cuda_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; }; template <> @@ -274,6 +318,7 @@ GKO_CUDA_DATA_TYPE(float, CUDA_R_32F); GKO_CUDA_DATA_TYPE(double, CUDA_R_64F); 
GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_32F); GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_64F); +GKO_CUDA_DATA_TYPE(std::complex, CUDA_C_16F); GKO_CUDA_DATA_TYPE(int32, CUDA_R_32I); GKO_CUDA_DATA_TYPE(int8, CUDA_R_8I); diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index bde1fe54d94..963fc4d0da0 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -27,7 +27,10 @@ class __half; - +namespace thrust { +template +class complex; +} namespace std { @@ -42,7 +45,8 @@ inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } inline std::complex sqrt(std::complex a) { - return std::complex(sqrt(std::complex(a))); + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); } @@ -178,6 +182,10 @@ template struct is_complex_impl> : public std::integral_constant {}; +template +struct is_complex_impl> + : public std::integral_constant {}; + template struct is_complex_or_scalar_impl : std::is_scalar {}; @@ -192,6 +200,10 @@ template struct is_complex_or_scalar_impl> : is_complex_or_scalar_impl {}; +template +struct is_complex_or_scalar_impl> + : is_complex_or_scalar_impl {}; + /** * template_converter is converting the template parameters of a class by diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 0c81361080a..a2739d8b8b7 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -401,9 +401,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_COMPILE_KERNEL -#define GKO_ADAPT_CPHF(_macro) \ - template <> \ - _macro GKO_NOT_IMPLEMENTED +#define GKO_ADAPT_CPHF(_macro) template _macro #else #define GKO_ADAPT_CPHF(_macro) template _macro #endif From 6c17701274319af0093db0357af7281496dde5cd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 13 Jan 2023 21:47:30 -0600 Subject: [PATCH 11/62] does not work for the other executor --- core/base/extended_float.hpp | 4 +++- core/test/utils/assertions.hpp | 8 +++---- cuda/base/types.hpp | 14 ++++++++---- hip/base/types.hip.hpp | 42 +++++++++++++++++++++++++++++++++- 4 files changed, 57 insertions(+), 11 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 851f4e293e5..a4a434a0239 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -544,7 +544,9 @@ class complex { const value_type& imag = value_type(0.f)) : real_(real), imag_(imag) {} - template + template ::value && + std::is_scalar::value>> explicit complex(const T& real, const U& imag) : complex(static_cast(real), static_cast(imag)) {} diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 823ed9f0102..d082d7992ec 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -680,10 +680,10 @@ ::testing::AssertionResult values_near, std::complex>( std::complex val2, double abs_error) { using T = std::complex; - T Tval1; - T Tval2; - Tval1 = val1; - Tval2 = val2; + // T{val1} calls the constructor of complex() -> which gives the + // complex(double/float) ambiguous + T Tval1 = val1; + T Tval2 = val2; const double diff = abs(Tval1 - Tval2); if (diff <= abs_error) return ::testing::AssertionSuccess(); diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index a3872dfaeb8..2dc22cf3712 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -17,7 +17,9 @@ #include #include -// namespace std { + +// thrust calls the c function not the function from std +// Maybe override the function from thrust directlry GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) { return hypot(static_cast(a), static_cast(b)); @@ -29,15 +31,18 @@ GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } -// } // namespace std namespace thrust { + + +// Dircetly 
call float versrion from here? template <> GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) { return hypot(z.real(), z.imag()); } + } // namespace thrust @@ -45,9 +50,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - auto result = lhs; \ - result _opeq rhs; \ - return result; \ + return thrust::complex{lhs} + thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) @@ -55,6 +58,7 @@ THRUST_HALF_FRIEND_OPERATOR(-, -=) THRUST_HALF_FRIEND_OPERATOR(*, *=) THRUST_HALF_FRIEND_OPERATOR(/, /=) + namespace gko { diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index fc582488cbb..a2299ab9f84 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -26,6 +26,46 @@ #include "common/cuda_hip/base/runtime.hpp" +// thrust calls the c function not the function from std +// Maybe override the function from thrust directlry +GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +{ + return hypot(static_cast(a), static_cast(b)); +} + +GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( + thrust::complex<__half> a) +{ + return sqrt(static_cast>(a)); +} + + +namespace thrust { + + +// Dircetly call float versrion from here? 
+template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return hypot(static_cast(z.real()), static_cast(z.imag())); +} + + +} // namespace thrust + +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} + thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + + namespace gko { #if defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 700 @@ -299,7 +339,7 @@ struct hip_struct_member_type_impl { template struct hip_struct_member_type_impl> { - using type = fake_complex; + using type = fake_complex::type>; }; template <> From cdbf0a01a316edab3bcf555d3997d2a77fc239f1 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 6 Feb 2023 22:41:38 +0100 Subject: [PATCH 12/62] fix complex issue and sqrt issue sqrt need to be global namespace to live with hip sqrt --- .../jacobi_generate_kernels.instantiate.cpp | 12 ----- core/base/extended_float.hpp | 22 +++++---- core/preconditioner/jacobi.cpp | 4 +- hip/base/types.hip.hpp | 46 ++++++++++--------- include/ginkgo/core/base/half.hpp | 24 ++++++++++ reference/matrix/ell_kernels.cpp | 3 +- reference/solver/idr_kernels.cpp | 12 +++-- test/solver/solver.cpp | 2 + 8 files changed, 74 insertions(+), 51 deletions(-) create mode 100644 include/ginkgo/core/base/half.hpp diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index e58a1ed4cf6..ca0c480c08e 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -23,18 +23,6 @@ namespace gko { -namespace detail { -#if !defined(__HIP_DEVICE_COMPILE__) -template <> 
-struct basic_float_traits<__half> { - using type = __half; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; -#endif -} // namespace detail namespace kernels { namespace GKO_DEVICE_NAMESPACE { namespace jacobi { diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index a4a434a0239..ee67fa65a70 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -24,6 +24,8 @@ #include +#else +class __half; #endif // __CUDA_ARCH__ @@ -71,7 +73,7 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) template <> struct basic_float_traits<__half> { using type = __half; @@ -80,7 +82,7 @@ struct basic_float_traits<__half> { static constexpr int exponent_bits = 5; static constexpr bool rounds_to_nearest = true; }; -#endif +// #endif template <> struct basic_float_traits { @@ -568,17 +570,17 @@ class complex { value_type imag() const noexcept { return imag_; } - operator std::complex() const noexcept + operator std::complex() const noexcept { - return std::complex(static_cast(real_), - static_cast(imag_)); + return std::complex(static_cast(real_), + static_cast(imag_)); } - operator std::complex() const noexcept - { - return std::complex(static_cast(real_), - static_cast(imag_)); - } + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } template complex& operator=(const V& val) diff --git a/core/preconditioner/jacobi.cpp b/core/preconditioner/jacobi.cpp index f3fbf799f59..1164f6c0cb8 100644 --- a/core/preconditioner/jacobi.cpp +++ b/core/preconditioner/jacobi.cpp @@ -330,7 +330,9 @@ void Jacobi::generate(const LinOp* system_matrix, ->extract_diagonal_linop()); auto diag_vt = 
::gko::detail::temporary_conversion>:: - template create>>( + template create>, + matrix::Diagonal>>>( diag.get()); if (!diag_vt) { GKO_NOT_SUPPORTED(system_matrix); diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index a2299ab9f84..8b397802e84 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -39,6 +39,28 @@ GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( return sqrt(static_cast>(a)); } +// __device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } +// __device__ __forceinline__ double sqrt(double val) { return ::sqrt(val); } +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} +__device__ __forceinline__ thrust::complex sqrt( + thrust::complex val) +{ + return thrust::sqrt(val); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +__device__ __forceinline__ __half sqrt(__half val) +{ + return sqrt(static_cast(val)); +} +#else +__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } +#endif + namespace thrust { @@ -98,29 +120,9 @@ __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #endif -#if defined(__HIPCC__) -__device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } -__device__ __forceinline__ double sqrt(double val) { return sqrt(val); } -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} +// #if defined(__HIPCC__) -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 -__device__ __forceinline__ __half sqrt(__half val) -{ - return sqrt(static_cast(val)); -} -#else -__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } -#endif -#endif +// #endif namespace kernels { diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp new file mode 100644 index 00000000000..7ef38f68247 --- /dev/null +++ 
b/include/ginkgo/core/base/half.hpp @@ -0,0 +1,24 @@ +#ifndef GKO_BASE_HALF_HPP_ +#define GKO_BASE_HALF_HPP_ +#include +#include + + +#ifdef __CUDA_ARCH__ + + +#include + + +#elif defined(__HIP_DEVICE_COMPILE__) + + +#include + + +#endif // __CUDA_ARCH__ + + +namespace gko {} + +#endif // GKO_BASE_HALF_HPP_ diff --git a/reference/matrix/ell_kernels.cpp b/reference/matrix/ell_kernels.cpp index 1fa37c4e250..00e83a971de 100644 --- a/reference/matrix/ell_kernels.cpp +++ b/reference/matrix/ell_kernels.cpp @@ -107,7 +107,8 @@ void advanced_spmv(std::shared_ptr exec, for (size_type j = 0; j < c->get_size()[1]; j++) { for (size_type row = 0; row < a->get_size()[0]; row++) { - arithmetic_type result = c->at(row, j); + arithmetic_type result = + static_cast(c->at(row, j)); result *= beta_val; for (size_type i = 0; i < num_stored_elements_per_row; i++) { arithmetic_type val = a_vals(row + i * stride); diff --git a/reference/solver/idr_kernels.cpp b/reference/solver/idr_kernels.cpp index df8f67075fb..e52f26d2f37 100644 --- a/reference/solver/idr_kernels.cpp +++ b/reference/solver/idr_kernels.cpp @@ -122,15 +122,17 @@ void initialize(std::shared_ptr exec, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - // auto dist = std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename ::gko::detail::arth_type>::type>( + 0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - // for (size_type col = 0; col < num_cols; col++) { - // subspace_vectors->at(row, col) = - // // get_rand_value(dist, gen); - // } + for (size_type col = 0; col < num_cols; col++) { + subspace_vectors->at(row, col) = + get_rand_value(dist, gen); + } } for (size_type i = 0; i < row; i++) { diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index bc2ce343756..2ef82997c9e 
100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -955,6 +955,7 @@ TYPED_TEST(Solver, MixedApplyIsEquivalentToRef) solver.ref->apply(b.ref, x.ref); solver.dev->apply(b.dev, x.dev); + // TODO: in double with half, 4 iterations leads inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); }); @@ -973,6 +974,7 @@ TYPED_TEST(Solver, MixedAdvancedApplyIsEquivalentToRef) solver.ref->apply(alpha.ref, b.ref, beta.ref, x.ref); solver.dev->apply(alpha.dev, b.dev, beta.dev, x.dev); + // TODO: in double with half, 4 iterations leads inf GKO_ASSERT_MTX_NEAR(x.ref, x.dev, this->mixed_tol(x)); }); }); From 209c799d82bdc06a69e7669a296e2ab9f411459c Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 6 Feb 2023 23:18:22 +0100 Subject: [PATCH 13/62] try fix the compilation issue from MSVC and MacOS It seems to use complex version even using half only --- core/base/extended_float.hpp | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index ee67fa65a70..275943f9b91 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -653,6 +653,23 @@ class complex { return *this; } +// It's for MacOS. 
+// TODO: check whether mac compiler always use complex version even when real +// half +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_HALF_OPERATOR(+, +=) + COMPLEX_HALF_OPERATOR(-, -=) + COMPLEX_HALF_OPERATOR(*, *=) + COMPLEX_HALF_OPERATOR(/, /=) + private: value_type real_; value_type imag_; @@ -736,6 +753,17 @@ inline complex& complex::operator=( return *this; } +// For MSVC +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + } // namespace std From 75b54fa45282b6dbf0854d00ac6d4db495eecf8a Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 7 Feb 2023 13:08:01 +0100 Subject: [PATCH 14/62] move the half to public and use sycl::half for dpcpp --- core/base/extended_float.hpp | 608 -------------------------- include/ginkgo/core/base/half.hpp | 679 ++++++++++++++++++++++++++++- include/ginkgo/core/base/types.hpp | 10 +- 3 files changed, 683 insertions(+), 614 deletions(-) diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index 275943f9b91..f6b2e6e5309 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -32,418 +32,6 @@ class __half; namespace gko { -template -class truncated; - - -namespace detail { - - -template -struct uint_of_impl {}; - -template -struct uint_of_impl> { - using type = uint16; -}; - -template -struct uint_of_impl> { - using type = uint32; -}; - -template -struct uint_of_impl> { - using type = uint64; -}; - -template -using uint_of = typename uint_of_impl::type; - - -template -struct basic_float_traits {}; - -template <> -struct basic_float_traits { - using type = float16; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool 
rounds_to_nearest = true; -}; - -// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) -template <> -struct basic_float_traits<__half> { - using type = __half; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 10; - static constexpr int exponent_bits = 5; - static constexpr bool rounds_to_nearest = true; -}; -// #endif - -template <> -struct basic_float_traits { - using type = float32; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 23; - static constexpr int exponent_bits = 8; - static constexpr bool rounds_to_nearest = true; -}; - -template <> -struct basic_float_traits { - using type = float64; - static constexpr int sign_bits = 1; - static constexpr int significand_bits = 52; - static constexpr int exponent_bits = 11; - static constexpr bool rounds_to_nearest = true; -}; - -template -struct basic_float_traits> { - using type = truncated; - static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; - static constexpr int exponent_bits = - ComponentId == 0 ? basic_float_traits::exponent_bits : 0; - static constexpr int significand_bits = - ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1 - : sizeof(type) * byte_size; - static constexpr bool rounds_to_nearest = false; -}; - - -template -constexpr UintType create_ones(int n) -{ - return (n == sizeof(UintType) * byte_size ? 
static_cast(0) - : static_cast(1) << n) - - static_cast(1); -} - -template -struct float_traits { - using type = typename basic_float_traits::type; - using bits_type = uint_of; - static constexpr int sign_bits = basic_float_traits::sign_bits; - static constexpr int significand_bits = - basic_float_traits::significand_bits; - static constexpr int exponent_bits = basic_float_traits::exponent_bits; - static constexpr bits_type significand_mask = - create_ones(significand_bits); - static constexpr bits_type exponent_mask = - create_ones(significand_bits + exponent_bits) - - significand_mask; - static constexpr bits_type bias_mask = - create_ones(significand_bits + exponent_bits - 1) - - significand_mask; - static constexpr bits_type sign_mask = - create_ones(sign_bits + significand_bits + exponent_bits) - - exponent_mask - significand_mask; - static constexpr bool rounds_to_nearest = - basic_float_traits::rounds_to_nearest; - - static constexpr auto eps = - 1.0 / (1ll << (significand_bits + rounds_to_nearest)); - - static constexpr bool is_inf(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) == bits_type{}; - } - - static constexpr bool is_nan(bits_type data) - { - return (data & exponent_mask) == exponent_mask && - (data & significand_mask) != bits_type{}; - } - - static constexpr bool is_denom(bits_type data) - { - return (data & exponent_mask) == bits_type{}; - } -}; - - -template -struct precision_converter; - -// upcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits <= - result_traits::exponent_bits && - source_traits::significand_bits <= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision 
than ResultType"); - - static constexpr int significand_offset = - result_traits::significand_bits - source_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = result_traits::exponent_bits - - source_traits::exponent_bits + - exponent_offset; - static constexpr result_bits bias_change = - result_traits::bias_mask - - (static_cast(source_traits::bias_mask) << exponent_offset); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast(data & source_traits::significand_mask) - << significand_offset; - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return update_bias( - static_cast(data & source_traits::exponent_mask) - << exponent_offset); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast(data & source_traits::sign_mask) - << sign_offset; - } - -private: - static constexpr result_bits update_bias(result_bits data) noexcept - { - return data == typename result_traits::bits_type{} ? 
data - : data + bias_change; - } -}; - -// downcasting implementation details -template -struct precision_converter { - using source_traits = float_traits; - using result_traits = float_traits; - using source_bits = typename source_traits::bits_type; - using result_bits = typename result_traits::bits_type; - - static_assert(source_traits::exponent_bits >= - result_traits::exponent_bits && - source_traits::significand_bits >= - result_traits::significand_bits, - "SourceType has to have both lower range and precision or " - "higher range and precision than ResultType"); - - static constexpr int significand_offset = - source_traits::significand_bits - result_traits::significand_bits; - static constexpr int exponent_offset = significand_offset; - static constexpr int sign_offset = source_traits::exponent_bits - - result_traits::exponent_bits + - exponent_offset; - static constexpr source_bits bias_change = - (source_traits::bias_mask >> exponent_offset) - - static_cast(result_traits::bias_mask); - - static constexpr result_bits shift_significand(source_bits data) noexcept - { - return static_cast( - (data & source_traits::significand_mask) >> significand_offset); - } - - static constexpr result_bits shift_exponent(source_bits data) noexcept - { - return static_cast(update_bias( - (data & source_traits::exponent_mask) >> exponent_offset)); - } - - static constexpr result_bits shift_sign(source_bits data) noexcept - { - return static_cast((data & source_traits::sign_mask) >> - sign_offset); - } - -private: - static constexpr source_bits update_bias(source_bits data) noexcept - { - return data <= bias_change ? typename source_traits::bits_type{} - : limit_exponent(data - bias_change); - } - - static constexpr source_bits limit_exponent(source_bits data) noexcept - { - return data >= static_cast(result_traits::exponent_mask) - ? 
static_cast(result_traits::exponent_mask) - : data; - } -}; - - -} // namespace detail - - -/** - * A class providing basic support for half precision floating point types. - * - * For now the only features are reduced storage compared to single precision - * and conversions from and to single precision floating point type. - */ -class half { -public: - GKO_ATTRIBUTES half() noexcept = default; - - template ::value>> - GKO_ATTRIBUTES half(const T val) - { - this->float2half(static_cast(val)); - } - - GKO_ATTRIBUTES half(const half& val) = default; - - template - GKO_ATTRIBUTES half& operator=(const V val) - { - this->float2half(static_cast(val)); - return *this; - } - - GKO_ATTRIBUTES operator float() const noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(reinterpret_cast(data_)); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto bits = half2float(data_); - return reinterpret_cast(bits); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - // can not use half operator _op(const half) for half + half - // operation will cast it to float and then do float operation such that it - // becomes float in the end. -#define HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ - { \ - return static_cast(static_cast(lhf) \ - _op static_cast(rhf)); \ - } \ - GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ - { \ - auto result = *this _op hf; \ - this->float2half(result); \ - return *this; \ - } - HALF_OPERATOR(+, +=) - HALF_OPERATOR(-, -=) - HALF_OPERATOR(*, *=) - HALF_OPERATOR(/, /=) - - // Do operation with different type - // If it is floating point, using floating point as type. 
- // If it is integer, using half as type -#define HALF_FRIEND_OPERATOR(_op, _opeq) \ - template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ - !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ - operator _op(const half hf, const T val) \ - { \ - using type = \ - typename std::conditional::value, T, \ - half>::type; \ - auto result = static_cast(hf); \ - result _opeq static_cast(val); \ - return result; \ - } \ - template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ - !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ - operator _op(const T val, const half hf) \ - { \ - using type = \ - typename std::conditional::value, T, \ - half>::type; \ - auto result = static_cast(val); \ - result _opeq static_cast(hf); \ - return result; \ - } - - HALF_FRIEND_OPERATOR(+, +=) - HALF_FRIEND_OPERATOR(-, -=) - HALF_FRIEND_OPERATOR(*, *=) - HALF_FRIEND_OPERATOR(/, /=) - - // the negative - GKO_ATTRIBUTES half operator-() const - { - auto val = 0.0f - *this; - return half(val); - } - -private: - using f16_traits = detail::float_traits; - using f32_traits = detail::float_traits; - - // TODO: do we really need this one? - // Without it, everything can be constexpr, which might make stuff easier. 
- GKO_ATTRIBUTES void float2half(float val) noexcept - { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - } - - static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept - { - using conv = detail::precision_converter; - if (f32_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask; - } else if (f32_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f16_traits::exponent_mask | - f16_traits::significand_mask; - } else { - const auto exp = conv::shift_exponent(data_); - if (f16_traits::is_inf(exp)) { - return conv::shift_sign(data_) | exp; - } else if (f16_traits::is_denom(exp)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | exp | - conv::shift_significand(data_); - } - } - } - - static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept - { - using conv = detail::precision_converter; - if (f16_traits::is_inf(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask; - } else if (f16_traits::is_nan(data_)) { - return conv::shift_sign(data_) | f32_traits::exponent_mask | - f32_traits::significand_mask; - } else if (f16_traits::is_denom(data_)) { - // TODO: handle denormals - return conv::shift_sign(data_); - } else { - return conv::shift_sign(data_) | conv::shift_exponent(data_) | - conv::shift_significand(data_); - } - } - - uint16 data_; -}; - - /** * This template implements the truncated (or split) storage of a floating point * type. 
@@ -537,145 +125,6 @@ class truncated { namespace std { -template <> -class complex { -public: - using value_type = gko::half; - - complex(const value_type& real = value_type(0.f), - const value_type& imag = value_type(0.f)) - : real_(real), imag_(imag) - {} - template ::value && - std::is_scalar::value>> - explicit complex(const T& real, const U& imag) - : complex(static_cast(real), static_cast(imag)) - {} - - template ::value>> - complex(const T& real) : complex(static_cast(real)) - {} - - template ::value>> - explicit complex(const complex& other) - : complex(static_cast(other.real()), - static_cast(other.imag())) - {} - - // explicit complex(const complex& other) = default; - - value_type real() const noexcept { return real_; } - - value_type imag() const noexcept { return imag_; } - - - operator std::complex() const noexcept - { - return std::complex(static_cast(real_), - static_cast(imag_)); - } - - // operator std::complex() const noexcept - // { - // return std::complex(static_cast(real_), - // static_cast(imag_)); - // } - - template - complex& operator=(const V& val) - { - real_ = val; - imag_ = value_type(); - return *this; - } - - template - complex& operator=(const std::complex& val) - { - real_ = val.real(); - imag_ = val.imag(); - return *this; - } - - complex& operator+=(const value_type& real) - { - real_ += real; - return *this; - } - complex& operator-=(const value_type& real) - { - real_ -= real; - return *this; - } - complex& operator*=(const value_type& real) - { - real_ *= real; - imag_ *= real; - return *this; - } - complex& operator/=(const value_type& real) - { - real_ /= real; - imag_ /= real; - return *this; - } - - template - complex& operator+=(const complex& val) - { - real_ += val.real(); - imag_ += val.imag(); - return *this; - } - template - complex& operator-=(const complex& val) - { - real_ -= val.real(); - imag_ -= val.imag(); - return *this; - } - template - complex& operator*=(const complex& val) - { - auto tmp = real_; - 
real_ = real_ * val.real() - imag_ * val.imag(); - imag_ = tmp * val.imag() + imag_ * val.real(); - return *this; - } - template - complex& operator/=(const complex& val) - { - auto real = val.real(); - auto imag = val.imag(); - (*this) *= complex{val.real(), -val.imag()}; - (*this) /= (real * real + imag * imag); - return *this; - } - -// It's for MacOS. -// TODO: check whether mac compiler always use complex version even when real -// half -#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend complex operator _op( \ - const complex lhf, const complex rhf) \ - { \ - auto a = lhf; \ - a _opeq rhf; \ - return a; \ - } - - COMPLEX_HALF_OPERATOR(+, +=) - COMPLEX_HALF_OPERATOR(-, -=) - COMPLEX_HALF_OPERATOR(*, *=) - COMPLEX_HALF_OPERATOR(/, /=) - -private: - value_type real_; - value_type imag_; -}; - - template class complex> { public: @@ -707,63 +156,6 @@ class complex> { }; -template <> -struct numeric_limits { - static constexpr bool is_specialized{true}; - static constexpr bool is_signed{true}; - static constexpr bool is_integer{false}; - static constexpr bool is_exact{false}; - static constexpr bool is_bounded{true}; - static constexpr bool is_modulo{false}; - static constexpr int digits{ - gko::detail::float_traits::significand_bits + 1}; - // 3/10 is approx. log_10(2) - static constexpr int digits10{digits * 3 / 10}; - - // Note: gko::half can't return gko::half here because it does not have - // a constexpr constructor. 
- static constexpr float epsilon() - { - return gko::detail::float_traits::eps; - } - - static constexpr float infinity() - { - return numeric_limits::infinity(); - } - - static constexpr float min() { return numeric_limits::min(); } - - static constexpr float max() { return numeric_limits::max(); } - - static constexpr float quiet_NaN() - { - return numeric_limits::quiet_NaN(); - } -}; - -// complex using a template on operator= for any kind of complex, so we can -// do full specialization for half -template <> -inline complex& complex::operator=( - const std::complex& a) -{ - complex t(a.real(), a.imag()); - operator=(t); - return *this; -} - -// For MSVC -template <> -inline complex& complex::operator=( - const std::complex& a) -{ - complex t(a.real(), a.imag()); - operator=(t); - return *this; -} - - } // namespace std diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 7ef38f68247..09b3c7a0686 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -1,9 +1,50 @@ -#ifndef GKO_BASE_HALF_HPP_ -#define GKO_BASE_HALF_HPP_ +/************************************************************* +Copyright (c) 2017-2023, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_HALF_HPP_ +#define GKO_PUBLIC_CORE_BASE_HALF_HPP_ + + #include #include +#include +#include + +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif + #ifdef __CUDA_ARCH__ @@ -16,9 +57,639 @@ #include +#else + + +class __half; + + #endif // __CUDA_ARCH__ -namespace gko {} +namespace gko { + + +template +class truncated; + + +namespace detail { + + +template +struct uint_of_impl {}; + +template +struct uint_of_impl> { + using type = uint16; +}; + +template +struct uint_of_impl> { + using type = uint32; +}; + +template +struct uint_of_impl> { + using type = uint64; +}; + +template +using uint_of = typename uint_of_impl::type; + + +template +struct basic_float_traits {}; + +template <> +struct basic_float_traits { + using type = float16; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; + +// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +template <> +struct basic_float_traits<__half> { + using type = __half; + static constexpr int sign_bits = 1; + static constexpr int 
significand_bits = 10; + static constexpr int exponent_bits = 5; + static constexpr bool rounds_to_nearest = true; +}; +// #endif + +template <> +struct basic_float_traits { + using type = float32; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 23; + static constexpr int exponent_bits = 8; + static constexpr bool rounds_to_nearest = true; +}; + +template <> +struct basic_float_traits { + using type = float64; + static constexpr int sign_bits = 1; + static constexpr int significand_bits = 52; + static constexpr int exponent_bits = 11; + static constexpr bool rounds_to_nearest = true; +}; + +template +struct basic_float_traits> { + using type = truncated; + static constexpr int sign_bits = ComponentId == 0 ? 1 : 0; + static constexpr int exponent_bits = + ComponentId == 0 ? basic_float_traits::exponent_bits : 0; + static constexpr int significand_bits = + ComponentId == 0 ? sizeof(type) * byte_size - exponent_bits - 1 + : sizeof(type) * byte_size; + static constexpr bool rounds_to_nearest = false; +}; + + +template +constexpr UintType create_ones(int n) +{ + return (n == sizeof(UintType) * byte_size ? 
static_cast(0) + : static_cast(1) << n) - + static_cast(1); +} + +template +struct float_traits { + using type = typename basic_float_traits::type; + using bits_type = uint_of; + static constexpr int sign_bits = basic_float_traits::sign_bits; + static constexpr int significand_bits = + basic_float_traits::significand_bits; + static constexpr int exponent_bits = basic_float_traits::exponent_bits; + static constexpr bits_type significand_mask = + create_ones(significand_bits); + static constexpr bits_type exponent_mask = + create_ones(significand_bits + exponent_bits) - + significand_mask; + static constexpr bits_type bias_mask = + create_ones(significand_bits + exponent_bits - 1) - + significand_mask; + static constexpr bits_type sign_mask = + create_ones(sign_bits + significand_bits + exponent_bits) - + exponent_mask - significand_mask; + static constexpr bool rounds_to_nearest = + basic_float_traits::rounds_to_nearest; + + static constexpr auto eps = + 1.0 / (1ll << (significand_bits + rounds_to_nearest)); + + static constexpr bool is_inf(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) == bits_type{}; + } + + static constexpr bool is_nan(bits_type data) + { + return (data & exponent_mask) == exponent_mask && + (data & significand_mask) != bits_type{}; + } + + static constexpr bool is_denom(bits_type data) + { + return (data & exponent_mask) == bits_type{}; + } +}; + + +template +struct precision_converter; + +// upcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + static_assert(source_traits::exponent_bits <= + result_traits::exponent_bits && + source_traits::significand_bits <= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision 
than ResultType"); + + static constexpr int significand_offset = + result_traits::significand_bits - source_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = result_traits::exponent_bits - + source_traits::exponent_bits + + exponent_offset; + static constexpr result_bits bias_change = + result_traits::bias_mask - + (static_cast(source_traits::bias_mask) << exponent_offset); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast(data & source_traits::significand_mask) + << significand_offset; + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return update_bias( + static_cast(data & source_traits::exponent_mask) + << exponent_offset); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast(data & source_traits::sign_mask) + << sign_offset; + } + +private: + static constexpr result_bits update_bias(result_bits data) noexcept + { + return data == typename result_traits::bits_type{} ? 
data + : data + bias_change; + } +}; + +// downcasting implementation details +template +struct precision_converter { + using source_traits = float_traits; + using result_traits = float_traits; + using source_bits = typename source_traits::bits_type; + using result_bits = typename result_traits::bits_type; + + static_assert(source_traits::exponent_bits >= + result_traits::exponent_bits && + source_traits::significand_bits >= + result_traits::significand_bits, + "SourceType has to have both lower range and precision or " + "higher range and precision than ResultType"); + + static constexpr int significand_offset = + source_traits::significand_bits - result_traits::significand_bits; + static constexpr int exponent_offset = significand_offset; + static constexpr int sign_offset = source_traits::exponent_bits - + result_traits::exponent_bits + + exponent_offset; + static constexpr source_bits bias_change = + (source_traits::bias_mask >> exponent_offset) - + static_cast(result_traits::bias_mask); + + static constexpr result_bits shift_significand(source_bits data) noexcept + { + return static_cast( + (data & source_traits::significand_mask) >> significand_offset); + } + + static constexpr result_bits shift_exponent(source_bits data) noexcept + { + return static_cast(update_bias( + (data & source_traits::exponent_mask) >> exponent_offset)); + } + + static constexpr result_bits shift_sign(source_bits data) noexcept + { + return static_cast((data & source_traits::sign_mask) >> + sign_offset); + } + +private: + static constexpr source_bits update_bias(source_bits data) noexcept + { + return data <= bias_change ? typename source_traits::bits_type{} + : limit_exponent(data - bias_change); + } + + static constexpr source_bits limit_exponent(source_bits data) noexcept + { + return data >= static_cast(result_traits::exponent_mask) + ? 
static_cast(result_traits::exponent_mask) + : data; + } +}; + + +} // namespace detail + +#ifdef SYCL_LANGUAGE_VERSION +using half = sycl::half; +#else +/** + * A class providing basic support for half precision floating point types. + * + * For now the only features are reduced storage compared to single precision + * and conversions from and to single precision floating point type. + */ +class half { +public: + GKO_ATTRIBUTES half() noexcept = default; + + template ::value>> + GKO_ATTRIBUTES half(const T val) + { + this->float2half(static_cast(val)); + } + + GKO_ATTRIBUTES half(const half& val) = default; + + template + GKO_ATTRIBUTES half& operator=(const V val) + { + this->float2half(static_cast(val)); + return *this; + } + + GKO_ATTRIBUTES operator float() const noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(reinterpret_cast(data_)); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto bits = half2float(data_); + return reinterpret_cast(bits); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + // can not use half operator _op(const half) for half + half + // operation will cast it to float and then do float operation such that it + // becomes float in the end. +#define HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ + } + HALF_OPERATOR(+, +=) + HALF_OPERATOR(-, -=) + HALF_OPERATOR(*, *=) + HALF_OPERATOR(/, /=) + + // Do operation with different type + // If it is floating point, using floating point as type. 
+ // If it is integer, using half as type +#define HALF_FRIEND_OPERATOR(_op, _opeq) \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const half hf, const T val) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(hf); \ + result _opeq static_cast(val); \ + return result; \ + } \ + template \ + GKO_ATTRIBUTES friend std::enable_if_t< \ + !std::is_same::value && std::is_scalar::value, \ + typename std::conditional::value, T, \ + half>::type> \ + operator _op(const T val, const half hf) \ + { \ + using type = \ + typename std::conditional::value, T, \ + half>::type; \ + auto result = static_cast(val); \ + result _opeq static_cast(hf); \ + return result; \ + } + + HALF_FRIEND_OPERATOR(+, +=) + HALF_FRIEND_OPERATOR(-, -=) + HALF_FRIEND_OPERATOR(*, *=) + HALF_FRIEND_OPERATOR(/, /=) + + // the negative + GKO_ATTRIBUTES half operator-() const + { + auto val = 0.0f - *this; + return half(val); + } + +private: + using f16_traits = detail::float_traits; + using f32_traits = detail::float_traits; + + // TODO: do we really need this one? + // Without it, everything can be constexpr, which might make stuff easier. 
+ GKO_ATTRIBUTES void float2half(float val) noexcept + { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + const auto tmp = __float2half_rn(val); + data_ = reinterpret_cast(tmp); +#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + data_ = float2half(reinterpret_cast(val)); +#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + } + + static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept + { + using conv = detail::precision_converter; + if (f32_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask; + } else if (f32_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f16_traits::exponent_mask | + f16_traits::significand_mask; + } else { + const auto exp = conv::shift_exponent(data_); + if (f16_traits::is_inf(exp)) { + return conv::shift_sign(data_) | exp; + } else if (f16_traits::is_denom(exp)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + } + } + } + + static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept + { + using conv = detail::precision_converter; + if (f16_traits::is_inf(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask; + } else if (f16_traits::is_nan(data_)) { + return conv::shift_sign(data_) | f32_traits::exponent_mask | + f32_traits::significand_mask; + } else if (f16_traits::is_denom(data_)) { + // TODO: handle denormals + return conv::shift_sign(data_); + } else { + return conv::shift_sign(data_) | conv::shift_exponent(data_) | + conv::shift_significand(data_); + } + } + + uint16 data_; +}; +#endif + + +} // namespace gko + + +namespace std { + + +template <> +class complex { +public: + using value_type = gko::half; + + complex(const value_type& real = value_type(0.f), + const value_type& imag = value_type(0.f)) + : real_(real), imag_(imag) + {} + template ::value && + std::is_scalar::value>> + explicit complex(const T& real, 
const U& imag) + : complex(static_cast(real), static_cast(imag)) + {} + + template ::value>> + complex(const T& real) : complex(static_cast(real)) + {} + + template ::value>> + explicit complex(const complex& other) + : complex(static_cast(other.real()), + static_cast(other.imag())) + {} + + // explicit complex(const complex& other) = default; + + value_type real() const noexcept { return real_; } + + value_type imag() const noexcept { return imag_; } + + + operator std::complex() const noexcept + { + return std::complex(static_cast(real_), + static_cast(imag_)); + } + + // operator std::complex() const noexcept + // { + // return std::complex(static_cast(real_), + // static_cast(imag_)); + // } + + template + complex& operator=(const V& val) + { + real_ = val; + imag_ = value_type(); + return *this; + } + + template + complex& operator=(const std::complex& val) + { + real_ = val.real(); + imag_ = val.imag(); + return *this; + } + + complex& operator+=(const value_type& real) + { + real_ += real; + return *this; + } + complex& operator-=(const value_type& real) + { + real_ -= real; + return *this; + } + complex& operator*=(const value_type& real) + { + real_ *= real; + imag_ *= real; + return *this; + } + complex& operator/=(const value_type& real) + { + real_ /= real; + imag_ /= real; + return *this; + } + + template + complex& operator+=(const complex& val) + { + real_ += val.real(); + imag_ += val.imag(); + return *this; + } + template + complex& operator-=(const complex& val) + { + real_ -= val.real(); + imag_ -= val.imag(); + return *this; + } + template + complex& operator*=(const complex& val) + { + auto tmp = real_; + real_ = real_ * val.real() - imag_ * val.imag(); + imag_ = tmp * val.imag() + imag_ * val.real(); + return *this; + } + template + complex& operator/=(const complex& val) + { + auto real = val.real(); + auto imag = val.imag(); + (*this) *= complex{val.real(), -val.imag()}; + (*this) /= (real * real + imag * imag); + return *this; + } + +// 
It's for MacOS. +// TODO: check whether mac compiler always use complex version even when real +// half +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES friend complex operator _op( \ + const complex lhf, const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ + } + + COMPLEX_HALF_OPERATOR(+, +=) + COMPLEX_HALF_OPERATOR(-, -=) + COMPLEX_HALF_OPERATOR(*, *=) + COMPLEX_HALF_OPERATOR(/, /=) + +private: + value_type real_; + value_type imag_; +}; + +#ifndef SYCL_LANGUAGE_VERSION +template <> +struct numeric_limits { + static constexpr bool is_specialized{true}; + static constexpr bool is_signed{true}; + static constexpr bool is_integer{false}; + static constexpr bool is_exact{false}; + static constexpr bool is_bounded{true}; + static constexpr bool is_modulo{false}; + static constexpr int digits{ + gko::detail::float_traits::significand_bits + 1}; + // 3/10 is approx. log_10(2) + static constexpr int digits10{digits * 3 / 10}; + + // Note: gko::half can't return gko::half here because it does not have + // a constexpr constructor. 
+ static constexpr float epsilon() + { + return gko::detail::float_traits::eps; + } + + static constexpr float infinity() + { + return numeric_limits::infinity(); + } + + static constexpr float min() { return numeric_limits::min(); } + + static constexpr float max() { return numeric_limits::max(); } + + static constexpr float quiet_NaN() + { + return numeric_limits::quiet_NaN(); + } +}; + +#endif + +// complex using a template on operator= for any kind of complex, so we can +// do full specialization for half +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +// For MSVC +template <> +inline complex& complex::operator=( + const std::complex& a) +{ + complex t(a.real(), a.imag()); + operator=(t); + return *this; +} + + +} // namespace std + -#endif // GKO_BASE_HALF_HPP_ +#endif // GKO_PUBLIC_CORE_BASE_HALF_HPP_ diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index a2739d8b8b7..272c71d3638 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -22,6 +22,9 @@ #include #endif // __HIPCC__ +#ifdef SYCL_LANGUAGE_VERSION +#include +#endif // Macros for handling different compilers / architectures uniformly #if defined(__CUDACC__) || defined(__HIPCC__) @@ -137,8 +140,11 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; - +#ifdef SYCL_LANGUAGE_VERSION +using half = sycl::half; +#else class half; +#endif /** @@ -858,5 +864,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko -#include "core/base/extended_float.hpp" +#include #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ From 48ea3381099d52bfecb4b9ab7d7c243a4c921c19 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Tue, 7 Feb 2023 15:36:55 +0100 Subject: [PATCH 15/62] limit the next precision in test and benchmark next_precision = float not half --- benchmark/utils/types.hpp | 29 +++++++++++++ core/test/utils.hpp | 30 +++++++++++++ reference/test/base/combination.cpp | 8 ++-- reference/test/base/composition.cpp | 8 ++-- reference/test/base/perturbation.cpp | 8 ++-- reference/test/matrix/coo_kernels.cpp | 22 ++++------ reference/test/matrix/csr_kernels.cpp | 16 ++++--- reference/test/matrix/diagonal_kernels.cpp | 12 +++--- reference/test/matrix/ell_kernels.cpp | 42 +++++++++---------- reference/test/matrix/fbcsr_kernels.cpp | 8 ++-- reference/test/matrix/hybrid_kernels.cpp | 16 ++++--- reference/test/matrix/identity.cpp | 2 +- reference/test/matrix/sellp_kernels.cpp | 12 +++--- .../test/matrix/sparsity_csr_kernels.cpp | 11 +++-- reference/test/preconditioner/ic.cpp | 10 ++--- reference/test/preconditioner/ilu.cpp | 10 ++--- reference/test/preconditioner/jacobi.cpp | 2 +- .../test/preconditioner/jacobi_kernels.cpp | 8 ++-- reference/test/reorder/scaled_reordered.cpp | 4 +- reference/test/solver/bicg_kernels.cpp | 10 ++--- reference/test/solver/bicgstab_kernels.cpp | 10 ++--- reference/test/solver/cb_gmres_kernels.cpp | 10 ++--- reference/test/solver/cg_kernels.cpp | 10 ++--- reference/test/solver/cgs_kernels.cpp | 10 ++--- reference/test/solver/fcg_kernels.cpp | 10 ++--- reference/test/solver/gmres_kernels.cpp | 10 ++--- reference/test/solver/idr_kernels.cpp | 10 ++--- reference/test/solver/ir_kernels.cpp | 8 ++-- reference/test/solver/lower_trs_kernels.cpp | 8 ++-- reference/test/solver/multigrid_kernels.cpp | 2 +- reference/test/solver/upper_trs_kernels.cpp | 8 ++-- test/matrix/matrix.cpp | 2 +- test/mpi/matrix.cpp | 4 +- test/mpi/solver/solver.cpp | 4 +- test/mpi/vector.cpp | 4 +- test/solver/solver.cpp | 2 +- 36 files changed, 213 insertions(+), 167 deletions(-) diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp index de7a8a0e45e..eadb8650463 
100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -38,4 +38,33 @@ using etype = double; using rc_etype = gko::remove_complex; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + #endif // GKO_BENCHMARK_UTILS_TYPES_HPP_ diff --git a/core/test/utils.hpp b/core/test/utils.hpp index d6d999b1d94..bfdcf1ee35a 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -454,4 +454,34 @@ struct TupleTypenameNameGenerator { }; +namespace detail { + + +// singly linked list of all our supported precisions +template +struct next_precision_impl {}; + +template <> +struct next_precision_impl { + using type = double; +}; + +template <> +struct next_precision_impl { + using type = float; +}; + + +template +struct next_precision_impl> { + using type = std::complex::type>; +}; + + +} // namespace detail + +template +using next_precision = typename detail::next_precision_impl::type; + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/reference/test/base/combination.cpp b/reference/test/base/combination.cpp index aea578f4e7e..c95fb11fd46 100644 --- a/reference/test/base/combination.cpp +++ b/reference/test/base/combination.cpp @@ -114,7 +114,7 @@ TYPED_TEST(Combination, AppliesToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -156,7 +156,7 @@ TYPED_TEST(Combination, AppliesToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::to_complex>; 
+ using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -200,7 +200,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedVector) cmb = [ 8 7 ] [ 5 4 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmb = gko::Combination::create( this->coefficients[0], this->operators[0], this->coefficients[1], @@ -248,7 +248,7 @@ TYPED_TEST(Combination, AppliesLinearCombinationToMixedComplexVector) cmb = [ 8 7 ] [ 5 4 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmb = gko::Combination::create( diff --git a/reference/test/base/composition.cpp b/reference/test/base/composition.cpp index f736edb53f9..5722e5edd0f 100644 --- a/reference/test/base/composition.cpp +++ b/reference/test/base/composition.cpp @@ -142,7 +142,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize({1.0, 2.0}, this->exec); @@ -182,7 +182,7 @@ TYPED_TEST(Composition, AppliesSingleToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision>; + using value_type = next_precision>; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto x = gko::initialize( @@ -222,7 +222,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Composition::create(this->product); auto alpha = gko::initialize({3.0}, this->exec); @@ -267,7 
+267,7 @@ TYPED_TEST(Composition, AppliesSingleLinearCombinationToMixedComplexVector) cmp = [ -9 -2 ] [ 27 26 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Composition::create(this->product); diff --git a/reference/test/base/perturbation.cpp b/reference/test/base/perturbation.cpp index b6be9ab1563..7f8e7d3c7d1 100644 --- a/reference/test/base/perturbation.cpp +++ b/reference/test/base/perturbation.cpp @@ -101,7 +101,7 @@ TYPED_TEST(Perturbation, AppliesToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using Mtx = gko::matrix::Dense>; + using Mtx = gko::matrix::Dense>; using value_type = typename Mtx::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -143,7 +143,7 @@ TYPED_TEST(Perturbation, AppliesToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::to_complex>; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -185,7 +185,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto cmp = gko::Perturbation::create(this->scalar, this->basis, this->projector); @@ -232,7 +232,7 @@ TYPED_TEST(Perturbation, AppliesLinearCombinationToMixedComplexVector) cmp = I + 2 * [ 2 ] * [ 3 2 ] [ 1 ] */ - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using value_type = typename MixedDenseComplex::value_type; auto cmp = gko::Perturbation::create(this->scalar, this->basis, diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index 4ff7141cd8e..b434c7064c5 100644 
--- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -32,7 +32,7 @@ class Coo : public ::testing::Test { using Csr = gko::matrix::Csr; using Mtx = gko::matrix::Coo; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Coo() : exec(gko::ReferenceExecutor::create()), mtx(Mtx::create(exec)) { @@ -79,7 +79,7 @@ TYPED_TEST(Coo, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -102,7 +102,7 @@ TYPED_TEST(Coo, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto tmp = OtherCoo::create(this->exec); @@ -216,7 +216,7 @@ TYPED_TEST(Coo, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -233,7 +233,7 @@ TYPED_TEST(Coo, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Coo = typename TestFixture::Mtx; using OtherCoo = gko::matrix::Coo; auto empty = OtherCoo::create(this->exec); @@ -703,8 +703,7 @@ TYPED_TEST(Coo, AppliesToComplex) TYPED_TEST(Coo, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + 
using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -759,8 +758,7 @@ TYPED_TEST(Coo, AdvancedAppliesToComplex) TYPED_TEST(Coo, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; @@ -817,8 +815,7 @@ TYPED_TEST(Coo, ApplyAddsToComplex) TYPED_TEST(Coo, ApplyAddsToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedVec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -874,8 +871,7 @@ TYPED_TEST(Coo, ApplyAddsScaledToComplex) TYPED_TEST(Coo, ApplyAddsScaledToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/csr_kernels.cpp b/reference/test/matrix/csr_kernels.cpp index 07fb526e8cd..a1e36df92f0 100644 --- a/reference/test/matrix/csr_kernels.cpp +++ b/reference/test/matrix/csr_kernels.cpp @@ -46,7 +46,7 @@ class Csr : public ::testing::Test { using Ell = gko::matrix::Ell; using Hybrid = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; using Perm = gko::matrix::Permutation; using ScaledPerm = gko::matrix::ScaledPermutation; @@ -788,7 +788,7 @@ TYPED_TEST(Csr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = 
gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -815,7 +815,7 @@ TYPED_TEST(Csr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto tmp = OtherCsr::create(this->exec); @@ -994,7 +994,7 @@ TYPED_TEST(Csr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -1013,7 +1013,7 @@ TYPED_TEST(Csr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Csr = typename TestFixture::Mtx; using OtherCsr = gko::matrix::Csr; auto empty = OtherCsr::create(this->exec); @@ -2049,8 +2049,7 @@ TYPED_TEST(Csr, AppliesToComplex) TYPED_TEST(Csr, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -2105,8 +2104,7 @@ TYPED_TEST(Csr, AdvancedAppliesToComplex) TYPED_TEST(Csr, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/diagonal_kernels.cpp b/reference/test/matrix/diagonal_kernels.cpp index 437ba3a1746..a092b3076ea 100644 --- a/reference/test/matrix/diagonal_kernels.cpp +++ 
b/reference/test/matrix/diagonal_kernels.cpp @@ -30,7 +30,7 @@ class Diagonal : public ::testing::Test { using Csr = gko::matrix::Csr; using Diag = gko::matrix::Diagonal; using Dense = gko::matrix::Dense; - using MixedDense = gko::matrix::Dense>; + using MixedDense = gko::matrix::Dense>; Diagonal() : exec(gko::ReferenceExecutor::create()), @@ -85,7 +85,7 @@ TYPED_TEST_SUITE(Diagonal, gko::test::ValueTypes, TypenameNameGenerator); TYPED_TEST(Diagonal, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -107,7 +107,7 @@ TYPED_TEST(Diagonal, ConvertsToPrecision) TYPED_TEST(Diagonal, MovesToPrecision) { using ValueType = typename TestFixture::value_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Diagonal = typename TestFixture::Diag; using OtherDiagonal = gko::matrix::Diagonal; auto tmp = OtherDiagonal::create(this->exec); @@ -574,8 +574,7 @@ TYPED_TEST(Diagonal, AppliesToComplex) TYPED_TEST(Diagonal, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -634,8 +633,7 @@ TYPED_TEST(Diagonal, AppliesLinearCombinationToComplex) TYPED_TEST(Diagonal, AppliesLinearCombinationToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; using Scalar = gko::matrix::Dense; diff --git a/reference/test/matrix/ell_kernels.cpp b/reference/test/matrix/ell_kernels.cpp index 6d60961663a..27d8614925d 100644 --- a/reference/test/matrix/ell_kernels.cpp +++ 
b/reference/test/matrix/ell_kernels.cpp @@ -30,7 +30,7 @@ class Ell : public ::testing::Test { using Mtx = gko::matrix::Ell; using Csr = gko::matrix::Csr; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Ell() : exec(gko::ReferenceExecutor::create()), @@ -91,7 +91,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = typename gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -106,7 +106,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); @@ -122,9 +122,9 @@ TYPED_TEST(Ell, MixedAppliesToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; - using Vec2 = gko::matrix::Dense>; + using Vec2 = gko::matrix::Dense>; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec1::create(this->exec, gko::dim<2>{2, 1}); @@ -160,7 +160,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; // clang-format off auto x = gko::initialize( @@ -184,7 +184,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix2) { // Input vector has same value type as matrix using T = typename 
TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -209,7 +209,7 @@ TYPED_TEST(Ell, MixedAppliesToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; // clang-format off @@ -248,7 +248,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -265,7 +265,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -283,7 +283,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseVector3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -327,7 +327,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix1) { // Both vectors have the same value type which differs from the matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto 
beta = gko::initialize({2.0}, this->exec); @@ -355,7 +355,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix2) { // Input vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -384,7 +384,7 @@ TYPED_TEST(Ell, MixedAppliesLinearCombinationToDenseMatrix3) { // Output vector has same value type as matrix using T = typename TestFixture::value_type; - using next_T = gko::next_precision; + using next_T = next_precision; using Vec1 = typename TestFixture::Vec; using Vec2 = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); @@ -443,7 +443,7 @@ TYPED_TEST(Ell, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -466,7 +466,7 @@ TYPED_TEST(Ell, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto tmp = OtherEll::create(this->exec); @@ -736,7 +736,7 @@ TYPED_TEST(Ell, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -753,7 +753,7 @@ TYPED_TEST(Ell, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = 
typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Ell = typename TestFixture::Mtx; using OtherEll = gko::matrix::Ell; auto empty = Ell::create(this->exec); @@ -897,8 +897,7 @@ TYPED_TEST(Ell, AppliesToComplex) TYPED_TEST(Ell, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -954,8 +953,7 @@ TYPED_TEST(Ell, AdvancedAppliesToComplex) TYPED_TEST(Ell, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index dd220a40172..4415216fd01 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -271,7 +271,7 @@ TYPED_TEST(Fbcsr, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -294,7 +294,7 @@ TYPED_TEST(Fbcsr, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto tmp = OtherFbcsr::create(this->exec); @@ -393,7 +393,7 @@ TYPED_TEST(Fbcsr, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename 
TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); @@ -412,7 +412,7 @@ TYPED_TEST(Fbcsr, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Fbcsr = typename TestFixture::Mtx; using OtherFbcsr = gko::matrix::Fbcsr; auto empty = OtherFbcsr::create(this->exec); diff --git a/reference/test/matrix/hybrid_kernels.cpp b/reference/test/matrix/hybrid_kernels.cpp index 817d188147e..755a6675d0e 100644 --- a/reference/test/matrix/hybrid_kernels.cpp +++ b/reference/test/matrix/hybrid_kernels.cpp @@ -32,7 +32,7 @@ class Hybrid : public ::testing::Test { using Mtx = gko::matrix::Hybrid; using Vec = gko::matrix::Dense; using Csr = gko::matrix::Csr; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; Hybrid() : exec(gko::ReferenceExecutor::create()), @@ -233,7 +233,7 @@ TYPED_TEST(Hybrid, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -256,7 +256,7 @@ TYPED_TEST(Hybrid, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto tmp = OtherHybrid::create(this->exec); @@ -368,7 +368,7 @@ TYPED_TEST(Hybrid, ConvertsEmptyToPrecision) { using ValueType = typename 
TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -385,7 +385,7 @@ TYPED_TEST(Hybrid, MovesEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Hybrid = typename TestFixture::Mtx; using OtherHybrid = gko::matrix::Hybrid; auto other = Hybrid::create(this->exec); @@ -699,8 +699,7 @@ TYPED_TEST(Hybrid, AppliesToComplex) TYPED_TEST(Hybrid, AppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using Vec = gko::matrix::Dense; auto exec = gko::ReferenceExecutor::create(); @@ -756,8 +755,7 @@ TYPED_TEST(Hybrid, AdvancedAppliesToComplex) TYPED_TEST(Hybrid, AdvancedAppliesToMixedComplex) { - using mixed_value_type = - gko::next_precision; + using mixed_value_type = next_precision; using mixed_complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::matrix::Dense; diff --git a/reference/test/matrix/identity.cpp b/reference/test/matrix/identity.cpp index 11953de338a..ebf55b7ee03 100644 --- a/reference/test/matrix/identity.cpp +++ b/reference/test/matrix/identity.cpp @@ -19,7 +19,7 @@ class Identity : public ::testing::Test { using value_type = T; using Id = gko::matrix::Identity; using Vec = gko::matrix::Dense; - using MixedVec = gko::matrix::Dense>; + using MixedVec = gko::matrix::Dense>; using ComplexVec = gko::to_complex; using MixedComplexVec = gko::to_complex; diff --git a/reference/test/matrix/sellp_kernels.cpp b/reference/test/matrix/sellp_kernels.cpp index b5e6a9ce69f..ba0f21d68f3 100644 --- 
a/reference/test/matrix/sellp_kernels.cpp +++ b/reference/test/matrix/sellp_kernels.cpp @@ -67,7 +67,7 @@ TYPED_TEST(Sellp, AppliesToDenseVector) TYPED_TEST(Sellp, AppliesToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -116,7 +116,7 @@ TYPED_TEST(Sellp, AppliesLinearCombinationToDenseVector) TYPED_TEST(Sellp, AppliesLinearCombinationToMixedDenseVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -189,7 +189,7 @@ TYPED_TEST(Sellp, ConvertsToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -212,7 +212,7 @@ TYPED_TEST(Sellp, MovesToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto tmp = OtherSellp::create(this->exec); @@ -310,7 +310,7 @@ TYPED_TEST(Sellp, ConvertsEmptyToPrecision) { using ValueType = typename TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); @@ -329,7 +329,7 @@ TYPED_TEST(Sellp, MovesEmptyToPrecision) { using ValueType = typename 
TestFixture::value_type; using IndexType = typename TestFixture::index_type; - using OtherType = typename gko::next_precision; + using OtherType = next_precision; using Sellp = typename TestFixture::Mtx; using OtherSellp = gko::matrix::Sellp; auto empty = OtherSellp::create(this->exec); diff --git a/reference/test/matrix/sparsity_csr_kernels.cpp b/reference/test/matrix/sparsity_csr_kernels.cpp index f08d6c352ca..5503119f5ca 100644 --- a/reference/test/matrix/sparsity_csr_kernels.cpp +++ b/reference/test/matrix/sparsity_csr_kernels.cpp @@ -145,7 +145,7 @@ TYPED_TEST(SparsityCsr, AppliesToDenseVector) TYPED_TEST(SparsityCsr, AppliesToMixedDenseVector) { - using T = gko::next_precision; + using T = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({2.0, 1.0, 4.0}, this->exec); auto y = Vec::create(this->exec, gko::dim<2>{2, 1}); @@ -192,7 +192,7 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToDenseVector) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedDenseVector) { - using T = gko::next_precision; + using T = next_precision; using Vec = gko::matrix::Dense; auto alpha = gko::initialize({-1.0}, this->exec); auto beta = gko::initialize({2.0}, this->exec); @@ -243,8 +243,7 @@ TYPED_TEST(SparsityCsr, AppliesToComplex) TYPED_TEST(SparsityCsr, AppliesToMixedComplex) { - using T = - gko::next_precision>; + using T = next_precision>; using Vec = gko::matrix::Dense; auto x = gko::initialize({T{2.0, 4.0}, T{1.0, 2.0}, T{4.0, 8.0}}, this->exec); @@ -278,8 +277,8 @@ TYPED_TEST(SparsityCsr, AppliesLinearCombinationToComplex) TYPED_TEST(SparsityCsr, AppliesLinearCombinationToMixedComplex) { - using Vec = gko::matrix::Dense< - gko::next_precision>; + using Vec = + gko::matrix::Dense>; using ComplexVec = gko::to_complex; using T = typename ComplexVec::value_type; auto alpha = gko::initialize({-1.0}, this->exec); diff --git a/reference/test/preconditioner/ic.cpp b/reference/test/preconditioner/ic.cpp index 16ffc8d7b3c..ef681129fe3 100644 --- 
a/reference/test/preconditioner/ic.cpp +++ b/reference/test/preconditioner/ic.cpp @@ -245,7 +245,7 @@ TYPED_TEST(Ic, SolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Vec::create(this->exec, gko::dim<2>{3, 1}); auto preconditioner = @@ -279,7 +279,7 @@ TYPED_TEST(Ic, SolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using Vec = gko::matrix::Dense< - gko::next_precision>>; + next_precision>>; using T = typename Vec::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -315,7 +315,7 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); @@ -354,8 +354,8 @@ TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplex) TYPED_TEST(Ic, AdvancedSolvesSingleRhsComplexMixed) { using ic_prec_type = typename TestFixture::ic_prec_type; - using MixedDense = gko::matrix::Dense< - gko::next_precision>; + using MixedDense = + gko::matrix::Dense>; using MixedDenseComplex = gko::to_complex; using T = typename MixedDenseComplex::value_type; const auto b = gko::initialize( diff --git a/reference/test/preconditioner/ilu.cpp b/reference/test/preconditioner/ilu.cpp index 180b92be9ec..c6dacac9f50 100644 --- a/reference/test/preconditioner/ilu.cpp +++ b/reference/test/preconditioner/ilu.cpp @@ -315,8 +315,8 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedMtx) { - using Mtx = gko::matrix::Dense< - gko::next_precision>; + using Mtx = + 
gko::matrix::Dense>; const auto b = gko::initialize({1.0, 3.0, 6.0}, this->exec); auto x = Mtx::create(this->exec, gko::dim<2>{3, 1}); x->copy_from(b); @@ -350,7 +350,7 @@ TYPED_TEST(Ilu, SolvesSingleRhsWithComplexMtx) TYPED_TEST(Ilu, SolvesSingleRhsWithMixedComplexMtx) { using Mtx = gko::matrix::Dense< - gko::to_complex>>; + gko::to_complex>>; using T = typename Mtx::value_type; const auto b = gko::initialize( {T{1.0, 2.0}, T{3.0, 6.0}, T{6.0, 12.0}}, this->exec); @@ -403,7 +403,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhs) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; const value_type alpha{2.0}; const auto alpha_linop = gko::initialize({alpha}, this->exec); @@ -453,7 +453,7 @@ TYPED_TEST(Ilu, SolvesAdvancedSingleRhsComplex) TYPED_TEST(Ilu, SolvesAdvancedSingleRhsMixedComplex) { - using value_type = gko::next_precision; + using value_type = next_precision; using complex_type = gko::to_complex; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; diff --git a/reference/test/preconditioner/jacobi.cpp b/reference/test/preconditioner/jacobi.cpp index 79c276579ad..ae20e24d84c 100644 --- a/reference/test/preconditioner/jacobi.cpp +++ b/reference/test/preconditioner/jacobi.cpp @@ -477,7 +477,7 @@ TYPED_TEST(Jacobi, ScalarJacobiGeneratesOnDifferentPrecision) { using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; - using next_type = gko::next_precision; + using next_type = next_precision; using Bj = typename TestFixture::Bj; auto csr = gko::share(gko::matrix::Csr::create(this->exec)); diff --git a/reference/test/preconditioner/jacobi_kernels.cpp b/reference/test/preconditioner/jacobi_kernels.cpp index 97d9951be7a..c038dc475b4 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -642,7 +642,7 @@ TYPED_TEST(Jacobi, 
ScalarJacobiAppliesToVector) TYPED_TEST(Jacobi, AppliesToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -683,7 +683,7 @@ TYPED_TEST(Jacobi, AppliesToComplexVector) TYPED_TEST(Jacobi, AppliesToMixedComplexVector) { using value_type = - gko::to_complex>; + gko::to_complex>; using Vec = gko::matrix::Dense; auto x = gko::initialize( {value_type{1.0, 2.0}, value_type{-1.0, -2.0}, value_type{2.0, 4.0}, @@ -888,7 +888,7 @@ TYPED_TEST(Jacobi, ScalarJacobiAppliesLinearCombinationToVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto x = gko::initialize({1.0, -1.0, 2.0, -2.0, 3.0}, this->exec); auto b = gko::initialize({4.0, -1.0, -2.0, 4.0, -1.0}, this->exec); @@ -931,7 +931,7 @@ TYPED_TEST(Jacobi, AppliesLinearCombinationToComplexVector) TYPED_TEST(Jacobi, AppliesLinearCombinationToMixedComplexVector) { - using value_type = gko::next_precision; + using value_type = next_precision; using MixedDense = gko::matrix::Dense; using MixedDenseComplex = gko::to_complex; using T = gko::to_complex; diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index 75ab3728a30..aa6e963e6b1 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -445,7 +445,7 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) { using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; - using Vec = gko::matrix::Dense>; + using Vec = gko::matrix::Dense>; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -489,7 +489,7 @@ TYPED_TEST(ScaledReordered, { using SR = 
typename TestFixture::SR; using T = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Vec = gko::matrix::Dense; auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index 837920ec520..69bea370b90 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -268,7 +268,7 @@ TYPED_TEST(Bicg, SolvesStencilSystem) TYPED_TEST(Bicg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -305,7 +305,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemComplex) TYPED_TEST(Bicg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto b = gko::initialize( @@ -360,7 +360,7 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -401,8 +401,8 @@ TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicg_factory->generate(this->mtx); diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index f09e78137b3..e2974d664c4 100644 --- 
a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -383,7 +383,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystem) TYPED_TEST(Bicgstab, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -420,7 +420,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto b = gko::initialize( @@ -495,7 +495,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->bicgstab_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -536,8 +536,8 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->bicgstab_factory->generate(this->mtx); diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index a027c02705b..26c19bb8787 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -159,7 +159,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystem) TYPED_TEST(CbGmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b 
= gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -198,7 +198,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemComplex) TYPED_TEST(CbGmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto b = @@ -279,7 +279,7 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cb_gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -322,8 +322,8 @@ TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(CbGmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cb_gmres_factory->generate(this->mtx); diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 7cbc629717c..1419170cb61 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -228,7 +228,7 @@ TYPED_TEST(Cg, SolvesStencilSystem) TYPED_TEST(Cg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -265,7 +265,7 @@ TYPED_TEST(Cg, SolvesStencilSystemComplex) TYPED_TEST(Cg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto b = gko::initialize( @@ -320,7 +320,7 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Cg, 
SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -361,8 +361,8 @@ TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Cg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cg_factory->generate(this->mtx); diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 9024623ade8..7df07a770bd 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -293,7 +293,7 @@ TYPED_TEST(Cgs, SolvesDenseSystem) TYPED_TEST(Cgs, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -330,7 +330,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemComplex) TYPED_TEST(Cgs, SolvesDenseSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto b = gko::initialize( @@ -386,7 +386,7 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -427,8 +427,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - 
gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->cgs_factory->generate(this->mtx); diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index 2b7b97ffc3b..e8ac009b9dd 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -242,7 +242,7 @@ TYPED_TEST(Fcg, SolvesStencilSystem) TYPED_TEST(Fcg, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -279,7 +279,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemComplex) TYPED_TEST(Fcg, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto b = gko::initialize( @@ -334,7 +334,7 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->fcg_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -375,8 +375,8 @@ TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Fcg, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->fcg_factory->generate(this->mtx); diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 3f11b087bb7..588b225c658 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -434,7 +434,7 @@ 
TYPED_TEST(Gmres, SolvesStencilSystem) TYPED_TEST(Gmres, SolvesStencilSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = gko::initialize({13.0, 7.0, 1.0}, this->exec); @@ -472,7 +472,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemComplex) TYPED_TEST(Gmres, SolvesStencilSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto b = @@ -528,7 +528,7 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApply) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->gmres_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -570,8 +570,8 @@ TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyComplex) TYPED_TEST(Gmres, SolvesStencilSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->gmres_factory->generate(this->mtx); diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index c3ca4fc1bd9..c8a2e45d14e 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -76,7 +76,7 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -113,7 +113,7 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { using value_type = - 
gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -231,7 +231,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApply) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); @@ -272,8 +272,8 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyComplex) TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); diff --git a/reference/test/solver/ir_kernels.cpp b/reference/test/solver/ir_kernels.cpp index b0c1029f693..08a62e66de2 100644 --- a/reference/test/solver/ir_kernels.cpp +++ b/reference/test/solver/ir_kernels.cpp @@ -82,7 +82,7 @@ TYPED_TEST(Ir, SolvesTriangularSystem) TYPED_TEST(Ir, SolvesTriangularSystemMixed) { - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize({3.9, 9.0, 2.2}, this->exec); @@ -119,7 +119,7 @@ TYPED_TEST(Ir, SolvesTriangularSystemComplex) TYPED_TEST(Ir, SolvesTriangularSystemMixedComplex) { using value_type = - gko::to_complex>; + gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = this->ir_factory->generate(this->mtx); auto b = gko::initialize( @@ -243,8 +243,8 @@ TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(Ir, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { - using Scalar = gko::matrix::Dense< - gko::next_precision>; + using Scalar = + gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type 
= typename Mtx::value_type; auto solver = this->ir_factory->generate(this->mtx); diff --git a/reference/test/solver/lower_trs_kernels.cpp b/reference/test/solver/lower_trs_kernels.cpp index 3680f19681f..ed06a0126be 100644 --- a/reference/test/solver/lower_trs_kernels.cpp +++ b/reference/test/solver/lower_trs_kernels.cpp @@ -108,7 +108,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystem) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({1.0, 2.0, 1.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -146,7 +146,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -217,7 +217,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -259,7 +259,7 @@ TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(LowerTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git 
a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index 57ba8fba84d..d3083d5819f 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -233,7 +233,7 @@ class Multigrid : public ::testing::Test { using Smoother = gko::solver::Ir; using InnerSolver = gko::preconditioner::Jacobi; using CoarsestSolver = gko::solver::Cg; - using CoarsestNextSolver = gko::solver::Cg>; + using CoarsestNextSolver = gko::solver::Cg>; using DummyRPFactory = DummyMultigridLevelWithFactory; using DummyFactory = DummyLinOpWithFactory; Multigrid() diff --git a/reference/test/solver/upper_trs_kernels.cpp b/reference/test/solver/upper_trs_kernels.cpp index a60f3b46079..5f0a55c378f 100644 --- a/reference/test/solver/upper_trs_kernels.cpp +++ b/reference/test/solver/upper_trs_kernels.cpp @@ -108,7 +108,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystem) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; std::shared_ptr b = gko::initialize({4.0, 2.0, 3.0}, this->exec); auto x = gko::initialize({0.0, 0.0, 0.0}, this->exec); @@ -146,7 +146,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; std::shared_ptr b = gko::initialize( @@ -218,7 +218,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApply) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixed) { using other_value_type = typename TestFixture::value_type; - using value_type = gko::next_precision; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto alpha = 
gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -260,7 +260,7 @@ TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyComplex) TYPED_TEST(UpperTrs, SolvesTriangularSystemUsingAdvancedApplyMixedComplex) { using other_value_type = typename TestFixture::value_type; - using Scalar = gko::matrix::Dense>; + using Scalar = gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/matrix/matrix.cpp b/test/matrix/matrix.cpp index 0b06f76df85..2eb45d0df22 100644 --- a/test/matrix/matrix.cpp +++ b/test/matrix/matrix.cpp @@ -557,7 +557,7 @@ class Matrix : public CommonTestFixture { using Mtx = typename T::matrix_type; using index_type = typename Mtx::index_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 6d6812dea12..c22d83d8014 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -682,7 +682,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -708,7 +708,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using 
OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/solver/solver.cpp b/test/mpi/solver/solver.cpp index f3d36f0dcfe..4de8d6450b9 100644 --- a/test/mpi/solver/solver.cpp +++ b/test/mpi/solver/solver.cpp @@ -45,7 +45,7 @@ template struct SimpleSolverTest { using solver_type = SolverType; using value_type = typename solver_type::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using local_index_type = gko::int32; using global_index_type = gko::int64; using dist_matrix_type = @@ -229,7 +229,7 @@ class Solver : public CommonMpiTestFixture { using local_index_type = typename T::local_index_type; using global_index_type = typename T::global_index_type; using value_type = typename T::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = typename T::dist_vector_type; using LocalVec = typename T::non_dist_vector_type; using MixedVec = typename T::mixed_dist_vector_type; diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index cedd483b0a2..1e3cb1b5fce 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -839,7 +839,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -855,7 +855,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = typename gko::next_precision; + using OtherT = next_precision; using OtherVector = typename 
gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); diff --git a/test/solver/solver.cpp b/test/solver/solver.cpp index 2ef82997c9e..9c6f143708c 100644 --- a/test/solver/solver.cpp +++ b/test/solver/solver.cpp @@ -520,7 +520,7 @@ class Solver : public CommonTestFixture { using Precond = typename T::precond_type; using Mtx = typename T::matrix_type; using value_type = typename Mtx::value_type; - using mixed_value_type = gko::next_precision; + using mixed_value_type = next_precision; using Vec = gko::matrix::Dense; using MixedVec = gko::matrix::Dense; From fdcc0666a64bd81f392a7e0d1b1886618adb03b4 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 7 Feb 2023 17:03:33 +0100 Subject: [PATCH 16/62] allow disable half operation --- CMakeLists.txt | 1 + cmake/get_info.cmake | 3 + core/base/mixed_precision_types.hpp | 178 ++++++++++----------- core/distributed/matrix.cpp | 3 +- core/distributed/vector.cpp | 7 +- core/matrix/coo.cpp | 2 + core/matrix/csr.cpp | 3 +- core/matrix/dense.cpp | 2 + core/matrix/diagonal.cpp | 3 + core/matrix/ell.cpp | 2 + core/matrix/fbcsr.cpp | 5 +- core/matrix/hybrid.cpp | 2 + core/matrix/row_gatherer.cpp | 18 ++- core/matrix/sellp.cpp | 3 + core/multigrid/pgm.cpp | 1 - core/solver/multigrid.cpp | 63 +++++++- include/ginkgo/config.hpp.in | 5 + include/ginkgo/core/base/math.hpp | 11 ++ include/ginkgo/core/base/mpi.hpp | 7 +- include/ginkgo/core/base/types.hpp | 129 +++++++-------- include/ginkgo/core/distributed/matrix.hpp | 8 +- include/ginkgo/core/distributed/vector.hpp | 7 +- include/ginkgo/core/matrix/coo.hpp | 19 ++- include/ginkgo/core/matrix/csr.hpp | 10 +- include/ginkgo/core/matrix/dense.hpp | 27 ++-- include/ginkgo/core/matrix/diagonal.hpp | 14 +- include/ginkgo/core/matrix/ell.hpp | 18 ++- include/ginkgo/core/matrix/fbcsr.hpp | 18 ++- include/ginkgo/core/matrix/hybrid.hpp | 19 ++- include/ginkgo/core/matrix/sellp.hpp 
| 18 ++- 30 files changed, 376 insertions(+), 230 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 59131fac4f8..4c2284bcf3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) +option(GINKGO_ENABLE_HALF "Enable the half operation" OFF) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 63f43c645f0..3f0d7243121 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -204,11 +204,14 @@ if(TARGET hwloc) ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") endif() +ginkgo_print_variable(${minimal_log} "GINKGO_ENABLE_HALF") +ginkgo_print_variable(${detailed_log} "GINKGO_ENABLE_HALF") ginkgo_print_module_footer(${detailed_log} "") ginkgo_print_generic_header(${detailed_log} " Extensions:") ginkgo_print_variable(${detailed_log} "GINKGO_EXTENSION_KOKKOS_CHECK_TYPE_ALIGNMENT") + _minimal( " --\n-- Detailed information (More compiler flags, module configuration) can be found in detailed.log diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 7a2d2463672..27794c2c9bf 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -14,97 +14,97 @@ #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) 
\ - template _macro(float, half, half, __VA_ARGS__); \ - template _macro(float, half, float, __VA_ARGS__); \ - template _macro(float, half, double, __VA_ARGS__); \ - template _macro(float, float, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, float, half, __VA_ARGS__)); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(float, double, half, __VA_ARGS__)); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) \ - template _macro(double, half, half, __VA_ARGS__); \ - template _macro(double, half, float, __VA_ARGS__); \ - template _macro(double, half, double, __VA_ARGS__); \ - template _macro(double, float, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(double, float, half, __VA_ARGS__)); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, half, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, double, half, __VA_ARGS__)); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) 
\ - template _macro(half, half, half, __VA_ARGS__); \ - template _macro(half, half, float, __VA_ARGS__); \ - template _macro(half, half, double, __VA_ARGS__); \ - template _macro(half, float, half, __VA_ARGS__); \ - template _macro(half, float, float, __VA_ARGS__); \ - template _macro(half, float, double, __VA_ARGS__); \ - template _macro(half, double, half, __VA_ARGS__); \ - template _macro(half, double, float, __VA_ARGS__); \ - template _macro(half, double, double, __VA_ARGS__) + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, double, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else @@ -124,11 +124,11 @@ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ - template _macro(half, half, half, __VA_ARGS__) + GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) + GKO_ADAPT_HF(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -149,38 +149,36 @@ #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(half, half, __VA_ARGS__); \ - template _macro(half, float, __VA_ARGS__); \ - template _macro(half, double, __VA_ARGS__); \ - template _macro(float, half, __VA_ARGS__); \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, half, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(_macro(float, half, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(float, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(double, half, __VA_ARGS__)); \ + template _macro(double, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(half, half, __VA_ARGS__); \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_CPHF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ + GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + template _macro(float, float, __VA_ARGS__); \ + template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index c9337e6d80b..36b41c02b0b 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -237,6 +237,7 @@ void Matrix::move_to( } +#if GKO_ENABLE_HALF template void Matrix::convert_to( Matrix>, local_index_type, @@ -274,7 +275,7 @@ void Matrix::move_to( result->set_size(this->get_size()); this->set_size({}); } - +#endif template void Matrix::read_distributed( diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 21b7c334af7..db97f66831e 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -299,6 +299,7 @@ void Vector::move_to(Vector>* result) } +#if GKO_ENABLE_HALF template void Vector::convert_to( Vector>>* result) const @@ -316,7 +317,7 @@ void Vector::move_to( { this->convert_to(result); } - +#endif template std::unique_ptr::absolute_type> @@ -650,8 +651,8 @@ ValueType& Vector::at_local(size_type row, size_type col) noexcept template -ValueType Vector::at_local(size_type row, - size_type col) const noexcept +ValueType Vector::at_local(size_type row, size_type col) const + noexcept { return local_.at(row, col); } diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 8834ca60b12..1d1b1441e6b 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -231,6 +231,7 @@ void Coo::move_to( } +#if GKO_ENABLE_HALF template void Coo::convert_to( Coo>, IndexType>* result) const @@ -248,6 +249,7 @@ void Coo::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 0c89394021c..2a544315eb1 100644 --- a/core/matrix/csr.cpp +++ 
b/core/matrix/csr.cpp @@ -321,7 +321,7 @@ void Csr::move_to( this->convert_to(result); } - +#if GKO_ENABLE_HALF template void Csr::convert_to( Csr>, IndexType>* result) const @@ -340,6 +340,7 @@ void Csr::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index f94547a687a..d0670b2db32 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -603,6 +603,7 @@ void Dense::move_to(Dense>* result) } +#if GKO_ENABLE_HALF template void Dense::convert_to( Dense>>* result) const @@ -625,6 +626,7 @@ void Dense::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 921087c1a96..6a870ab6fde 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -162,6 +162,8 @@ void Diagonal::move_to(Diagonal>* result) this->convert_to(result); } + +#if GKO_ENABLE_HALF template void Diagonal::convert_to( Diagonal>>* result) const @@ -177,6 +179,7 @@ void Diagonal::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index b8bdf4b8e4a..128725f66cf 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -172,6 +172,7 @@ void Ell::move_to( } +#if GKO_ENABLE_HALF template void Ell::convert_to( Ell>, IndexType>* result) const @@ -190,6 +191,7 @@ void Ell::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index e6c00a93180..d71701a0f4a 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -167,9 +167,11 @@ void Fbcsr::move_to( } +#if GKO_ENABLE_HALF template void Fbcsr::convert_to( - Fbcsr>, IndexType>* const result) const + Fbcsr>, IndexType>* const result) + const { result->values_ = this->values_; result->col_idxs_ = this->col_idxs_; @@ -186,6 +188,7 @@ void Fbcsr::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 
56dc7dd290b..9f89cdce3db 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -222,6 +222,7 @@ void Hybrid::move_to( } +#if GKO_ENABLE_HALF template void Hybrid::convert_to( Hybrid>, IndexType>* result) const @@ -241,6 +242,7 @@ void Hybrid::move_to( { this->convert_to(result); } +#endif template diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 836855b89a9..62982c612fe 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -64,8 +64,11 @@ RowGatherer::create_const( template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { - run, - std::complex, std::complex>( + run, +#endif + float, double, std::complex, std::complex>( in, [&](auto gather) { gather->row_gather(&row_idxs_, out); }); } @@ -73,10 +76,13 @@ template void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { - run, - std::complex, std::complex>(in, [&](auto gather) { - gather->row_gather(alpha, &row_idxs_, beta, out); - }); + run, +#endif + float, double, std::complex, std::complex>( + in, + [&](auto gather) { gather->row_gather(alpha, &row_idxs_, beta, out); }); } diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index bbbabe6c36b..fccd035a23e 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -195,6 +195,8 @@ void Sellp::move_to( this->convert_to(result); } + +#if GKO_ENABLE_HALF template void Sellp::convert_to( Sellp>, IndexType>* result) const @@ -215,6 +217,7 @@ void Sellp::move_to( { this->convert_to(result); } +#endif template diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 16cacb1fa09..1cef0f0d77c 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -543,6 +543,5 @@ void Pgm::generate() #define GKO_DECLARE_PGM(_vtype, _itype) class Pgm<_vtype, _itype> GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_PGM); - } // namespace multigrid } // namespace gko diff --git a/core/solver/multigrid.cpp 
b/core/solver/multigrid.cpp index 967a861e339..b0b4fcab5ad 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -316,7 +316,14 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level, auto i, auto cycle, auto current_nrows, @@ -454,7 +461,14 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, +#endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { #if GINKGO_BUILD_MPI @@ -703,7 +717,14 @@ void Multigrid::generate() break; } - run, +#endif std::complex, std::complex>( mg_level, [this](auto mg_level, auto index, auto matrix) { @@ -741,7 +762,14 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, +#endif std::complex, std::complex>( last_mg_level, [this](auto mg_level, auto level, auto matrix) { @@ -858,7 +886,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } @@ -897,7 +932,14 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); } @@ -962,7 +1004,14 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, +#endif std::complex, std::complex>(first_mg_level, lambda, b, x); } diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 1dfa6bc61bc..9adea69b857 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -105,6 
+105,11 @@ #define GKO_HAVE_HWLOC @GINKGO_HAVE_HWLOC@ // clang-format on +/* Is half operation available ? */ +// clang-format off +#define GKO_ENABLE_HALF @GINKGO_ENABLE_HALF@ +// clang-format on + /* Do we need to use blocking communication in our SpMV? */ // clang-format off diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 963fc4d0da0..ec372452e08 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -410,10 +410,12 @@ namespace detail { template struct next_precision_impl {}; +#if GKO_ENABLE_HALF template <> struct next_precision_impl { using type = float; }; +#endif template <> struct next_precision_impl { @@ -422,7 +424,11 @@ struct next_precision_impl { template <> struct next_precision_impl { +#if GKO_ENABLE_HALF using type = half; +#else + using type = float; +#endif }; @@ -534,8 +540,13 @@ using next_precision = typename detail::next_precision_impl::type; * @note Currently our lists contains only two elements, so this is the same as * next_precision. 
*/ +#if GKO_ENABLE_HALF template using previous_precision = next_precision>; +#else +template +using previous_precision = next_precision; +#endif /** diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 32d2e5d899a..57bb433b38a 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -79,9 +79,6 @@ struct type_impl {}; GKO_REGISTER_MPI_TYPE(char, MPI_CHAR); GKO_REGISTER_MPI_TYPE(unsigned char, MPI_UNSIGNED_CHAR); GKO_REGISTER_MPI_TYPE(unsigned, MPI_UNSIGNED); -// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 -// TODO: it only works on the transferring -GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); @@ -91,8 +88,12 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); +#if GKO_ENABLE_HALF +// OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 // TODO: it only works on the transferring +GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(std::complex, MPI_FLOAT); +#endif // GKO_ENABLE_HALF GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_FLOAT_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 272c71d3638..ceb82b96747 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -406,10 +406,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GINKGO_COMPILE_KERNEL -#define GKO_ADAPT_CPHF(_macro) template _macro +#if GKO_ENABLE_HALF +#define GKO_ADAPT_HF(_macro) template _macro #else -#define GKO_ADAPT_CPHF(_macro) template _macro +#define 
GKO_ADAPT_HF(_macro) \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") #endif @@ -423,13 +426,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template _macro(double) #endif @@ -456,7 +459,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - GKO_ADAPT_CPHF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -479,7 +482,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ + GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ @@ -490,15 +493,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - GKO_ADAPT_CPHF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + 
GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -527,20 +530,20 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(half, int32); \ + GKO_ADAPT_HF(_macro(half, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(half, int64); \ + GKO_ADAPT_HF(_macro(half, int64)); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(half, int32); \ + GKO_ADAPT_HF(_macro(half, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ - template _macro(half, int64); \ + GKO_ADAPT_HF(_macro(half, int64)); \ template _macro(float, int64); \ template _macro(double, int64) #endif @@ -582,10 +585,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ - GKO_ADAPT_CPHF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -603,9 +606,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define 
GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - template _macro(half, int32, int32); \ - template _macro(half, int32, int64); \ - template _macro(half, int64, int64); \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -618,9 +621,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - template _macro(half, int32, int32); \ - template _macro(half, int32, int64); \ - template _macro(half, int64, int64); \ + GKO_ADAPT_HF(_macro(half, int32, int32)); \ + GKO_ADAPT_HF(_macro(half, int32, int64)); \ + GKO_ADAPT_HF(_macro(half, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -655,9 +658,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32, int32)); \ - GKO_ADAPT_CPHF(_macro(std::complex, int32, int64)); \ - GKO_ADAPT_CPHF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -697,18 +700,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(half, double); \ - template _macro(double, half); \ - template _macro(float, half); \ - template _macro(half, float); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + GKO_ADAPT_HF(_macro(half, double)); \ + GKO_ADAPT_HF(_macro(double, half)); \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -721,13 +724,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -740,15 +743,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - template _macro(half, half); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + GKO_ADAPT_HF(_macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -767,11 +770,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(int64, int64); \ template _macro(unsigned int, unsigned int); \ template _macro(unsigned long, unsigned long); \ - template _macro(half, half); \ + 
GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template _macro(double, double); \ template _macro(long double, long double); \ - GKO_ADAPT_CPHF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -784,10 +787,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * value and index types. */ #define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ - template _macro(half); \ + GKO_ADAPT_HF(_macro(half)); \ template _macro(float); \ template _macro(double); \ - GKO_ADAPT_CPHF(_macro(std::complex)); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex); \ template _macro(size_type); \ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index a64f8395297..21faa366f0f 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -250,14 +250,14 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, +#if GKO_ENABLE_HALF public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, +#endif public DistributedBase { friend class EnableDistributedPolymorphicObject; friend class Matrix, LocalIndexType, GlobalIndexType>; - friend class Matrix>, - LocalIndexType, GlobalIndexType>; friend class multigrid::Pgm; @@ -282,6 +282,9 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; +#if GKO_ENABLE_HALF + friend class Matrix>, + LocalIndexType, GlobalIndexType>; void convert_to( Matrix>, local_index_type, @@ -290,6 +293,7 @@ class Matrix void move_to(Matrix>, local_index_type, global_index_type>* result) override; +#endif /** * Reads a square matrix from the device_matrix_data structure and a global * partition. 
diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index a476f2f2661..ac0262ccdb3 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -59,14 +59,15 @@ template class Vector : public EnableDistributedLinOp>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public EnableAbsoluteComputation>>, public DistributedBase { friend class EnableDistributedPolymorphicObject; friend class Vector>; friend class Vector>; friend class Vector>; - friend class Vector>>; public: using EnableDistributedLinOp::convert_to; @@ -165,11 +166,15 @@ class Vector void move_to(Vector>* result) override; +#if GKO_ENABLE_HALF + friend class Vector>>; + void convert_to(Vector>>* result) const override; void move_to( Vector>>* result) override; +#endif std::unique_ptr compute_absolute() const override; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 8d8797ef9ed..438e3fd1a7b 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -49,7 +49,10 @@ class Hybrid; template class Coo : public EnableLinOp>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Coo>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -83,17 +86,21 @@ class Coo : public EnableLinOp>, friend class Coo, IndexType>; - friend class Coo>, IndexType>; - void convert_to( Coo, IndexType>* result) const override; void move_to(Coo, IndexType>* result) override; - void convert_to( - Coo>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Coo>, + IndexType>; + + void convert_to(Coo>, IndexType>* + result) const override; - void move_to(Coo>, IndexType>* result) override; + void move_to(Coo>, IndexType>* + result) override; +#endif void convert_to(Csr* other) const override; diff --git 
a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index bd8c8c0f21b..319f2259ad3 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -100,8 +100,10 @@ void strategy_rebuild_helper(Csr* result); template class Csr : public EnableLinOp>, public ConvertibleTo, IndexType>>, +#if GKO_ENABLE_HALF public ConvertibleTo< Csr>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -692,19 +694,21 @@ class Csr : public EnableLinOp>, friend class Csr, IndexType>; - friend class Csr>, - IndexType>; - void convert_to( Csr, IndexType>* result) const override; void move_to(Csr, IndexType>* result) override; +#if GKO_ENABLE_HALF + friend class Csr>, + IndexType>; + void convert_to(Csr>, IndexType>* result) const override; void move_to(Csr>, IndexType>* result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index efe69328f08..9b2f2dab2ef 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -61,17 +61,6 @@ template class SparsityCsr; -class Empty {}; - -template -using next2_type = next_precision>; - - -// template -// using conditional_type = typename std::conditional< -// std::is_same>::value, Empty, -// Dense>>::type; - /** * Dense is a matrix format which explicitly stores all values of the matrix. 
* @@ -91,7 +80,9 @@ template class Dense : public EnableLinOp>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -279,19 +270,21 @@ class Dense return other->create_const_view_of_impl(); } - friend class Dense>; + friend class Dense>; - friend class Dense>>; + void convert_to(Dense>* result) const override; + + void move_to(Dense>* result) override; + +#if GKO_ENABLE_HALF + friend class Dense>>; void convert_to(Dense>>* result) const override; void move_to( Dense>>* result) override; - - void convert_to(Dense>* result) const override; - - void move_to(Dense>* result) override; +#endif void convert_to(Coo* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index a3c92aca6b0..84168d69368 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -42,7 +42,9 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, +#if GKO_ENABLE_HALF public ConvertibleTo>>>, +#endif public Transposable, public WritableToMatrixData, public WritableToMatrixData, @@ -74,8 +76,6 @@ class Diagonal friend class Diagonal>; - friend class Diagonal>>; - std::unique_ptr transpose() const override; std::unique_ptr conj_transpose() const override; @@ -84,9 +84,15 @@ class Diagonal void move_to(Diagonal>* result) override; - void convert_to(Diagonal>>* result) const override; +#if GKO_ENABLE_HALF + friend class Diagonal>>; + + void convert_to(Diagonal>>* result) + const override; - void move_to(Diagonal>>* result) override; + void move_to( + Diagonal>>* result) override; +#endif void convert_to(Csr* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index a2b13f0b8e3..451f9311692 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -51,7 +51,10 @@ class Hybrid; 
template class Ell : public EnableLinOp>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Ell>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -65,7 +68,6 @@ class Ell : public EnableLinOp>, friend class Csr; friend class Ell, IndexType>; friend class Ell, IndexType>; - friend class Ell>, IndexType>; friend class Hybrid; public: @@ -90,10 +92,16 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; - void convert_to( - Ell>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Ell>, + IndexType>; + + void convert_to(Ell>, IndexType>* + result) const override; - void move_to(Ell>, IndexType>* result) override; + void move_to(Ell>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 58ac3afe307..056c55e2224 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -98,7 +98,10 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) template class Fbcsr : public EnableLinOp>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Fbcsr>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>, @@ -147,17 +150,22 @@ class Fbcsr : public EnableLinOp>, using ConvertibleTo>::move_to; friend class Fbcsr, IndexType>; - friend class Fbcsr>, IndexType>; void convert_to( Fbcsr, IndexType>* result) const override; void move_to(Fbcsr, IndexType>* result) override; - void convert_to( - Fbcsr>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Fbcsr>, + IndexType>; + + void convert_to(Fbcsr>, IndexType>* + result) const override; - void move_to(Fbcsr>, IndexType>* result) override; + void move_to(Fbcsr>, 
IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 9e030b5fc44..9edd3427bbe 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -42,7 +42,10 @@ template class Hybrid : public EnableLinOp>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Hybrid>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -358,17 +361,21 @@ class Hybrid friend class Hybrid, IndexType>; - friend class Hybrid>, IndexType>; - void convert_to( Hybrid, IndexType>* result) const override; void move_to(Hybrid, IndexType>* result) override; - void convert_to( - Hybrid>, IndexType>* result) const override; +#if GKO_ENABLE_HALF + friend class Hybrid>, + IndexType>; + + void convert_to(Hybrid>, + IndexType>* result) const override; - void move_to(Hybrid>, IndexType>* result) override; + void move_to(Hybrid>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index ae7db46a081..6fee03938fd 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -42,7 +42,10 @@ class Csr; template class Sellp : public EnableLinOp>, public ConvertibleTo, IndexType>>, - public ConvertibleTo>, IndexType>>, +#if GKO_ENABLE_HALF + public ConvertibleTo< + Sellp>, IndexType>>, +#endif public ConvertibleTo>, public ConvertibleTo>, public DiagonalExtractable, @@ -74,17 +77,22 @@ class Sellp : public EnableLinOp>, using absolute_type = remove_complex; friend class Sellp, IndexType>; - friend class Sellp>, IndexType>; void convert_to( Sellp, IndexType>* result) const override; void move_to(Sellp, IndexType>* result) override; - void convert_to( - Sellp>, IndexType>* result) const 
override; +#if GKO_ENABLE_HALF + friend class Sellp>, + IndexType>; + + void convert_to(Sellp>, IndexType>* + result) const override; - void move_to(Sellp>, IndexType>* result) override; + void move_to(Sellp>, IndexType>* + result) override; +#endif void convert_to(Dense* other) const override; From f0a8a075a0817f5828bb9d9901708d416d09b96b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 17:28:40 +0100 Subject: [PATCH 17/62] fix macro --- core/distributed/matrix.cpp | 2 +- core/distributed/vector.cpp | 2 +- core/matrix/coo.cpp | 2 +- core/matrix/csr.cpp | 2 +- core/matrix/dense.cpp | 2 +- core/matrix/diagonal.cpp | 2 +- core/matrix/ell.cpp | 2 +- core/matrix/fbcsr.cpp | 2 +- core/matrix/hybrid.cpp | 2 +- core/matrix/row_gatherer.cpp | 4 ++-- core/matrix/sellp.cpp | 2 +- core/solver/multigrid.cpp | 24 +++++++++++----------- include/ginkgo/config.hpp.in | 2 +- include/ginkgo/core/base/math.hpp | 6 +++--- include/ginkgo/core/base/mpi.hpp | 2 +- include/ginkgo/core/base/types.hpp | 5 ++++- include/ginkgo/core/distributed/matrix.hpp | 4 ++-- include/ginkgo/core/distributed/vector.hpp | 11 ++++++---- include/ginkgo/core/matrix/coo.hpp | 4 ++-- include/ginkgo/core/matrix/csr.hpp | 4 ++-- include/ginkgo/core/matrix/dense.hpp | 4 ++-- include/ginkgo/core/matrix/diagonal.hpp | 4 ++-- include/ginkgo/core/matrix/ell.hpp | 4 ++-- include/ginkgo/core/matrix/fbcsr.hpp | 4 ++-- include/ginkgo/core/matrix/hybrid.hpp | 4 ++-- include/ginkgo/core/matrix/sellp.hpp | 4 ++-- 26 files changed, 58 insertions(+), 52 deletions(-) diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp index 36b41c02b0b..93dab716f7c 100644 --- a/core/distributed/matrix.cpp +++ b/core/distributed/matrix.cpp @@ -237,7 +237,7 @@ void Matrix::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Matrix::convert_to( Matrix>, local_index_type, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index db97f66831e..9dea0c4d30b 100644 --- 
a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -299,7 +299,7 @@ void Vector::move_to(Vector>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Vector::convert_to( Vector>>* result) const diff --git a/core/matrix/coo.cpp b/core/matrix/coo.cpp index 1d1b1441e6b..6316e8e948a 100644 --- a/core/matrix/coo.cpp +++ b/core/matrix/coo.cpp @@ -231,7 +231,7 @@ void Coo::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Coo::convert_to( Coo>, IndexType>* result) const diff --git a/core/matrix/csr.cpp b/core/matrix/csr.cpp index 2a544315eb1..bdd0e0d13e4 100644 --- a/core/matrix/csr.cpp +++ b/core/matrix/csr.cpp @@ -321,7 +321,7 @@ void Csr::move_to( this->convert_to(result); } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Csr::convert_to( Csr>, IndexType>* result) const diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index d0670b2db32..3bc94f04011 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -603,7 +603,7 @@ void Dense::move_to(Dense>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Dense::convert_to( Dense>>* result) const diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 6a870ab6fde..f29c7e036b2 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -163,7 +163,7 @@ void Diagonal::move_to(Diagonal>* result) } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Diagonal::convert_to( Diagonal>>* result) const diff --git a/core/matrix/ell.cpp b/core/matrix/ell.cpp index 128725f66cf..70dcc1719ef 100644 --- a/core/matrix/ell.cpp +++ b/core/matrix/ell.cpp @@ -172,7 +172,7 @@ void Ell::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Ell::convert_to( Ell>, IndexType>* result) const diff --git a/core/matrix/fbcsr.cpp b/core/matrix/fbcsr.cpp index d71701a0f4a..44ba6c1e950 100644 --- a/core/matrix/fbcsr.cpp +++ b/core/matrix/fbcsr.cpp @@ -167,7 +167,7 @@ void Fbcsr::move_to( } -#if GKO_ENABLE_HALF 
+#if GINKGO_ENABLE_HALF template void Fbcsr::convert_to( Fbcsr>, IndexType>* const result) diff --git a/core/matrix/hybrid.cpp b/core/matrix/hybrid.cpp index 9f89cdce3db..4b36b7115ac 100644 --- a/core/matrix/hybrid.cpp +++ b/core/matrix/hybrid.cpp @@ -222,7 +222,7 @@ void Hybrid::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Hybrid::convert_to( Hybrid>, IndexType>* result) const diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 62982c612fe..20b592ce98a 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ -65,7 +65,7 @@ template void RowGatherer::apply_impl(const LinOp* in, LinOp* out) const { run, #endif float, double, std::complex, std::complex>( @@ -77,7 +77,7 @@ void RowGatherer::apply_impl(const LinOp* alpha, const LinOp* in, const LinOp* beta, LinOp* out) const { run, #endif float, double, std::complex, std::complex>( diff --git a/core/matrix/sellp.cpp b/core/matrix/sellp.cpp index fccd035a23e..d4cff180295 100644 --- a/core/matrix/sellp.cpp +++ b/core/matrix/sellp.cpp @@ -196,7 +196,7 @@ void Sellp::move_to( } -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template void Sellp::convert_to( Sellp>, IndexType>* result) const diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index b0b4fcab5ad..04e0192101e 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -317,11 +317,11 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto mg_level = mg_level_list.at(i); run, #endif std::complex, std::complex>( @@ -462,11 +462,11 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, } auto mg_level = multigrid->get_mg_level_list().at(level); run, #endif std::complex, std::complex>( @@ -718,11 +718,11 @@ void Multigrid::generate() } run, #endif std::complex, std::complex>( @@ -763,11 +763,11 @@ void Multigrid::generate() // generate coarsest solver run, #endif std::complex, std::complex>( @@ -887,11 +887,11 @@ void 
Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, }; auto first_mg_level = this->get_mg_level_list().front(); run, #endif std::complex, std::complex>(first_mg_level, lambda, b, @@ -933,11 +933,11 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, }; auto first_mg_level = this->get_mg_level_list().front(); run, #endif std::complex, std::complex>(first_mg_level, lambda, diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 9adea69b857..cf25dcd3c77 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -107,7 +107,7 @@ /* Is half operation available ? */ // clang-format off -#define GKO_ENABLE_HALF @GINKGO_ENABLE_HALF@ +#cmakedefine01 GINKGO_ENABLE_HALF // clang-format on diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index ec372452e08..34f1a36fa2e 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -410,7 +410,7 @@ namespace detail { template struct next_precision_impl {}; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template <> struct next_precision_impl { using type = float; @@ -424,7 +424,7 @@ struct next_precision_impl { template <> struct next_precision_impl { -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF using type = half; #else using type = float; @@ -540,7 +540,7 @@ using next_precision = typename detail::next_precision_impl::type; * @note Currently our lists contains only two elements, so this is the same as * next_precision. 
*/ -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF template using previous_precision = next_precision>; #else diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 57bb433b38a..293b4d64cf9 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -88,7 +88,7 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF // OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16 // TODO: it only works on the transferring GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT); diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index ceb82b96747..d6ada1d4fef 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -18,6 +18,9 @@ #include +#include + + #ifdef __HIPCC__ #include #endif // __HIPCC__ @@ -406,7 +409,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF #define GKO_ADAPT_HF(_macro) template _macro #else #define GKO_ADAPT_HF(_macro) \ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 21faa366f0f..1ae933d01fa 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -250,7 +250,7 @@ class Matrix Matrix>, public ConvertibleTo< Matrix, LocalIndexType, GlobalIndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>, LocalIndexType, GlobalIndexType>>, #endif @@ -282,7 +282,7 @@ class Matrix void move_to(Matrix, local_index_type, global_index_type>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Matrix>, LocalIndexType, GlobalIndexType>; diff --git a/include/ginkgo/core/distributed/vector.hpp 
b/include/ginkgo/core/distributed/vector.hpp index ac0262ccdb3..33128f8c8de 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -59,7 +59,7 @@ template class Vector : public EnableDistributedLinOp>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public EnableAbsoluteComputation>>, @@ -166,7 +166,7 @@ class Vector void move_to(Vector>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Vector>>; void convert_to(Vector>>* result) @@ -677,8 +677,6 @@ struct conversion_target_helper> { using target_type = experimental::distributed::Vector; using source_type = experimental::distributed::Vector>; - using snd_source_type = experimental::distributed::Vector< - previous_precision>>; static std::unique_ptr create_empty(const source_type* source) { @@ -686,12 +684,17 @@ struct conversion_target_helper> { source->get_communicator()); } +#if GINKGO_ENABLE_HALF + using snd_source_type = experimental::distributed::Vector< + previous_precision>>; + static std::unique_ptr create_empty( const snd_source_type* source) { return target_type::create(source->get_executor(), source->get_communicator()); } +#endif }; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 438e3fd1a7b..1971e93cb46 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -49,7 +49,7 @@ class Hybrid; template class Coo : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Coo>, IndexType>>, #endif @@ -91,7 +91,7 @@ class Coo : public EnableLinOp>, void move_to(Coo, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Coo>, IndexType>; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 319f2259ad3..40e94efe0a4 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ 
b/include/ginkgo/core/matrix/csr.hpp @@ -100,7 +100,7 @@ void strategy_rebuild_helper(Csr* result); template class Csr : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Csr>, IndexType>>, #endif @@ -699,7 +699,7 @@ class Csr : public EnableLinOp>, void move_to(Csr, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Csr>, IndexType>; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 9b2f2dab2ef..53159d054b0 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -80,7 +80,7 @@ template class Dense : public EnableLinOp>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public ConvertibleTo>, @@ -276,7 +276,7 @@ class Dense void move_to(Dense>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Dense>>; void convert_to(Dense>>* result) diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 84168d69368..89424dab2bf 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -42,7 +42,7 @@ class Diagonal public ConvertibleTo>, public ConvertibleTo>, public ConvertibleTo>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo>>>, #endif public Transposable, @@ -84,7 +84,7 @@ class Diagonal void move_to(Diagonal>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Diagonal>>; void convert_to(Diagonal>>* result) diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index 451f9311692..ca9e61ef4e8 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -51,7 +51,7 @@ class Hybrid; template class Ell : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public 
ConvertibleTo< Ell>, IndexType>>, #endif @@ -92,7 +92,7 @@ class Ell : public EnableLinOp>, void move_to(Ell, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Ell>, IndexType>; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 056c55e2224..20d4f78fa62 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -98,7 +98,7 @@ inline IndexType get_num_blocks(const int block_size, const IndexType size) template class Fbcsr : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Fbcsr>, IndexType>>, #endif @@ -156,7 +156,7 @@ class Fbcsr : public EnableLinOp>, void move_to(Fbcsr, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Fbcsr>, IndexType>; diff --git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index 9edd3427bbe..bba5d18f1da 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -42,7 +42,7 @@ template class Hybrid : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Hybrid>, IndexType>>, #endif @@ -366,7 +366,7 @@ class Hybrid void move_to(Hybrid, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF friend class Hybrid>, IndexType>; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 6fee03938fd..84786538cc1 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -42,7 +42,7 @@ class Csr; template class Sellp : public EnableLinOp>, public ConvertibleTo, IndexType>>, -#if GKO_ENABLE_HALF +#if GINKGO_ENABLE_HALF public ConvertibleTo< Sellp>, IndexType>>, #endif @@ -83,7 +83,7 @@ class Sellp : public EnableLinOp>, void move_to(Sellp, IndexType>* result) override; -#if GKO_ENABLE_HALF +#if 
GINKGO_ENABLE_HALF friend class Sellp>, IndexType>; From f041b4a1481139e4ab524642741c684d92e7f54b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 21:46:30 +0100 Subject: [PATCH 18/62] clean and refine the code --- accessor/cuda_helper.hpp | 3 ++ common/cuda_hip/base/math.hpp | 10 ----- .../unified/components/fill_array_kernels.cpp | 8 +++- core/stop/residual_norm.cpp | 4 +- cuda/CMakeLists.txt | 1 - cuda/base/types.hpp | 6 ++- cuda/solver/common_trs_kernels.cuh | 14 ++++--- dpcpp/CMakeLists.txt | 1 - dpcpp/components/cooperative_groups.dp.hpp | 6 --- dpcpp/matrix/csr_kernels.dp.cpp | 9 +---- dpcpp/matrix/dense_kernels.dp.cpp | 18 +-------- hip/CMakeLists.txt | 1 - hip/base/types.hip.hpp | 15 +++----- include/ginkgo/core/base/math.hpp | 26 ++++++------- .../ginkgo/core/base/precision_dispatch.hpp | 37 ++++++------------- include/ginkgo/ginkgo.hpp | 1 + omp/CMakeLists.txt | 1 - omp/solver/idr_kernels.cpp | 12 +++--- reference/CMakeLists.txt | 2 - reference/matrix/diagonal_kernels.cpp | 1 - test/mpi/matrix.cpp | 4 +- 21 files changed, 64 insertions(+), 116 deletions(-) diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 1a5404b1738..6ea7b6881d9 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -17,6 +17,9 @@ #include "utils.hpp" +struct __half; + + namespace gko { namespace acc { namespace detail { diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 0278fbbc711..fa8ba747c27 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -33,16 +33,6 @@ struct remove_complex_impl> { }; -// template -// struct is_complex_impl> -// : public std::integral_constant {}; - - -// template -// struct is_complex_or_scalar_impl> -// : is_complex_or_scalar_impl {}; - - template struct truncate_type_impl> { using type = thrust::complex::type>; diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp index 
5481be27e32..b5870ecbe29 100644 --- a/common/unified/components/fill_array_kernels.cpp +++ b/common/unified/components/fill_array_kernels.cpp @@ -31,9 +31,13 @@ template void fill_seq_array(std::shared_ptr exec, ValueType* array, size_type n) { + // __half only has long long not int64_t run_kernel( - exec, [] GKO_KERNEL(auto idx, auto array) { array[idx] = static_cast(idx); }, n, - array); + exec, + [] GKO_KERNEL(auto idx, auto array) { + array[idx] = static_cast(idx); + }, + n, array); } GKO_INSTANTIATE_FOR_EACH_TEMPLATE_TYPE(GKO_DECLARE_FILL_SEQ_ARRAY_KERNEL); diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index 672d273db65..4e73cc8d56a 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -97,8 +97,8 @@ ResidualNormBase::ResidualNormBase( baseline_{baseline}, system_matrix_{args.system_matrix}, b_{args.b}, - one_{gko::initialize({one()}, exec)}, - neg_one_{gko::initialize({-one()}, exec)}, + one_{gko::initialize({1}, exec)}, + neg_one_{gko::initialize({-1}, exec)}, reduction_tmp_{exec} { switch (baseline_) { diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index fcf9ac4b885..000cb7b215f 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -61,7 +61,6 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") endif() endif() -target_compile_definitions(ginkgo_cuda PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_cuda) target_compile_definitions(ginkgo_cuda PRIVATE GKO_COMPILING_CUDA GKO_DEVICE_NAMESPACE=cuda) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 2dc22cf3712..2f7f01c1e24 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -62,7 +62,8 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +// from the cuda_fp16.hpp +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 template <> @@ -95,7 +96,8 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) namespace kernels { namespace cuda { -#if 
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 90cdb362855..4ebd4c26e0a 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -212,14 +212,15 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; - // In nullptr is considered nullptr_t not casted to const ValueType* + // TODO: In nullptr is considered nullptr_t not casted to const + // ValueType* it works as expected now sparselib::buffer_size_ext( handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, - solve_info, policy, &work_size); + matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -229,8 +230,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, - solve_info, policy, work.get_data()); + matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + work.get_data()); } void solve(const matrix::Csr* matrix, @@ -504,7 +505,8 @@ __global__ void sptrsv_naive_legacy_kernel( const auto row_end = is_upper ? rowptrs[row] - 1 : rowptrs[row + 1]; const int row_step = is_upper ? 
-1 : 1; - ValueType sum = ValueType{0.0}; + // no constructor from double to thrust<__half> + ValueType sum = zero(); auto j = row_begin; auto col = colidxs[j]; while (j != row_end) { diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 8d658bb6994..851ef9a3dc6 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -120,7 +120,6 @@ target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) if (GINKGO_DPCPP_SINGLE_MODE) target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_DPCPP_SINGLE_MODE=1) endif() -target_compile_definitions(ginkgo_dpcpp PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_default_includes(ginkgo_dpcpp) ginkgo_install_library(ginkgo_dpcpp) diff --git a/dpcpp/components/cooperative_groups.dp.hpp b/dpcpp/components/cooperative_groups.dp.hpp index 89f5839676e..034bf4baf28 100644 --- a/dpcpp/components/cooperative_groups.dp.hpp +++ b/dpcpp/components/cooperative_groups.dp.hpp @@ -13,12 +13,6 @@ #include "dpcpp/base/config.hpp" #include "dpcpp/base/dpct.hpp" -// namespace sycl { -// namespace detail { -// template <> -// struct is_arithmetic : public std::false_type {}; -// } // namespace detail -// } // namespace sycl namespace gko { namespace kernels { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 4a1382c3bb3..5d72e767693 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -31,6 +31,7 @@ #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" +#include "dpcpp/base/onemkl_bindings.hpp" #include "dpcpp/components/atomic.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -1384,14 +1385,6 @@ void load_balance_spmv(std::shared_ptr exec, } } -template -struct onemkl_support : std::false_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support : std::true_type {}; template bool try_general_sparselib_spmv(std::shared_ptr 
exec, diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 01c0cc8b3ba..fae9f02b516 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -209,20 +209,6 @@ void compute_norm2_dispatch(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); -template -struct onemkl_support : std::false_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support : std::true_type {}; - -template <> -struct onemkl_support> : std::true_type {}; - -template <> -struct onemkl_support> : std::true_type {}; template void simple_apply(std::shared_ptr exec, @@ -231,7 +217,7 @@ void simple_apply(std::shared_ptr exec, matrix::Dense* c) { using namespace oneapi::mkl; - if constexpr (onemkl_support::value) { + if constexpr (onemkl::is_supported::value) { if (b->get_stride() != 0 && c->get_stride() != 0) { if (a->get_size()[1] > 0) { oneapi::mkl::blas::row_major::gemm( @@ -259,7 +245,7 @@ void apply(std::shared_ptr exec, const matrix::Dense* beta, matrix::Dense* c) { using namespace oneapi::mkl; - if constexpr (onemkl_support::value) { + if constexpr (onemkl::is_supported::value) { if (b->get_stride() != 0 && c->get_stride() != 0) { if (a->get_size()[1] > 0) { oneapi::mkl::blas::row_major::gemm( diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 2c882af99de..7d914d57a81 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -70,7 +70,6 @@ if (GINKGO_HAVE_ROCTX) endif() target_compile_options(ginkgo_hip PRIVATE $<$:${GINKGO_COMPILER_FLAGS}>) -target_compile_definitions(ginkgo_hip PRIVATE GINKGO_COMPILE_KERNEL=1) ginkgo_compile_features(ginkgo_hip) diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 8b397802e84..41633221e5c 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -28,19 +28,17 @@ // thrust calls the c function not the function from std // Maybe override the function from 
thrust directlry -GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) +__device__ __forceinline__ __half hypot(__half a, __half b) { return hypot(static_cast(a), static_cast(b)); } -GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( +__device__ __forceinline__ thrust::complex<__half> sqrt( thrust::complex<__half> a) { return sqrt(static_cast>(a)); } -// __device__ __forceinline__ float sqrt(float val) { return sqrtf(val); } -// __device__ __forceinline__ double sqrt(double val) { return ::sqrt(val); } __device__ __forceinline__ thrust::complex sqrt( thrust::complex val) { @@ -90,7 +88,8 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { #if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ >= 700 +// from the cuda_fp16.hpp +#if __CUDA_ARCH__ >= 530 __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); @@ -109,7 +108,7 @@ __device__ __forceinline__ __half abs(const __half& val) } #endif -#elif defined(__HIP_DEVICE_COMPILE__) +#else // Not nvidia device __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); @@ -120,10 +119,6 @@ __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #endif -// #if defined(__HIPCC__) - -// #endif - namespace kernels { namespace hip { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 34f1a36fa2e..c7e2eb855ec 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -490,6 +490,11 @@ struct arth_type { using type = float; }; +template +struct arth_type> { + using type = std::complex::type>; +}; + template struct infinity_impl { // CUDA doesn't allow us to call std::numeric_limits functions @@ -701,7 +706,7 @@ GKO_INLINE constexpr int64 ceildiv(int64 num, int64 den) template GKO_INLINE constexpr T zero() { - return T(0.0); + return T{}; } @@ -729,7 +734,7 @@ GKO_INLINE constexpr T zero(const T&) template GKO_INLINE constexpr T one() { - return T(1.0); + return T(1); } @@ -1023,7 
+1028,7 @@ template GKO_INLINE constexpr std::enable_if_t::value, T> abs( const T& x) { - return x >= zero() ? x : static_cast(-x); + return x >= zero() ? x : -x; } @@ -1209,20 +1214,13 @@ GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( * @return NaN. */ template -GKO_INLINE constexpr std::enable_if_t< - !is_complex_s::value && !std::is_same::value, T> +GKO_INLINE constexpr std::enable_if_t::value, + typename detail::arth_type::type> nan() { return std::numeric_limits::quiet_NaN(); } -template -GKO_INLINE constexpr std::enable_if_t::value, float> nan() -{ - return std::numeric_limits::quiet_NaN(); -} - - /** * Returns a complex with both components quiet NaN. * @@ -1231,7 +1229,9 @@ GKO_INLINE constexpr std::enable_if_t::value, float> nan() * @return complex{NaN, NaN}. */ template -GKO_INLINE constexpr std::enable_if_t::value, T> nan() +GKO_INLINE constexpr std::enable_if_t::value, + typename detail::arth_type::type> +nan() { return T{nan>(), nan>()}; } diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index e028336f202..9d8ec1c9cb3 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -49,18 +49,13 @@ make_temporary_conversion(Ptr&& matrix) using Pointee = detail::pointee; using Dense = matrix::Dense; using NextDense = matrix::Dense>; + using NextNextDense = matrix::Dense>>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; auto result = detail::temporary_conversion< - MaybeConstDense>::template create(matrix); + MaybeConstDense>::template create(matrix); if (!result) { - result = detail::temporary_conversion>:: - template create< - matrix::Dense>>>( - matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + GKO_NOT_SUPPORTED(matrix); } return result; } @@ -358,16 +353,11 @@ make_temporary_conversion(LinOp* matrix) auto result = detail::temporary_conversion< experimental::distributed::Vector>:: 
template create< - experimental::distributed::Vector>>( - matrix); - if (!result) { - result = detail::temporary_conversion< - experimental::distributed::Vector>:: - template create>, + experimental::distributed::Vector< next_precision>>>(matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + if (!result) { + GKO_NOT_SUPPORTED(matrix); } return result; } @@ -383,16 +373,11 @@ make_temporary_conversion(const LinOp* matrix) auto result = detail::temporary_conversion< const experimental::distributed::Vector>:: template create< - experimental::distributed::Vector>>( - matrix); - if (!result) { - result = detail::temporary_conversion< - const experimental::distributed::Vector>:: - template create>, + experimental::distributed::Vector< next_precision>>>(matrix); - if (!result) { - GKO_NOT_SUPPORTED(matrix); - } + if (!result) { + GKO_NOT_SUPPORTED(matrix); } return result; } diff --git a/include/ginkgo/ginkgo.hpp b/include/ginkgo/ginkgo.hpp index 0fab93dcefe..371efaaac39 100644 --- a/include/ginkgo/ginkgo.hpp +++ b/include/ginkgo/ginkgo.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index e8379a77535..41bec80673f 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -81,7 +81,6 @@ target_include_directories(ginkgo_omp PRIVATE "${OpenMP_CXX_INCLUDE_DIRS}") # and the compiler is unhappy with the quotation marks. 
separate_arguments(OpenMP_SEP_FLAGS NATIVE_COMMAND "${OpenMP_CXX_FLAGS}") target_compile_options(ginkgo_omp PRIVATE "${OpenMP_SEP_FLAGS}") -target_compile_definitions(ginkgo_omp PRIVATE GINKGO_COMPILE_KERNEL=1) # Need to link against ginkgo_cuda for the `raw_copy_to(CudaExecutor ...)` method target_link_libraries(ginkgo_omp PRIVATE ginkgo_cuda) diff --git a/omp/solver/idr_kernels.cpp b/omp/solver/idr_kernels.cpp index 388e10342ad..26f490b603f 100644 --- a/omp/solver/idr_kernels.cpp +++ b/omp/solver/idr_kernels.cpp @@ -135,16 +135,16 @@ void initialize(std::shared_ptr exec, const size_type nrhs, // Initialize and Orthonormalize P const auto num_rows = subspace_vectors->get_size()[0]; const auto num_cols = subspace_vectors->get_size()[1]; - // auto dist = - // std::normal_distribution>(0.0, 1.0); + auto dist = std::normal_distribution< + typename detail::arth_type>::type>(0.0, 1.0); auto seed = std::random_device{}(); auto gen = std::default_random_engine(seed); for (size_type row = 0; row < num_rows; row++) { if (!deterministic) { - // for (size_type col = 0; col < num_cols; col++) { - // subspace_vectors->at(row, col) = - // get_rand_value(dist, gen); - // } + for (size_type col = 0; col < num_cols; col++) { + subspace_vectors->at(row, col) = + get_rand_value(dist, gen); + } } for (size_type i = 0; i < row; i++) { diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt index 7a02998e927..83652eb8783 100644 --- a/reference/CMakeLists.txt +++ b/reference/CMakeLists.txt @@ -79,8 +79,6 @@ ginkgo_install_library(ginkgo_reference) if (CMAKE_CXX_COMPILER_ID MATCHES "PGI|NVHPC") set_source_files_properties(preconditioner/jacobi_kernels.cpp PROPERTIES COMPILE_FLAGS "-O1") endif() - -target_compile_definitions(ginkgo_reference PRIVATE GINKGO_COMPILE_KERNEL=1) if (GINKGO_CHECK_CIRCULAR_DEPS) ginkgo_check_headers(ginkgo_reference "") endif() diff --git a/reference/matrix/diagonal_kernels.cpp b/reference/matrix/diagonal_kernels.cpp index 1193c81ecdb..028b7685c2b 100644 
--- a/reference/matrix/diagonal_kernels.cpp +++ b/reference/matrix/diagonal_kernels.cpp @@ -6,7 +6,6 @@ #include #include -#include "core/base/extended_float.hpp" namespace gko { diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index c22d83d8014..d1b03fce8a1 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -690,7 +690,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{static_cast>(r::value)}; + : gko::remove_complex{r::value}; this->dist_mat->convert_to(tmp); tmp->convert_to(res); @@ -717,7 +717,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{static_cast>(r::value)}; + : gko::remove_complex{r::value}; this->dist_mat->move_to(tmp); tmp->convert_to(res); From 3154a04176a9f538d1312ca0322f7fa0e26b79d4 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 8 Feb 2023 22:19:17 +0100 Subject: [PATCH 19/62] move half.hpp out of type.hpp --- core/base/device_matrix_data_kernels.hpp | 1 + core/base/extended_float.hpp | 2 + core/base/mixed_precision_types.hpp | 1 + core/base/mtx_io.cpp | 1 + core/base/utils.hpp | 1 + core/components/absolute_array_kernels.hpp | 1 + core/components/fill_array_kernels.hpp | 1 + core/components/format_conversion_kernels.hpp | 1 + .../precision_conversion_kernels.hpp | 1 + core/components/prefix_sum_kernels.hpp | 1 + core/components/reduce_array_kernels.hpp | 1 + core/distributed/matrix_kernels.hpp | 1 + core/factorization/cholesky_kernels.hpp | 1 + core/factorization/factorization_kernels.hpp | 1 + core/factorization/ilu_kernels.hpp | 1 + core/factorization/lu_kernels.hpp | 1 + core/factorization/par_ic_kernels.hpp | 1 + core/factorization/par_ict_kernels.hpp | 1 + core/factorization/par_ilu_kernels.hpp | 1 + core/factorization/par_ilut_kernels.hpp | 1 + core/matrix/coo_kernels.hpp | 1 + core/matrix/csr_kernels.hpp | 1 + core/matrix/csr_lookup.hpp | 1 + core/matrix/dense_kernels.hpp | 2 + core/matrix/diagonal_kernels.hpp | 1 + core/matrix/fbcsr_kernels.hpp | 1 + core/matrix/fft_kernels.hpp | 1 + core/matrix/row_gatherer.cpp | 1 + core/matrix/sparsity_csr_kernels.hpp | 1 + core/multigrid/pgm.cpp | 1 + core/preconditioner/jacobi_utils.hpp | 1 + core/reorder/rcm_kernels.hpp | 1 + core/solver/bicg_kernels.hpp | 1 + core/solver/bicgstab_kernels.hpp | 1 + core/solver/cb_gmres.cpp | 1 + core/solver/cb_gmres_accessor.hpp | 1 + core/solver/cb_gmres_kernels.hpp | 1 + core/solver/cg_kernels.hpp | 1 + core/solver/cgs_kernels.hpp | 1 + core/solver/common_gmres_kernels.hpp | 1 + core/solver/gmres_kernels.hpp | 1 + core/solver/idr_kernels.hpp | 1 + core/solver/ir_kernels.hpp | 1 + core/solver/multigrid.cpp | 1 + core/solver/multigrid_kernels.hpp | 1 + core/stop/criterion_kernels.hpp | 1 + core/stop/residual_norm_kernels.hpp | 1 + .../accessor/reduced_row_major_ginkgo.cpp | 2 + 
core/test/utils.hpp | 1 + core/test/utils/assertions.hpp | 1 + cuda/base/types.hpp | 1 + hip/base/types.hip.hpp | 1 + include/ginkgo/core/base/array.hpp | 1 + include/ginkgo/core/base/dim.hpp | 1 + include/ginkgo/core/base/exception.hpp | 60 ++++++++++--------- include/ginkgo/core/base/executor.hpp | 1 + include/ginkgo/core/base/half.hpp | 1 + include/ginkgo/core/base/index_set.hpp | 1 + include/ginkgo/core/base/intrinsics.hpp | 1 + include/ginkgo/core/base/lin_op.hpp | 1 + include/ginkgo/core/base/math.hpp | 7 +++ .../ginkgo/core/base/matrix_assembly_data.hpp | 1 + include/ginkgo/core/base/matrix_data.hpp | 1 + include/ginkgo/core/base/mpi.hpp | 1 + include/ginkgo/core/base/range.hpp | 1 + include/ginkgo/core/base/range_accessors.hpp | 1 + include/ginkgo/core/base/types.hpp | 2 +- include/ginkgo/core/base/utils_helper.hpp | 1 + include/ginkgo/core/base/version.hpp | 1 + include/ginkgo/core/distributed/partition.hpp | 1 + .../core/factorization/factorization.hpp | 1 + include/ginkgo/core/factorization/ic.hpp | 1 + include/ginkgo/core/factorization/ilu.hpp | 1 + include/ginkgo/core/factorization/par_ic.hpp | 1 + include/ginkgo/core/factorization/par_ict.hpp | 1 + include/ginkgo/core/factorization/par_ilu.hpp | 1 + .../ginkgo/core/factorization/par_ilut.hpp | 1 + include/ginkgo/core/log/logger.hpp | 1 + include/ginkgo/core/matrix/dense.hpp | 1 + include/ginkgo/core/matrix/permutation.hpp | 1 + include/ginkgo/core/matrix/row_gatherer.hpp | 1 + .../core/multigrid/fixed_coarsening.hpp | 1 + include/ginkgo/core/multigrid/pgm.hpp | 1 + include/ginkgo/core/reorder/rcm.hpp | 1 + .../ginkgo/core/reorder/scaled_reordered.hpp | 1 + include/ginkgo/core/solver/bicg.hpp | 1 + include/ginkgo/core/solver/bicgstab.hpp | 1 + include/ginkgo/core/solver/cb_gmres.hpp | 1 + include/ginkgo/core/solver/cg.hpp | 1 + include/ginkgo/core/solver/cgs.hpp | 1 + include/ginkgo/core/solver/fcg.hpp | 1 + include/ginkgo/core/solver/gmres.hpp | 1 + include/ginkgo/core/solver/idr.hpp | 1 + 
include/ginkgo/core/solver/ir.hpp | 1 + include/ginkgo/core/solver/multigrid.hpp | 1 + include/ginkgo/core/solver/triangular.hpp | 1 + include/ginkgo/core/stop/stopping_status.hpp | 1 + omp/components/atomic.hpp | 1 + 98 files changed, 139 insertions(+), 28 deletions(-) diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp index bcaeebdf0cb..019427b4a83 100644 --- a/core/base/device_matrix_data_kernels.hpp +++ b/core/base/device_matrix_data_kernels.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/base/extended_float.hpp b/core/base/extended_float.hpp index f6b2e6e5309..13d104efed1 100644 --- a/core/base/extended_float.hpp +++ b/core/base/extended_float.hpp @@ -9,6 +9,8 @@ #include #include +#include +#include #include diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 27794c2c9bf..0f1f9869f91 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -7,6 +7,7 @@ #include +#include #include diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 3ca4f7e9d3a..30c6de08b75 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -14,6 +14,7 @@ #include #include +#include #include #include diff --git a/core/base/utils.hpp b/core/base/utils.hpp index 061c6e303ed..157a82b1a8f 100644 --- a/core/base/utils.hpp +++ b/core/base/utils.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp index 7617883cd1c..8965215fb96 100644 --- a/core/components/absolute_array_kernels.hpp +++ b/core/components/absolute_array_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp index 2608cabe409..5bb18ada799 100644 --- a/core/components/fill_array_kernels.hpp 
+++ b/core/components/fill_array_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 10be3a10232..5f4ad5e519e 100644 --- a/core/components/format_conversion_kernels.hpp +++ b/core/components/format_conversion_kernels.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp index 8443a657502..3157a04c703 100644 --- a/core/components/precision_conversion_kernels.hpp +++ b/core/components/precision_conversion_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp index 8b68b54e29f..aa4a812cc73 100644 --- a/core/components/prefix_sum_kernels.hpp +++ b/core/components/prefix_sum_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp index b124e6ec2e3..ef79c3b18be 100644 --- a/core/components/reduce_array_kernels.hpp +++ b/core/components/reduce_array_kernels.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index f24e8c9945e..e3f5801ad03 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index db889ce1162..630707cdd18 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -9,6 +9,7 @@ #include #include 
+#include #include #include diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp index bab3dd16bd2..c73856fd44a 100644 --- a/core/factorization/factorization_kernels.hpp +++ b/core/factorization/factorization_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp index 2371c17fda4..1b9fd4cb590 100644 --- a/core/factorization/ilu_kernels.hpp +++ b/core/factorization/ilu_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index f497398cb90..9c26cc95736 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp index 59d2d97ffce..8827c55a2ee 100644 --- a/core/factorization/par_ic_kernels.hpp +++ b/core/factorization/par_ic_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp index 25172c0d649..29ac0def3e1 100644 --- a/core/factorization/par_ict_kernels.hpp +++ b/core/factorization/par_ict_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index 16d20859c3e..1fde2d7abab 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ b/core/factorization/par_ilu_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp index 2d8ac7b4f88..ede8d858792 100644 --- a/core/factorization/par_ilut_kernels.hpp +++ b/core/factorization/par_ilut_kernels.hpp @@ -9,6 +9,7 @@ #include 
#include +#include #include #include #include diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index a2cc44b74d9..50833621675 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_MATRIX_COO_KERNELS_HPP_ +#include #include #include #include diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 6013e014c8a..23676c1810e 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index a7b687c3618..129736841c6 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -8,6 +8,7 @@ #include +#include #include #include #include diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 7422b431aa0..95b1ca754f5 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -8,6 +8,8 @@ #include + +#include #include #include #include diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp index 630c76e43ad..4baf2fa8bc5 100644 --- a/core/matrix/diagonal_kernels.hpp +++ b/core/matrix/diagonal_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_MATRIX_DIAGONAL_KERNELS_HPP_ +#include #include #include #include diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index 7a644d48d78..9f8d10d5be9 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/matrix/fft_kernels.hpp b/core/matrix/fft_kernels.hpp index bd0e231c394..b843f65521c 100644 --- a/core/matrix/fft_kernels.hpp +++ b/core/matrix/fft_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include diff --git a/core/matrix/row_gatherer.cpp b/core/matrix/row_gatherer.cpp index 20b592ce98a..56fcbf93d88 100644 --- a/core/matrix/row_gatherer.cpp +++ b/core/matrix/row_gatherer.cpp @@ 
-4,6 +4,7 @@ #include "ginkgo/core/matrix/row_gatherer.hpp" +#include #include #include "core/base/dispatch_helper.hpp" diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index e07bb980dce..655c5a76dde 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_ +#include #include #include #include diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 1cef0f0d77c..7e7ccf24037 100644 --- a/core/multigrid/pgm.cpp +++ b/core/multigrid/pgm.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index b0aa8b5f38a..1320524a1a7 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -6,6 +6,7 @@ #define GKO_CORE_PRECONDITIONER_JACOBI_UTILS_HPP_ +#include #include #include diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp index a89b2732cb0..737182a63d3 100644 --- a/core/reorder/rcm_kernels.hpp +++ b/core/reorder/rcm_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp index 5e94d8ca350..46fadb8ccc4 100644 --- a/core/solver/bicg_kernels.hpp +++ b/core/solver/bicg_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index e3bfbdcdcb6..cc2ebada4c1 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/cb_gmres.cpp b/core/solver/cb_gmres.cpp index 4ba329d7252..c1d084fb123 100644 --- a/core/solver/cb_gmres.cpp +++ b/core/solver/cb_gmres.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include 
diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index a5d95793d15..72e9618cc0e 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index 29a84f25ba1..1f012416a39 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index bec5f04d0e5..dc05acd169c 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index d64aeedb549..9b0847b858b 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp index 0209284c446..cd7eb821d3d 100644 --- a/core/solver/common_gmres_kernels.hpp +++ b/core/solver/common_gmres_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index 21bb5854816..31feeee5e84 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp index 3d579bd01af..e988febf0ac 100644 --- a/core/solver/idr_kernels.hpp +++ b/core/solver/idr_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index a411c9f375d..f6fa94cef66 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -10,6 +10,7 @@ #include 
#include +#include #include #include diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 04e0192101e..2ae444a3d82 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp index 73c660cbefb..c6f5c0abc50 100644 --- a/core/solver/multigrid_kernels.hpp +++ b/core/solver/multigrid_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp index 62e4135ee37..014763c6079 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include diff --git a/core/stop/residual_norm_kernels.hpp b/core/stop/residual_norm_kernels.hpp index 7625dadefeb..665004d37bc 100644 --- a/core/stop/residual_norm_kernels.hpp +++ b/core/stop/residual_norm_kernels.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp index 7acad0b9638..13ab40cf933 100644 --- a/core/test/accessor/reduced_row_major_ginkgo.cpp +++ b/core/test/accessor/reduced_row_major_ginkgo.cpp @@ -10,6 +10,8 @@ #include +#include // necessary for gko::half + #include "accessor/index_span.hpp" #include "accessor/range.hpp" #include "accessor/reduced_row_major.hpp" diff --git a/core/test/utils.hpp b/core/test/utils.hpp index bfdcf1ee35a..3b6dd13444d 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -15,6 +15,7 @@ #include +#include #include #include #include diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index d082d7992ec..3a275b8ee53 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include diff --git a/cuda/base/types.hpp 
b/cuda/base/types.hpp index 2f7f01c1e24..efcfe8f3d2d 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 41633221e5c..fff60d6d798 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -21,6 +21,7 @@ #endif #include +#include #include #include "common/cuda_hip/base/runtime.hpp" diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index e0cf8c22ab3..a2ba2a394ba 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index ffa38aa6a76..3ad0ee7f619 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -8,6 +8,7 @@ #include +#include #include diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index febc5e17034..74a06c60c7b 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -9,25 +9,24 @@ #include #include +#include #include namespace gko { - - /** * The Error class is used to report exceptional behaviour in library * functions. Ginkgo uses C++ exception mechanism to this end, and the - * Error class represents a base class for all types of errors. The exact list - * of errors which could occur during the execution of a certain library - * routine is provided in the documentation of that routine, along with a short - * description of the situation when that error can occur. - * During runtime, these errors can be detected by using standard C++ try-catch - * blocks, and a human-readable error description can be obtained by calling - * the Error::what() method. + * Error class represents a base class for all types of errors. 
The exact + * list of errors which could occur during the execution of a certain + * library routine is provided in the documentation of that routine, along + * with a short description of the situation when that error can occur. + * During runtime, these errors can be detected by using standard C++ + * try-catch blocks, and a human-readable error description can be obtained + * by calling the Error::what() method. * - * As an example, trying to compute a matrix-vector product with arguments of - * incompatible size will result in a DimensionMismatch error, which is + * As an example, trying to compute a matrix-vector product with arguments + * of incompatible size will result in a DimensionMismatch error, which is * demonstrated in the following program. * * ```cpp @@ -68,8 +67,8 @@ class Error : public std::exception { {} /** - * Returns a human-readable string with a more detailed description of the - * error. + * Returns a human-readable string with a more detailed description of + * the error. */ virtual const char* what() const noexcept override { return what_.c_str(); } @@ -98,8 +97,8 @@ class NotImplemented : public Error { /** - * NotCompiled is thrown when attempting to call an operation which is a part of - * a module that was not compiled on the system. + * NotCompiled is thrown when attempting to call an operation which is a + * part of a module that was not compiled on the system. */ class NotCompiled : public Error { public: @@ -236,7 +235,8 @@ class CurandError : public Error { /** - * CusparseError is thrown when a cuSPARSE routine throws a non-zero error code. + * CusparseError is thrown when a cuSPARSE routine throws a non-zero error + * code. */ class CusparseError : public Error { public: @@ -305,7 +305,8 @@ class HipError : public Error { /** - * HipblasError is thrown when a hipBLAS routine throws a non-zero error code. + * HipblasError is thrown when a hipBLAS routine throws a non-zero error + * code. 
*/ class HipblasError : public Error { public: @@ -328,7 +329,8 @@ class HipblasError : public Error { /** - * HiprandError is thrown when a hipRAND routine throws a non-zero error code. + * HiprandError is thrown when a hipRAND routine throws a non-zero error + * code. */ class HiprandError : public Error { public: @@ -435,7 +437,8 @@ class DimensionMismatch : public Error { * @param second_name The name of the second operator * @param second_rows The output dimension of the second operator * @param second_cols The input dimension of the second operator - * @param clarification An additional message describing the error further + * @param clarification An additional message describing the error + * further */ DimensionMismatch(const std::string& file, int line, const std::string& func, const std::string& first_name, @@ -467,7 +470,8 @@ class BadDimension : public Error { * @param op_name The name of the operator * @param op_num_rows The row dimension of the operator * @param op_num_cols The column dimension of the operator - * @param clarification An additional message further describing the error + * @param clarification An additional message further describing the + * error */ BadDimension(const std::string& file, int line, const std::string& func, const std::string& op_name, size_type op_num_rows, @@ -483,8 +487,8 @@ class BadDimension : public Error { /** * Error that denotes issues between block sizes and matrix dimensions * - * \tparam IndexType Type of index used by the linear algebra object that is - * incompatible with the required block size. + * \tparam IndexType Type of index used by the linear algebra object that + * is incompatible with the required block size. */ template class BlockSizeError : public Error { @@ -517,7 +521,8 @@ class ValueMismatch : public Error { * @param func The function name where the error occurred * @param val1 The first value to be compared. * @param val2 The second value to be compared. 
- * @param clarification An additional message further describing the error + * @param clarification An additional message further describing the + * error */ ValueMismatch(const std::string& file, int line, const std::string& func, size_type val1, size_type val2, @@ -576,8 +581,9 @@ class OutOfBoundsError : public Error { /** - * OverflowError is thrown when an index calculation for storage requirements - * overflows. This most likely means that the index type is too small. + * OverflowError is thrown when an index calculation for storage + * requirements overflows. This most likely means that the index type is too + * small. */ class OverflowError : public Error { public: @@ -614,8 +620,8 @@ class StreamError : public Error { /** - * KernelNotFound is thrown if Ginkgo cannot find a kernel which satisfies the - * criteria imposed by the input arguments. + * KernelNotFound is thrown if Ginkgo cannot find a kernel which satisfies + * the criteria imposed by the input arguments. */ class KernelNotFound : public Error { public: diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 963e30bfddd..ec3c42eb387 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 09b3c7a0686..1ae96bd942d 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include #include diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 260896d6b2f..7285a3ff880 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index 37e7f361781..f5220c384a3 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -8,6 +8,7 @@ #include +#include #include diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 26e1c1b9baa..06f874374a6 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index c7e2eb855ec..d2e60ea677d 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -27,9 +28,14 @@ class __half; + namespace thrust { + + template class complex; + + } namespace std { @@ -41,6 +47,7 @@ inline gko::half abs(std::complex a) return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); } + inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } inline std::complex sqrt(std::complex a) diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp index 6993f2004f2..ac3ab91e687 100644 --- a/include/ginkgo/core/base/matrix_assembly_data.hpp +++ b/include/ginkgo/core/base/matrix_assembly_data.hpp @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 218c79a6fea..4c6f146e474 100644 --- 
a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index 293b4d64cf9..d42025589ee 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 680bc47bcb6..716f65fa797 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -8,6 +8,7 @@ #include +#include #include #include #include diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index 56335b8dd97..5401988f963 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -8,6 +8,7 @@ #include +#include #include #include diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index d6ada1d4fef..14e34529a3b 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -870,5 +870,5 @@ using comm_index_type = int; } // namespace experimental } // namespace gko -#include + #endif // GKO_PUBLIC_CORE_BASE_TYPES_HPP_ diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index 951ea4bbf5d..c19460a2c65 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -11,6 +11,7 @@ #include #include +#include #include #include diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index 9fad9430527..2f8efd1cbce 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ -9,6 +9,7 @@ #include #include +#include #include diff --git a/include/ginkgo/core/distributed/partition.hpp 
b/include/ginkgo/core/distributed/partition.hpp index 89adb22f3e7..ba4835a4c3f 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -7,6 +7,7 @@ #include +#include #include #include diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp index 39345f59a44..01cfa2aec3b 100644 --- a/include/ginkgo/core/factorization/factorization.hpp +++ b/include/ginkgo/core/factorization/factorization.hpp @@ -7,6 +7,7 @@ #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp index 616360ce039..c430c914207 100644 --- a/include/ginkgo/core/factorization/ic.hpp +++ b/include/ginkgo/core/factorization/ic.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp index 80f11ab7b6f..839035c5e0e 100644 --- a/include/ginkgo/core/factorization/ilu.hpp +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index b5f14a997b4..54c0b3eeb66 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index bc2e38eadf4..d4cf34b137a 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 88d183a939c..5c97718bc2c 100644 --- 
a/include/ginkgo/core/factorization/par_ilu.hpp +++ b/include/ginkgo/core/factorization/par_ilu.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index c73e3a1b905..afd2d834ab6 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index dd9d30249e9..7a75fe0d111 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 53159d054b0..9033adbf766 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index 5549b75f694..ee5ec427816 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/matrix/row_gatherer.hpp b/include/ginkgo/core/matrix/row_gatherer.hpp index bf55f03bdb0..ad1254207c0 100644 --- a/include/ginkgo/core/matrix/row_gatherer.hpp +++ b/include/ginkgo/core/matrix/row_gatherer.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp index 86c21acba39..becc149e433 100644 --- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp +++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp @@ -10,6 +10,7 @@ #include 
#include +#include #include #include #include diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp index d07001be2f1..ebb19fd2c89 100644 --- a/include/ginkgo/core/multigrid/pgm.hpp +++ b/include/ginkgo/core/multigrid/pgm.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 589d38e29d1..661dea03c55 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/reorder/scaled_reordered.hpp b/include/ginkgo/core/reorder/scaled_reordered.hpp index 862a2135bca..65627ec3e4c 100644 --- a/include/ginkgo/core/reorder/scaled_reordered.hpp +++ b/include/ginkgo/core/reorder/scaled_reordered.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index 2a43c1ca3f8..36188946b96 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index a57a6c27aa4..e89e65387b7 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index 976712cd673..5cab2c466eb 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index 984d5d1f104..6ba5efe4226 100644 --- a/include/ginkgo/core/solver/cg.hpp 
+++ b/include/ginkgo/core/solver/cg.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index bde23d76910..ef5d0ac5226 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index dfaf252b557..5d3f60de0ef 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index 3ba3acf94bb..19f45303d27 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index 9f167d9b2eb..66e574b28b9 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index 91949261a79..dba9b50d901 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 2d0278b538e..697f5dbb512 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp index 2d42e3bb97a..794b4f4fc5c 100644 --- 
a/include/ginkgo/core/solver/triangular.hpp +++ b/include/ginkgo/core/solver/triangular.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index 58c2f137c8d..d09404d4a6a 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -7,6 +7,7 @@ #include +#include #include diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index d45eb1a68cd..1de9d298fa1 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -8,6 +8,7 @@ #include +#include #include #include From 58784ab8ce9df9ef9448deb59bea0bfe90e653cb Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 8 Feb 2023 23:46:01 +0100 Subject: [PATCH 20/62] enable half for testing --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c2284bcf3b..d77f1c99cbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) -option(GINKGO_ENABLE_HALF "Enable the half operation" OFF) +option(GINKGO_ENABLE_HALF "Enable the half operation" ON) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF) From 9b2465bccfee1f585d2c1dc192b7c15000939f67 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Thu, 9 Feb 2023 00:15:04 +0100 Subject: [PATCH 21/62] __habs is added in cuda10.2 create_empty for its own type --- cuda/base/types.hpp | 8 +++++++- hip/base/types.hip.hpp | 13 ++++++++++--- include/ginkgo/core/distributed/vector.hpp | 9 +++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index efcfe8f3d2d..1093242da30 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -100,8 +100,14 @@ namespace cuda { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - +#if CUDA_VERSION >= 10020 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } +#else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif __device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index fff60d6d798..4f31bafaac8 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -51,7 +51,7 @@ __device__ __forceinline__ thrust::complex sqrt( return thrust::sqrt(val); } -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 +#if GINKGO_HIP_PLATFORM_NVCC && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 __device__ __forceinline__ __half sqrt(__half val) { return sqrt(static_cast(val)); @@ -88,16 +88,23 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { -#if defined(__CUDA_ARCH__) +#if GINKGO_HIP_PLATFORM_NVCC // from the cuda_fp16.hpp -#if __CUDA_ARCH__ >= 530 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); } +#if CUDA_VERSION >= 10020 __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } #else +__device__ __forceinline__ __half abs(const __half& val) +{ + return abs(static_cast(val)); +} +#endif +#else __device__ __forceinline__ bool is_nan(const __half& val) { return is_nan(static_cast(val)); diff --git 
a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index 33128f8c8de..d71ccd78015 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -684,6 +684,15 @@ struct conversion_target_helper> { source->get_communicator()); } + // Allow to create_empty of the same type + // For distributed case, next> will be V in the candicated list. + // TODO: decide to whether to add this or add condition to the list + static std::unique_ptr create_empty(const target_type* source) + { + return target_type::create(source->get_executor(), + source->get_communicator()); + } + #if GINKGO_ENABLE_HALF using snd_source_type = experimental::distributed::Vector< previous_precision>>; From e2a6c9ae67a455ed92253d81f38767aa92f77ede Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 9 Feb 2023 22:38:16 +0100 Subject: [PATCH 22/62] fix nullptr and missing instantiation. sycl::half has different rule in conv and full operator after 5.7 --- core/test/base/extended_float.cpp | 30 ++++++++++++++++++++++++++++++ cuda/solver/common_trs_kernels.cuh | 10 +++++----- include/ginkgo/core/base/half.hpp | 11 +++++++++-- include/ginkgo/core/base/types.hpp | 29 +++++++++++++++++++++++++++-- 4 files changed, 71 insertions(+), 9 deletions(-) diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 6148c7c350a..563a1220925 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -110,7 +110,13 @@ TEST_F(FloatToHalf, ConvertsNan) { half x = create_from_bits("0" "11111111" "00000000000000000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1111111111")); + #endif } @@ -118,7 +124,13 @@ 
TEST_F(FloatToHalf, ConvertsNegNan) { half x = create_from_bits("1" "11111111" "00010000000000000000000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // Sycl put the 1000000000, but ours put mask + ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1111111111")); + #endif } @@ -166,7 +178,13 @@ TEST_F(FloatToHalf, TruncatesLargeNumber) { half x = create_from_bits("1" "10001110" "10010011111000010000100"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // TODO: sycl::half seems to did rounding, but ours just truncates + ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001010000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111")); + #endif } @@ -216,7 +234,13 @@ TEST_F(HalfToFloat, ConvertsNan) { float x = create_from_bits("0" "11111" "0001001000"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "11111111111111111111111")); + #endif } @@ -224,7 +248,13 @@ TEST_F(HalfToFloat, ConvertsNegNan) { float x = create_from_bits("1" "11111" "0000000001"); + #if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + // sycl keeps significand + ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000")); + #else ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "11111111111111111111111")); + #endif } diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 4ebd4c26e0a..362d22a653c 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ 
b/cuda/solver/common_trs_kernels.cuh @@ -213,14 +213,14 @@ struct CudaSolveStruct : gko::solver::SolveStruct { size_type work_size{}; // TODO: In nullptr is considered nullptr_t not casted to const - // ValueType* it works as expected now + // it does not work in cuda110/100 images sparselib::buffer_size_ext( handle, algorithm, SPARSELIB_OPERATION_NON_TRANSPOSE, SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - &work_size); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, &work_size); // allocate workspace work.resize_and_reset(work_size); @@ -230,8 +230,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { SPARSELIB_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, matrix->get_num_stored_elements(), one(), factor_descr, matrix->get_const_values(), matrix->get_const_row_ptrs(), - matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, - work.get_data()); + matrix->get_const_col_idxs(), (const ValueType*)(nullptr), num_rhs, + solve_info, policy, work.get_data()); } void solve(const matrix::Csr* matrix, diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 1ae96bd942d..1a8c1e1dfd1 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -322,7 +322,12 @@ struct precision_converter { } // namespace detail -#ifdef SYCL_LANGUAGE_VERSION +// sycl::half miss the arithmetic operator to result float not half before 5.7 +// (2022-06). It leads ? 
half : half/half ambiguous The same issue is reported +// in https://github.com/intel/llvm/issues/6028 +#if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) using half = sycl::half; #else /** @@ -629,7 +634,9 @@ class complex { value_type imag_; }; -#ifndef SYCL_LANGUAGE_VERSION +#if !(defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7))) template <> struct numeric_limits { static constexpr bool is_specialized{true}; diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 14e34529a3b..49888907410 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -143,7 +143,9 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; -#ifdef SYCL_LANGUAGE_VERSION +#if defined(SYCL_LANGUAGE_VERSION) && \ + (__LIBSYCL_MAJOR_VERSION > 5 || \ + (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) using half = sycl::half; #else class half; @@ -409,7 +411,8 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) -#if GINKGO_ENABLE_HALF +// cuda half operation is supported from arch 5.3 +#if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530) #define GKO_ADAPT_HF(_macro) template _macro #else #define GKO_ADAPT_HF(_macro) \ @@ -456,6 +459,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex)); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED @@ -489,9 +493,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + 
GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, half)); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; @@ -579,9 +585,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED @@ -648,6 +656,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -680,6 +691,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(double, float) GKO_NOT_IMPLEMENTED; \ template <> \ + _macro(half, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, half) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(float, half)); \ + GKO_ADAPT_HF(_macro(half, float)); \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + template <> \ + _macro(std::complex, std::complex) 
GKO_NOT_IMPLEMENTED; \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED @@ -687,9 +710,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(_macro(half, half)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED From 51cf597c53038ff82cb32093983f7c057b552ef9 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 23 Mar 2023 15:29:00 +0100 Subject: [PATCH 23/62] fix missing device_type and ptr_param --- common/cuda_hip/solver/multigrid_kernels.cpp | 4 ++-- include/ginkgo/core/distributed/matrix.hpp | 5 +++++ include/ginkgo/core/distributed/vector.hpp | 4 ++++ include/ginkgo/core/matrix/coo.hpp | 4 ++++ include/ginkgo/core/matrix/csr.hpp | 4 ++++ include/ginkgo/core/matrix/dense.hpp | 4 ++++ include/ginkgo/core/matrix/diagonal.hpp | 4 ++++ include/ginkgo/core/matrix/ell.hpp | 4 ++++ include/ginkgo/core/matrix/fbcsr.hpp | 4 ++++ include/ginkgo/core/matrix/hybrid.hpp | 4 ++++ include/ginkgo/core/matrix/sellp.hpp | 4 ++++ 11 files changed, 43 insertions(+), 2 deletions(-) diff --git a/common/cuda_hip/solver/multigrid_kernels.cpp b/common/cuda_hip/solver/multigrid_kernels.cpp index b9e411bd5f8..5ba11c03a82 100644 --- a/common/cuda_hip/solver/multigrid_kernels.cpp +++ b/common/cuda_hip/solver/multigrid_kernels.cpp @@ -191,8 +191,8 @@ void kcycle_check_stop(std::shared_ptr exec, kernel::kcycle_check_stop_kernel<<get_stream()>>>( nrhs, as_device_type(old_norm->get_const_values()), - as_device_type(new_norm->get_const_values()), rel_tol, - as_device_type(dis_stop.get_data())); + 
as_device_type(new_norm->get_const_values()), + as_device_type(rel_tol), as_device_type(dis_stop.get_data())); } is_stop = get_element(dis_stop, 0); } diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp index 1ae933d01fa..19a3b864d0c 100644 --- a/include/ginkgo/core/distributed/matrix.hpp +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -285,6 +285,11 @@ class Matrix #if GINKGO_ENABLE_HALF friend class Matrix>, LocalIndexType, GlobalIndexType>; + using ConvertibleTo< + Matrix>, local_index_type, + global_index_type>>::convert_to; + using ConvertibleTo>, + local_index_type, global_index_type>>::move_to; void convert_to( Matrix>, local_index_type, diff --git a/include/ginkgo/core/distributed/vector.hpp b/include/ginkgo/core/distributed/vector.hpp index d71ccd78015..12cfdcc96f2 100644 --- a/include/ginkgo/core/distributed/vector.hpp +++ b/include/ginkgo/core/distributed/vector.hpp @@ -168,6 +168,10 @@ class Vector #if GINKGO_ENABLE_HALF friend class Vector>>; + using ConvertibleTo< + Vector>>>::convert_to; + using ConvertibleTo< + Vector>>>::move_to; void convert_to(Vector>>* result) const override; diff --git a/include/ginkgo/core/matrix/coo.hpp b/include/ginkgo/core/matrix/coo.hpp index 1971e93cb46..89e94568f0f 100644 --- a/include/ginkgo/core/matrix/coo.hpp +++ b/include/ginkgo/core/matrix/coo.hpp @@ -94,6 +94,10 @@ class Coo : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Coo>, IndexType>; + using ConvertibleTo< + Coo>, IndexType>>::convert_to; + using ConvertibleTo< + Coo>, IndexType>>::move_to; void convert_to(Coo>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp index 40e94efe0a4..9c620f10ded 100644 --- a/include/ginkgo/core/matrix/csr.hpp +++ b/include/ginkgo/core/matrix/csr.hpp @@ -702,6 +702,10 @@ class Csr : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Csr>, IndexType>; + using ConvertibleTo< + Csr>, 
IndexType>>::convert_to; + using ConvertibleTo< + Csr>, IndexType>>::move_to; void convert_to(Csr>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 9033adbf766..232d92c7702 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -279,6 +279,10 @@ class Dense #if GINKGO_ENABLE_HALF friend class Dense>>; + using ConvertibleTo< + Dense>>>::convert_to; + using ConvertibleTo< + Dense>>>::move_to; void convert_to(Dense>>* result) const override; diff --git a/include/ginkgo/core/matrix/diagonal.hpp b/include/ginkgo/core/matrix/diagonal.hpp index 89424dab2bf..03a485f3ff3 100644 --- a/include/ginkgo/core/matrix/diagonal.hpp +++ b/include/ginkgo/core/matrix/diagonal.hpp @@ -86,6 +86,10 @@ class Diagonal #if GINKGO_ENABLE_HALF friend class Diagonal>>; + using ConvertibleTo< + Diagonal>>>::convert_to; + using ConvertibleTo< + Diagonal>>>::move_to; void convert_to(Diagonal>>* result) const override; diff --git a/include/ginkgo/core/matrix/ell.hpp b/include/ginkgo/core/matrix/ell.hpp index ca9e61ef4e8..8a1512e3f51 100644 --- a/include/ginkgo/core/matrix/ell.hpp +++ b/include/ginkgo/core/matrix/ell.hpp @@ -95,6 +95,10 @@ class Ell : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Ell>, IndexType>; + using ConvertibleTo< + Ell>, IndexType>>::convert_to; + using ConvertibleTo< + Ell>, IndexType>>::move_to; void convert_to(Ell>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/fbcsr.hpp b/include/ginkgo/core/matrix/fbcsr.hpp index 20d4f78fa62..ae11dec6843 100644 --- a/include/ginkgo/core/matrix/fbcsr.hpp +++ b/include/ginkgo/core/matrix/fbcsr.hpp @@ -159,6 +159,10 @@ class Fbcsr : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Fbcsr>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Fbcsr>, IndexType>>::move_to; void convert_to(Fbcsr>, IndexType>* result) const override; diff 
--git a/include/ginkgo/core/matrix/hybrid.hpp b/include/ginkgo/core/matrix/hybrid.hpp index bba5d18f1da..8432e856319 100644 --- a/include/ginkgo/core/matrix/hybrid.hpp +++ b/include/ginkgo/core/matrix/hybrid.hpp @@ -369,6 +369,10 @@ class Hybrid #if GINKGO_ENABLE_HALF friend class Hybrid>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Hybrid>, IndexType>>::move_to; void convert_to(Hybrid>, IndexType>* result) const override; diff --git a/include/ginkgo/core/matrix/sellp.hpp b/include/ginkgo/core/matrix/sellp.hpp index 84786538cc1..529df8e9f25 100644 --- a/include/ginkgo/core/matrix/sellp.hpp +++ b/include/ginkgo/core/matrix/sellp.hpp @@ -86,6 +86,10 @@ class Sellp : public EnableLinOp>, #if GINKGO_ENABLE_HALF friend class Sellp>, IndexType>; + using ConvertibleTo>, + IndexType>>::convert_to; + using ConvertibleTo< + Sellp>, IndexType>>::move_to; void convert_to(Sellp>, IndexType>* result) const override; From 0a427965859419df379f7d694c4ccd20dcc61a6b Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Sat, 25 Mar 2023 09:30:48 +0100 Subject: [PATCH 24/62] update rounding --- core/test/base/extended_float.cpp | 25 ++++++++++++++----------- include/ginkgo/core/base/half.hpp | 17 +++++++++++++++-- test/components/fill_array_kernels.cpp | 7 ++++++- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 563a1220925..ec8740fc15f 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -174,18 +174,21 @@ TEST_F(FloatToHalf, TruncatesSmallNumber) } -TEST_F(FloatToHalf, TruncatesLargeNumber) +TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven) { - half x = create_from_bits("1" "10001110" "10010011111000010000100"); - - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) - // TODO: sycl::half seems to did rounding, but ours just truncates - ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001010000")); - #else - ASSERT_EQ(get_bits(x), get_bits("1" "11110" "1001001111")); - #endif - + half neg_x = create_from_bits("1" "10001110" "10010011111000010000100"); + half neg_x2 = create_from_bits("1" "10001110" "10010011101000010000100"); + half x = create_from_bits("0" "10001110" "10010011111000010000100"); + half x2 = create_from_bits("0" "10001110" "10010011101000010000100"); + half x3 = create_from_bits("0" "10001110" "10010011101000000000000"); + half x4 = create_from_bits("0" "10001110" "10010011111000000000000"); + + EXPECT_EQ(get_bits(x), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(x2), get_bits("0" "11110" "1001001111")); + EXPECT_EQ(get_bits(x3), get_bits("0" "11110" "1001001110")); + EXPECT_EQ(get_bits(x4), get_bits("0" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x), get_bits("1" "11110" "1001010000")); + EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111")); } diff --git a/include/ginkgo/core/base/half.hpp 
b/include/ginkgo/core/base/half.hpp index 1a8c1e1dfd1..446d085754d 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -462,8 +462,21 @@ class half { // TODO: handle denormals return conv::shift_sign(data_); } else { - return conv::shift_sign(data_) | exp | - conv::shift_significand(data_); + // Rounding to even + const auto result = conv::shift_sign(data_) | exp | + conv::shift_significand(data_); + // return result + ((result & 1) && + // ((data_ >> (f32_traits::significand_bits - + // f16_traits::significand_bits - 1)) & + // 1)); + const auto tail = + data_ & static_cast( + (1 << conv::significand_offset) - 1); + + constexpr auto half = static_cast( + 1 << (conv::significand_offset - 1)); + return result + + (tail > half || ((tail == half) && (result & 1))); } } } diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index 3d494b3f5f0..ed190b80fbc 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -21,7 +21,7 @@ class FillArray : public CommonTestFixture { protected: using value_type = T; FillArray() - : total_size(63531), + : total_size(3000), vals{ref, total_size}, dvals{exec, total_size}, seqs{ref, total_size} @@ -56,5 +56,10 @@ TYPED_TEST(FillArray, FillSeqEqualsReference) gko::kernels::GKO_DEVICE_NAMESPACE::components::fill_seq_array( this->exec, this->dvals.get_data(), this->total_size); + this->dvals.set_executor(this->ref); + for (gko::size_type i = 2000; i < this->total_size; i++) { + std::cout << i << " " << this->seqs.get_data()[i] << " device " + << this->dvals.get_data()[i] << std::endl; + } GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); } From e3b81dfcb964a914a0f7cfd522ac4b6fbbedf237 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Mon, 27 Mar 2023 12:26:17 +0200 Subject: [PATCH 25/62] do not use distribution with half --- core/test/solver/gmres.cpp | 7 +++-- core/test/solver/multigrid.cpp | 4 +-- core/test/utils.hpp | 18 +++++++++++- core/test/utils/array_generator_test.cpp | 2 +- core/test/utils/fb_matrix_generator.hpp | 13 ++++----- core/test/utils/fb_matrix_generator_test.cpp | 4 +-- core/test/utils/matrix_generator_test.cpp | 16 +++++------ core/test/utils/matrix_utils_test.cpp | 4 +-- include/ginkgo/core/preconditioner/ilu.hpp | 3 +- .../test/factorization/par_ilut_kernels.cpp | 25 +++++++++++------ reference/test/matrix/dense_kernels.cpp | 3 +- reference/test/matrix/fbcsr_kernels.cpp | 3 +- reference/test/matrix/fft_kernels.cpp | 3 +- reference/test/solver/direct.cpp | 2 +- reference/test/solver/multigrid_kernels.cpp | 2 +- test/base/device_matrix_data_kernels.cpp | 2 +- test/factorization/par_ic_kernels.cpp | 2 +- test/factorization/par_ict_kernels.cpp | 8 ++---- test/factorization/par_ilu_kernels.cpp | 2 +- test/factorization/par_ilut_kernels.cpp | 28 ++++++------------- test/matrix/fbcsr_kernels.cpp | 7 +++-- test/solver/direct.cpp | 4 +-- 22 files changed, 85 insertions(+), 77 deletions(-) diff --git a/core/test/solver/gmres.cpp b/core/test/solver/gmres.cpp index 5d9c9e3c40e..f69b1f78311 100644 --- a/core/test/solver/gmres.cpp +++ b/core/test/solver/gmres.cpp @@ -27,8 +27,8 @@ class Gmres : public ::testing::Test { using Solver = gko::solver::Gmres; using Big_solver = gko::solver::Gmres; - static constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + // half does not have constexpr constructor + static const gko::remove_complex reduction_factor; Gmres() : exec(gko::ReferenceExecutor::create()), @@ -61,7 +61,8 @@ class Gmres : public ::testing::Test { }; template -constexpr gko::remove_complex Gmres::reduction_factor; +const gko::remove_complex Gmres::reduction_factor = + gko::remove_complex(1e-6); TYPED_TEST_SUITE(Gmres, gko::test::ValueTypes, 
TypenameNameGenerator); diff --git a/core/test/solver/multigrid.cpp b/core/test/solver/multigrid.cpp index 8cb545f6cb2..85be8402243 100644 --- a/core/test/solver/multigrid.cpp +++ b/core/test/solver/multigrid.cpp @@ -75,9 +75,7 @@ class DummyLinOpWithFactory std::make_shared(this->get_executor(), gko::dim<2>{n_, n_ - 1}), gko::share(gko::test::generate_random_dense_matrix( - n_ - 1, n_ - 1, - std::uniform_real_distribution>( - 0, 1), + n_ - 1, n_ - 1, std::uniform_real_distribution<>(0, 1), std::default_random_engine{}, factory->get_executor())), std::make_shared(this->get_executor(), gko::dim<2>{n_ - 1, n_})); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 3b6dd13444d..279c5a59099 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -322,6 +322,13 @@ using add_inner_wrapper_t = using RealValueTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types; +#else + ::testing::Types; +#endif + +using RealValueTypesNoHalf = #if GINKGO_DPCPP_SINGLE_MODE ::testing::Types; #else @@ -330,6 +337,9 @@ using RealValueTypes = using ComplexValueTypes = add_inner_wrapper_t; +using ComplexValueTypesNoHalf = + add_inner_wrapper_t; + using ValueTypes = merge_type_list_t; using IndexTypes = ::testing::Types; @@ -359,7 +369,8 @@ using TwoValueIndexType = add_to_cartesian_type_product_t< IndexTypes>; using ValueLocalGlobalIndexTypes = - add_to_cartesian_type_product_left_t; + add_to_cartesian_type_product_left_t; template @@ -462,6 +473,11 @@ namespace detail { template struct next_precision_impl {}; +template <> +struct next_precision_impl { + using type = gko::half; +}; + template <> struct next_precision_impl { using type = double; diff --git a/core/test/utils/array_generator_test.cpp b/core/test/utils/array_generator_test.cpp index ae66e4686da..a9b370895c5 100644 --- a/core/test/utils/array_generator_test.cpp +++ b/core/test/utils/array_generator_test.cpp @@ -22,7 +22,7 @@ class ArrayGenerator : public ::testing::Test { ArrayGenerator() : 
exec(gko::ReferenceExecutor::create()) { array = gko::test::generate_random_array( - 500, std::normal_distribution>(20.0, 5.0), + 500, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec); } diff --git a/core/test/utils/fb_matrix_generator.hpp b/core/test/utils/fb_matrix_generator.hpp index 034dd95fce1..786f836e10a 100644 --- a/core/test/utils/fb_matrix_generator.hpp +++ b/core/test/utils/fb_matrix_generator.hpp @@ -131,16 +131,15 @@ std::unique_ptr> generate_fbcsr_from_csr( const IndexType* const row_ptrs = fmtx->get_const_row_ptrs(); const IndexType* const col_idxs = fmtx->get_const_col_idxs(); ValueType* const vals = fmtx->get_values(); - std::uniform_real_distribution> - off_diag_dist(-1.0, 1.0); + std::uniform_real_distribution<> off_diag_dist(-1.0, 1.0); for (IndexType ibrow = 0; ibrow < nbrows; ibrow++) { if (row_diag_dominant) { const IndexType nrownz = (row_ptrs[ibrow + 1] - row_ptrs[ibrow]) * block_size; - std::uniform_real_distribution> - diag_dist(1.01 * nrownz, 2 * nrownz); + std::uniform_real_distribution<> diag_dist(1.01 * nrownz, + 2 * nrownz); for (IndexType ibz = row_ptrs[ibrow]; ibz < row_ptrs[ibrow + 1]; ibz++) { @@ -205,13 +204,11 @@ std::unique_ptr> generate_random_fbcsr( matrix::Csr>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref) + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref) : generate_random_matrix>( nbrows, nbcols, std::uniform_int_distribution(0, nbcols - 1), - std::normal_distribution(0.0, 1.0), - std::move(engine), ref); + std::normal_distribution<>(0.0, 1.0), std::move(engine), ref); if (unsort && rand_csr_ref->is_sorted_by_column_index()) { unsort_matrix(rand_csr_ref, engine); } diff --git a/core/test/utils/fb_matrix_generator_test.cpp b/core/test/utils/fb_matrix_generator_test.cpp index ccbb0aa477f..4db61573034 100644 --- a/core/test/utils/fb_matrix_generator_test.cpp +++ 
b/core/test/utils/fb_matrix_generator_test.cpp @@ -28,8 +28,8 @@ class BlockMatrixGenerator : public ::testing::Test { : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix< gko::matrix::Csr>( - nbrows, nbcols, std::normal_distribution(10, 5), - std::normal_distribution(20.0, 5.0), + nbrows, nbcols, std::normal_distribution<>(10, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), rbmtx(gko::test::generate_fbcsr_from_csr( exec, mtx.get(), blk_sz, false, std::default_random_engine(42))), diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 43756bc1709..b2cbec8967d 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -26,25 +26,25 @@ class MatrixGenerator : public ::testing::Test { MatrixGenerator() : exec(gko::ReferenceExecutor::create()), mtx(gko::test::generate_random_matrix( - 500, 100, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), dense_mtx(gko::test::generate_random_dense_matrix( - 500, 100, std::normal_distribution(20.0, 5.0), + 500, 100, std::normal_distribution<>(20.0, 5.0), std::default_random_engine(41), exec)), l_mtx(gko::test::generate_random_lower_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), u_mtx(gko::test::generate_random_upper_triangular_matrix( - 4, true, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 4, true, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), lower_bandwidth(2), upper_bandwidth(3), band_mtx(gko::test::generate_random_band_matrix( 100, 
lower_bandwidth, upper_bandwidth, - std::normal_distribution(20.0, 5.0), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42), exec)), nnz_per_row_sample(500, 0), values_sample(0), diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 3c67571e1b2..778c0795ac7 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -30,8 +30,8 @@ class MatrixUtils : public ::testing::Test { MatrixUtils() : exec(gko::ReferenceExecutor::create()), data(gko::test::generate_random_matrix_data( - 500, 500, std::normal_distribution(50, 5), - std::normal_distribution(20.0, 5.0), + 500, 500, std::normal_distribution<>(50, 5), + std::normal_distribution<>(20.0, 5.0), std::default_random_engine(42))), rectangular_data(gko::dim<2>(500, 100)) {} diff --git a/include/ginkgo/core/preconditioner/ilu.hpp b/include/ginkgo/core/preconditioner/ilu.hpp index 1f4be3e3046..98aa3ce70c1 100644 --- a/include/ginkgo/core/preconditioner/ilu.hpp +++ b/include/ginkgo/core/preconditioner/ilu.hpp @@ -498,7 +498,8 @@ class Ilu : public EnableLinOp< generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + // half can not use constexpr constructor + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/reference/test/factorization/par_ilut_kernels.cpp b/reference/test/factorization/par_ilut_kernels.cpp index 59805f246f8..a605ed678ae 100644 --- a/reference/test/factorization/par_ilut_kernels.cpp +++ b/reference/test/factorization/par_ilut_kernels.cpp @@ -54,6 +54,7 @@ class ParIlut : public ::testing::Test { using ComplexCsr = gko::matrix::Csr>, index_type>; + using complex_value_type = std::complex>; ParIlut() : ref(gko::ReferenceExecutor::create()), @@ -75,16 +76,24 @@ class ParIlut : public ::testing::Test { {0., -3., 0., 1.}}, ref)), 
mtx1_complex(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{-1., .1}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {-1., .1}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., .1}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{-1., .1}, complex_value_type{.1, 2.}}}, ref)), mtx1_expect_complex_thrm(gko::initialize( - {{{.1, 0.}, {0., 0.}, {0., 0.}, {0., 0.}}, - {{0., 0.}, {.1, -1.}, {0., 0.}, {0., 0.}}, - {{-1., 1.}, {-2., .2}, {-1., -.3}, {0., 0.}}, - {{1., -2.}, {-3., -.1}, {0., 0.}, {.1, 2.}}}, + {{complex_value_type{.1, 0.}, complex_value_type{0., 0.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{0., 0.}, complex_value_type{.1, -1.}, + complex_value_type{0., 0.}, complex_value_type{0., 0.}}, + {complex_value_type{-1., 1.}, complex_value_type{-2., .2}, + complex_value_type{-1., -.3}, complex_value_type{0., 0.}}, + {complex_value_type{1., -2.}, complex_value_type{-3., -.1}, + complex_value_type{0., 0.}, complex_value_type{.1, 2.}}}, ref)), identity(gko::initialize( {{1., 0., 0.}, {0., 1., 0.}, {0., 0., 1.}}, ref)), diff --git a/reference/test/matrix/dense_kernels.cpp b/reference/test/matrix/dense_kernels.cpp index c7e26589dd7..9fe0e91a670 100644 --- a/reference/test/matrix/dense_kernels.cpp +++ b/reference/test/matrix/dense_kernels.cpp @@ -75,8 +75,7 @@ class Dense : public ::testing::Test { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(num_cols, num_cols), - std::normal_distribution>(0.0, 1.0), - rand_engine, exec); + std::normal_distribution<>(0.0, 1.0), 
rand_engine, exec); } }; diff --git a/reference/test/matrix/fbcsr_kernels.cpp b/reference/test/matrix/fbcsr_kernels.cpp index 4415216fd01..d88e6334dfc 100644 --- a/reference/test/matrix/fbcsr_kernels.cpp +++ b/reference/test/matrix/fbcsr_kernels.cpp @@ -114,7 +114,8 @@ std::unique_ptr> get_some_vectors( { using RT = gko::remove_complex; std::default_random_engine engine(39); - std::normal_distribution dist(0.0, 5.0); + std::normal_distribution::type> dist( + 0.0, 5.0); std::uniform_int_distribution<> nnzdist(1, nrhs); return gko::test::generate_random_matrix>( nrows, nrhs, nnzdist, dist, engine, exec); diff --git a/reference/test/matrix/fft_kernels.cpp b/reference/test/matrix/fft_kernels.cpp index 12c2521b71c..9a2ccb80322 100644 --- a/reference/test/matrix/fft_kernels.cpp +++ b/reference/test/matrix/fft_kernels.cpp @@ -148,7 +148,8 @@ class Fft : public ::testing::Test { std::unique_ptr dense_ifft3; }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, ThrowsOnNonPowerOfTwo1D) diff --git a/reference/test/solver/direct.cpp b/reference/test/solver/direct.cpp index 1fb147a7a2b..d06948e631e 100644 --- a/reference/test/solver/direct.cpp +++ b/reference/test/solver/direct.cpp @@ -49,7 +49,7 @@ class Direct : public ::testing::Test { symmetric)) .on(exec); solver = factory->generate(mtx); - std::normal_distribution> dist(0, 1); + std::normal_distribution<> dist(0, 1); x = gko::test::generate_random_dense_matrix( mtx->get_size()[0], nrhs, dist, rng, this->exec); x_ref = x->clone(); diff --git a/reference/test/solver/multigrid_kernels.cpp b/reference/test/solver/multigrid_kernels.cpp index d3083d5819f..115bf5dfea7 100644 --- a/reference/test/solver/multigrid_kernels.cpp +++ b/reference/test/solver/multigrid_kernels.cpp @@ -154,7 +154,7 @@ class DummyLinOpWithFactory { auto alpha_value = gko::as>(alpha)->at(0, 0); - gko::remove_complex scale = 
std::real(alpha_value); + gko::remove_complex scale = gko::real(alpha_value); global_step *= static_cast(scale); step.push_back(global_step); global_step++; diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index ffadbcfb245..5783274f211 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -35,7 +35,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution> + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index de2342a28db..08bca8be499 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -41,7 +41,7 @@ class ParIc : public CommonTestFixture { mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(0, 10.0), + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 3b33e52630c..62949c243c2 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -47,15 +47,11 @@ class ParIct : public CommonTestFixture { mtx = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + 
std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 88f5ecff0d9..508cd911f56 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -59,7 +59,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution>(0.0, 1.0), + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index dff3cc702c1..4c4167a5efa 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -48,39 +48,27 @@ class ParIlut : public CommonTestFixture { mtx1 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(10, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx2 = gko::test::generate_random_matrix( mtx_size[0], mtx_size[1], std::uniform_int_distribution(0, mtx_size[1]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_square = gko::test::generate_random_matrix( mtx_size[0], mtx_size[0], std::uniform_int_distribution(1, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_l2 = gko::test::generate_random_lower_triangular_matrix( mtx_size[0], true, std::uniform_int_distribution(1, mtx_size[0]), - 
std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); mtx_u = gko::test::generate_random_upper_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); dmtx1 = gko::clone(exec, mtx1); dmtx2 = gko::clone(exec, mtx2); @@ -134,7 +122,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto size = index_type(mtx->get_num_stored_elements()); using ValueType = typename Mtx::value_type; @@ -189,7 +177,7 @@ class ParIlut : public CommonTestFixture { const std::unique_ptr& dmtx, index_type rank) { double tolerance = - gko::is_complex() ? r::value : 0.0; + gko::is_complex() ? double(r::value) : 0.0; auto res = Mtx::create(ref, mtx_size); auto dres = Mtx::create(exec, mtx_size); auto res_coo = Coo::create(ref, mtx_size); diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index 8cff04c28a0..e987bde2ccb 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -37,7 +37,7 @@ class Fbcsr : public CommonTestFixture { std::unique_ptr rsorted; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -51,7 +51,10 @@ class Fbcsr : public CommonTestFixture { for (index_type i = 0; i < x->get_size()[0] * x->get_size()[1]; i++) { xarr[i] = static_cast(2.0) * - std::sin(static_cast(i / 2.0) + get_random_value()); + static_cast( + std::sin(static_cast< + typename gko::detail::arth_type::type>( + static_cast(i / 2.0) + get_random_value()))); } } }; diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index da77682bcdd..431a7c40d59 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp 
@@ -51,9 +51,7 @@ class Direct : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution<>(num_cols, num_cols), - std::normal_distribution>(-1.0, - 1.0), - rand_engine, ref); + std::normal_distribution<>(-1.0, 1.0), rand_engine, ref); } void initialize_data(const char* mtx_filename, int nrhs) From c9fd747762f2d5aaae09c32a2a3c05bf286a0710 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Mon, 27 Mar 2023 23:10:12 +0200 Subject: [PATCH 26/62] WIP fix half of failed test --- core/base/mtx_io.cpp | 4 ++ core/test/base/extended_float.cpp | 18 +++++- core/test/log/stream.cpp | 70 +++++++++++------------ core/test/solver/gcr.cpp | 6 +- core/test/utils.hpp | 11 +++- core/test/utils/matrix_generator_test.cpp | 12 ++-- core/test/utils/matrix_utils_test.cpp | 4 +- cuda/base/types.hpp | 4 +- include/ginkgo/core/base/half.hpp | 24 +++++--- include/ginkgo/core/base/math.hpp | 3 +- reference/test/matrix/coo_kernels.cpp | 2 + reference/test/solver/bicg_kernels.cpp | 4 ++ reference/test/solver/cg_kernels.cpp | 6 ++ reference/test/solver/fcg_kernels.cpp | 6 ++ reference/test/solver/gmres_kernels.cpp | 7 +++ test/components/reduce_array_kernels.cpp | 4 +- test/matrix/fbcsr_kernels.cpp | 12 ++++ test/matrix/fft_kernels.cpp | 2 +- 18 files changed, 139 insertions(+), 60 deletions(-) diff --git a/core/base/mtx_io.cpp b/core/base/mtx_io.cpp index 30c6de08b75..d84b97a213b 100644 --- a/core/base/mtx_io.cpp +++ b/core/base/mtx_io.cpp @@ -889,12 +889,16 @@ matrix_data read_binary_raw(std::istream& is) } DECLARE_OVERLOAD(double, int32) DECLARE_OVERLOAD(float, int32) + DECLARE_OVERLOAD(half, int32) DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(std::complex, int32) + DECLARE_OVERLOAD(std::complex, int32) DECLARE_OVERLOAD(double, int64) DECLARE_OVERLOAD(float, int64) + DECLARE_OVERLOAD(half, int64) DECLARE_OVERLOAD(std::complex, int64) DECLARE_OVERLOAD(std::complex, int64) + DECLARE_OVERLOAD(std::complex, int64) #undef 
DECLARE_OVERLOAD else { diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index ec8740fc15f..bd9ff9afe81 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -8,7 +8,8 @@ #include #include - +#include +#include "ginkgo/core/base/math.hpp" namespace { @@ -192,6 +193,21 @@ TEST_F(FloatToHalf, TruncatesLargeNumberRoundToEven) } +TEST_F(FloatToHalf, Convert) +{ + float rho = 86.25; + float beta = 1110; + auto float_res = rho/beta; + gko::half rho_h = rho; + gko::half beta_h = beta; + auto half_res = rho_h/beta_h; + std::cout << float_res << std::endl; + std::cout << float(half_res) << std::endl; + + std::complex cpx{100.0, 0.0}; + std::cout << float(gko::squared_norm(cpx)) << std::endl; +} + // clang-format on diff --git a/core/test/log/stream.cpp b/core/test/log/stream.cpp index 995a9975b89..1ad02f7daf1 100644 --- a/core/test/log/stream.cpp +++ b/core/test/log/stream.cpp @@ -380,17 +380,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on(A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -429,17 +429,17 @@ TYPED_TEST(Stream, CatchesLinOpApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto b = gko::initialize({-2.2}, exec); - auto x = 
gko::initialize({3.3}, exec); + auto A = gko::initialize({1.5}, exec); + auto b = gko::initialize({-2.25}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), b.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -486,21 +486,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyStartedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_started_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -547,21 +547,21 @@ TYPED_TEST(Stream, CatchesLinOpAdvancedApplyCompletedWithVerbose) std::stringstream out; auto logger = gko::log::Stream::create( gko::log::Logger::linop_advanced_apply_completed_mask, out, true); - auto A = gko::initialize({1.1}, exec); - auto alpha = gko::initialize({-4.4}, exec); - auto b = gko::initialize({-2.2}, exec); + auto A = gko::initialize({1.5}, exec); + auto alpha = gko::initialize({-4.75}, exec); + auto b = gko::initialize({-2.25}, exec); auto beta = 
gko::initialize({-5.5}, exec); - auto x = gko::initialize({3.3}, exec); + auto x = gko::initialize({3.125}, exec); logger->template on( A.get(), alpha.get(), b.get(), beta.get(), x.get()); auto os = out.str(); - GKO_ASSERT_STR_CONTAINS(os, "1.1"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); + GKO_ASSERT_STR_CONTAINS(os, "1.5"); + GKO_ASSERT_STR_CONTAINS(os, "-4.75"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "3.3"); + GKO_ASSERT_STR_CONTAINS(os, "3.125"); } @@ -782,11 +782,11 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) gko::solver::Bicgstab::build() .with_criteria(gko::stop::Iteration::build().with_max_iters(3u)) .on(exec); - auto solver = factory->generate(gko::initialize({1.1}, exec)); + auto solver = factory->generate(gko::initialize({1.25}, exec)); auto right_hand_side = gko::initialize({-5.5}, exec); - auto residual = gko::initialize({-4.4}, exec); - auto solution = gko::initialize({-2.2}, exec); - auto residual_norm = gko::initialize({-3.3}, exec); + auto residual = gko::initialize({-4.5}, exec); + auto solution = gko::initialize({-2.25}, exec); + auto residual_norm = gko::initialize({-3.125}, exec); gko::array stop_status(exec, 1); logger->template on( @@ -795,9 +795,9 @@ TYPED_TEST(Stream, CatchesIterationsWithVerbose) auto os = out.str(); GKO_ASSERT_STR_CONTAINS(os, "-5.5"); - GKO_ASSERT_STR_CONTAINS(os, "-4.4"); - GKO_ASSERT_STR_CONTAINS(os, "-2.2"); - GKO_ASSERT_STR_CONTAINS(os, "-3.3"); + GKO_ASSERT_STR_CONTAINS(os, "-4.5"); + GKO_ASSERT_STR_CONTAINS(os, "-2.25"); + GKO_ASSERT_STR_CONTAINS(os, "-3.125"); GKO_ASSERT_STR_CONTAINS(os, "Finalized:") } diff --git a/core/test/solver/gcr.cpp b/core/test/solver/gcr.cpp index 2d7b5ea7974..b5f0e014737 100644 --- a/core/test/solver/gcr.cpp +++ b/core/test/solver/gcr.cpp @@ -27,8 +27,7 @@ class Gcr : public ::testing::Test { using Solver = gko::solver::Gcr; using Big_solver = gko::solver::Gcr; - static 
constexpr gko::remove_complex reduction_factor = - gko::remove_complex(1e-6); + static const gko::remove_complex reduction_factor; Gcr() : exec(gko::ReferenceExecutor::create()), @@ -71,7 +70,8 @@ class Gcr : public ::testing::Test { }; template -constexpr gko::remove_complex Gcr::reduction_factor; +const gko::remove_complex Gcr::reduction_factor = + gko::remove_complex(1e-6); TYPED_TEST_SUITE(Gcr, gko::test::ValueTypes, TypenameNameGenerator); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 279c5a59099..7f6543bf07b 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -475,7 +475,7 @@ struct next_precision_impl {}; template <> struct next_precision_impl { - using type = gko::half; + using type = float; }; template <> @@ -501,4 +501,13 @@ template using next_precision = typename detail::next_precision_impl::type; +#define SKIP_IF_HALF(type) \ + if (std::is_same, gko::half>::value) { \ + GTEST_SKIP() << "Skip due to single mode"; \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + #endif // GKO_CORE_TEST_UTILS_HPP_ diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index b2cbec8967d..2b3ce7fa501 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -20,6 +20,8 @@ template class MatrixGenerator : public ::testing::Test { protected: using value_type = T; + using check_type = + typename gko::detail::arth_type>::type; using real_type = gko::remove_complex; using mtx_type = gko::matrix::Dense; @@ -96,15 +98,15 @@ class MatrixGenerator : public ::testing::Test { template - ValueType get_nth_moment(int n, ValueType c, InputIterator sample_start, - InputIterator sample_end, Closure closure_op) + check_type get_nth_moment(int n, ValueType c, InputIterator sample_start, + InputIterator sample_end, Closure closure_op) { using std::pow; - ValueType res = 0; - ValueType num_elems = 0; + 
check_type res = 0; + check_type num_elems = 0; while (sample_start != sample_end) { auto tmp = *(sample_start++); - res += pow(closure_op(tmp) - c, n); + res += pow(check_type{closure_op(tmp)} - check_type{c}, n); num_elems += 1; } return res / num_elems; diff --git a/core/test/utils/matrix_utils_test.cpp b/core/test/utils/matrix_utils_test.cpp index 778c0795ac7..6840021e9e5 100644 --- a/core/test/utils/matrix_utils_test.cpp +++ b/core/test/utils/matrix_utils_test.cpp @@ -241,7 +241,7 @@ TYPED_TEST(MatrixUtils, MakeHpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeHpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.002; auto cpy_data = this->data; gko::utils::make_hpd(this->data, ratio); @@ -276,7 +276,7 @@ TYPED_TEST(MatrixUtils, MakeSpdMatrixCorrectly) TYPED_TEST(MatrixUtils, MakeSpdMatrixWithRatioCorrectly) { using T = typename TestFixture::value_type; - gko::remove_complex ratio = 1.00001; + gko::remove_complex ratio = 1.002; auto cpy_data = this->data; gko::utils::make_spd(this->data, ratio); diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 1093242da30..c45f940815f 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -40,7 +40,7 @@ namespace thrust { template <> GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) { - return hypot(z.real(), z.imag()); + return abs(static_cast>(z)); } @@ -51,7 +51,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - return thrust::complex{lhs} + thrust::complex(rhs); \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 446d085754d..6b21646c66c 100644 --- a/include/ginkgo/core/base/half.hpp +++ 
b/include/ginkgo/core/base/half.hpp @@ -610,18 +610,28 @@ class complex { template complex& operator*=(const complex& val) { - auto tmp = real_; - real_ = real_ * val.real() - imag_ * val.imag(); - imag_ = tmp * val.imag() + imag_ * val.real(); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f *= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); + // auto tmp = real_; + // real_ = real_ * val.real() - imag_ * val.imag(); + // imag_ = tmp * val.imag() + imag_ * val.real(); return *this; } template complex& operator/=(const complex& val) { - auto real = val.real(); - auto imag = val.imag(); - (*this) *= complex{val.real(), -val.imag()}; - (*this) /= (real * real + imag * imag); + // auto real = val.real(); + // auto imag = val.imag(); + // (*this) *= complex{val.real(), -val.imag()}; + // (*this) /= (real * real + imag * imag); + auto val_f = static_cast>(val); + auto result_f = static_cast>(*this); + result_f /= val_f; + real_ = result_f.real(); + imag_ = result_f.imag(); return *this; } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index d2e60ea677d..c4cc82acc08 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -44,7 +44,8 @@ inline gko::half abs(gko::half a) { return gko::half((a > 0) ? 
a : -a); } inline gko::half abs(std::complex a) { - return gko::half(sqrt(float(a.real() * a.real() + a.imag() * a.imag()))); + // Using float abs not sqrt on norm to avoid overflow + return gko::half(abs(std::complex(a))); } diff --git a/reference/test/matrix/coo_kernels.cpp b/reference/test/matrix/coo_kernels.cpp index b434c7064c5..c7d96e56a72 100644 --- a/reference/test/matrix/coo_kernels.cpp +++ b/reference/test/matrix/coo_kernels.cpp @@ -117,6 +117,8 @@ TYPED_TEST(Coo, MovesToPrecision) this->mtx->move_to(tmp); tmp->move_to(res); + // TODO: When use move_to to the different precision, it will keep the + // original data GKO_ASSERT_MTX_NEAR(this->mtx, res, residual); } diff --git a/reference/test/solver/bicg_kernels.cpp b/reference/test/solver/bicg_kernels.cpp index 69bea370b90..5955e9ceee9 100644 --- a/reference/test/solver/bicg_kernels.cpp +++ b/reference/test/solver/bicg_kernels.cpp @@ -448,6 +448,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -465,6 +466,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -482,6 +484,7 @@ TYPED_TEST(Bicg, SolvesBigDenseSystemImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -513,6 +516,7 @@ TYPED_TEST(Bicg, 
SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->bicg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/cg_kernels.cpp b/reference/test/solver/cg_kernels.cpp index 1419170cb61..f01649863e7 100644 --- a/reference/test/solver/cg_kernels.cpp +++ b/reference/test/solver/cg_kernels.cpp @@ -408,6 +408,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -425,6 +426,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -442,6 +444,7 @@ TYPED_TEST(Cg, SolvesBigDenseSystem3) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -459,6 +462,7 @@ TYPED_TEST(Cg, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -527,6 +531,7 @@ TYPED_TEST(Cg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using 
value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -544,6 +549,7 @@ TYPED_TEST(Cg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/fcg_kernels.cpp b/reference/test/solver/fcg_kernels.cpp index e8ac009b9dd..5d219c43179 100644 --- a/reference/test/solver/fcg_kernels.cpp +++ b/reference/test/solver/fcg_kernels.cpp @@ -422,6 +422,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -439,6 +440,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -456,6 +458,7 @@ TYPED_TEST(Fcg, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {886630.5, -172578.0, 684522.0, -65310.5, 455487.5, 607436.0}, @@ -473,6 +476,7 @@ TYPED_TEST(Fcg, SolvesMultipleBigDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); 
auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -541,6 +545,7 @@ TYPED_TEST(Fcg, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -558,6 +563,7 @@ TYPED_TEST(Fcg, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->fcg_factory_big->generate(this->mtx_big); auto b = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 588b225c658..7d30109af1b 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ b/reference/test/solver/gmres_kernels.cpp @@ -618,6 +618,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -635,6 +636,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -652,6 +654,7 @@ TYPED_TEST(Gmres, SolveWithImplicitResNormCritIsDisabled) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = 
this->gmres_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -666,6 +669,7 @@ TYPED_TEST(Gmres, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -759,6 +763,7 @@ TYPED_TEST(Gmres, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); for (auto ortho : {ortho_method::mgs, ortho_method::cgs, ortho_method::cgs2}) { SCOPED_TRACE(ortho); @@ -792,6 +797,7 @@ TYPED_TEST(Gmres, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -809,6 +815,7 @@ TYPED_TEST(Gmres, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gmres_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/test/components/reduce_array_kernels.cpp b/test/components/reduce_array_kernels.cpp index b7407801a32..ef3a825039a 100644 --- a/test/components/reduce_array_kernels.cpp +++ b/test/components/reduce_array_kernels.cpp @@ -21,13 +21,13 @@ class ReduceArray : public CommonTestFixture { protected: using value_type = T; ReduceArray() - : total_size(6355), + : total_size(1024), out{ref, I{2}}, dout{exec, out}, vals{ref, total_size}, dvals{exec} { - std::fill_n(vals.get_data(), 
total_size, 3); + std::fill_n(vals.get_data(), total_size, 1); dvals = vals; } diff --git a/test/matrix/fbcsr_kernels.cpp b/test/matrix/fbcsr_kernels.cpp index e987bde2ccb..b5169ebfa13 100644 --- a/test/matrix/fbcsr_kernels.cpp +++ b/test/matrix/fbcsr_kernels.cpp @@ -126,6 +126,9 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -148,6 +151,9 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) using Mtx = typename TestFixture::Mtx; using Dense = typename TestFixture::Dense; using value_type = typename Mtx::value_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); @@ -171,6 +177,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 1)); @@ -201,6 +210,9 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) using Dense = typename TestFixture::Dense; using value_type = typename TestFixture::value_type; using real_type = typename TestFixture::real_type; + if (this->exec->get_master() != this->exec) { + SKIP_IF_HALF(value_type); + } auto drand = gko::clone(this->exec, this->rsorted); auto x = Dense::create(this->ref, gko::dim<2>(this->rsorted->get_size()[1], 3)); diff --git a/test/matrix/fft_kernels.cpp 
b/test/matrix/fft_kernels.cpp index 5b2c33085e3..b2dc9f7f672 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -91,7 +91,7 @@ class Fft : public CommonTestFixture { }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, TypenameNameGenerator); TYPED_TEST(Fft, Apply1DIsEqualToReference) From 684cadb77a8632415a169c5ae2ade16db874eaaf Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 13 Jun 2023 16:52:41 +0200 Subject: [PATCH 27/62] fix/skip half test and fix numeric_limit on device --- common/cuda_hip/base/math.hpp | 7 +++ .../cuda_hip/matrix/csr_kernels.template.cpp | 4 +- core/distributed/vector.cpp | 4 +- core/log/papi.cpp | 3 +- core/solver/idr.cpp | 9 ++++ core/test/base/extended_float.cpp | 6 ++- core/test/utils.hpp | 2 +- core/test/utils/matrix_generator_test.cpp | 13 +++-- cuda/CMakeLists.txt | 1 + cuda/matrix/fft_kernels.cu | 6 ++- dpcpp/matrix/csr_kernels.dp.cpp | 41 +++++++------- .../test/preconditioner/jacobi_kernels.dp.cpp | 21 ++++---- hip/test/matrix/fbcsr_kernels.cpp | 53 +++++++++++++------ include/ginkgo/core/base/half.hpp | 13 +++-- .../ginkgo/core/base/precision_dispatch.hpp | 3 +- omp/matrix/csr_kernels.cpp | 4 +- omp/matrix/fft_kernels.cpp | 6 ++- omp/solver/cb_gmres_kernels.cpp | 3 +- reference/matrix/csr_kernels.cpp | 4 +- reference/matrix/fft_kernels.cpp | 6 ++- reference/solver/cb_gmres_kernels.cpp | 3 +- reference/test/factorization/lu_kernels.cpp | 6 +-- .../test/preconditioner/isai_kernels.cpp | 7 +++ .../test/preconditioner/jacobi_kernels.cpp | 16 ++++-- reference/test/reorder/scaled_reordered.cpp | 8 +++ reference/test/solver/bicgstab_kernels.cpp | 19 ++++--- reference/test/solver/cgs_kernels.cpp | 10 ++++ reference/test/solver/gcr_kernels.cpp | 13 +++-- reference/test/solver/gmres_kernels.cpp | 1 + reference/test/solver/idr_kernels.cpp | 16 ++++-- test/base/device_matrix_data_kernels.cpp | 3 +- 
test/factorization/par_ic_kernels.cpp | 5 +- test/factorization/par_ict_kernels.cpp | 2 + test/factorization/par_ilu_kernels.cpp | 6 ++- test/factorization/par_ilut_kernels.cpp | 4 ++ test/matrix/fft_kernels.cpp | 3 +- test/mpi/matrix.cpp | 4 +- test/mpi/vector.cpp | 15 +++--- 38 files changed, 231 insertions(+), 119 deletions(-) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index fa8ba747c27..cf746206f46 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -23,6 +23,13 @@ struct device_numeric_limits { static constexpr auto min = std::numeric_limits::min(); }; +template <> +struct device_numeric_limits<__half> { + static constexpr auto inf = std::numeric_limits::infinity(); + static constexpr auto max = std::numeric_limits::max(); + static constexpr auto min = std::numeric_limits::min(); +}; + namespace detail { diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index 757e689a777..76e77884d8b 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -278,7 +278,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { return static_cast(scale_factor * x); @@ -486,7 +486,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce( const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ alpha, acc::range c) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { 
return alpha_val * x; }); diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp index 9dea0c4d30b..1f830fdb6a4 100644 --- a/core/distributed/vector.cpp +++ b/core/distributed/vector.cpp @@ -651,8 +651,8 @@ ValueType& Vector::at_local(size_type row, size_type col) noexcept template -ValueType Vector::at_local(size_type row, size_type col) const - noexcept +ValueType Vector::at_local(size_type row, + size_type col) const noexcept { return local_.at(row, col); } diff --git a/core/log/papi.cpp b/core/log/papi.cpp index e5aa588aee1..4b2b8e4d819 100644 --- a/core/log/papi.cpp +++ b/core/log/papi.cpp @@ -219,8 +219,7 @@ void Papi::on_criterion_check_completed( auto tmp_res_norm = Vector::create( residual->get_executor(), dim<2>{1, residual->get_size()[1]}); dense_r->compute_norm2(tmp_res_norm); - residual_norm_d = - static_cast(real(tmp_res_norm->at(0, 0))); + residual_norm_d = static_cast(real(tmp_res_norm->at(0, 0))); }); } diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index c6d89b84ea6..469af04d5c6 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,10 @@ std::unique_ptr Idr::transpose() const .with_generated_preconditioner( share(as(this->get_preconditioner())->transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + .with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate( share(as(this->get_system_matrix())->transpose())); @@ -78,6 +83,10 @@ std::unique_ptr Idr::conj_transpose() const .with_generated_preconditioner(share( as(this->get_preconditioner())->conj_transpose())) .with_criteria(this->get_stop_criterion_factory()) + .with_subspace_dim(this->get_subspace_dim()) + .with_kappa(this->get_kappa()) + .with_deterministic(this->get_deterministic()) + 
.with_complex_subspace(this->get_complex_subspace()) .on(this->get_executor()) ->generate(share( as(this->get_system_matrix())->conj_transpose())); diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index bd9ff9afe81..1828b0c027b 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -5,11 +5,13 @@ #include "core/base/extended_float.hpp" #include +#include #include #include -#include -#include "ginkgo/core/base/math.hpp" + + +#include namespace { diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 7f6543bf07b..62ab95e9656 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -503,7 +503,7 @@ using next_precision = typename detail::next_precision_impl::type; #define SKIP_IF_HALF(type) \ if (std::is_same, gko::half>::value) { \ - GTEST_SKIP() << "Skip due to single mode"; \ + GTEST_SKIP() << "Skip due to half mode"; \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index 2b3ce7fa501..af268af1471 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -249,7 +249,7 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = gko::test::detail::get_rand_value(dist, engine); auto diag = gko::test::detail::get_rand_value(dist, engine); @@ -273,18 +273,23 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix) { using T = typename TestFixture::value_type; using Dense = typename TestFixture::mtx_type; - auto dist = std::normal_distribution>(0, 1); + auto dist = std::normal_distribution<>(0, 1); auto engine = std::default_random_engine(42); auto lower = 
gko::test::detail::get_rand_value(dist, engine); auto upper = gko::test::detail::get_rand_value(dist, engine); // make diagonally dominant auto diag = std::abs(gko::test::detail::get_rand_value(dist, engine)) + std::abs(lower) + std::abs(upper); + gko::size_type size = 50; + if (std::is_same>::value) { + // half precision can only handle small matrix + size = 5; + } auto mtx = gko::test::generate_tridiag_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto inv_mtx = gko::test::generate_tridiag_inverse_matrix( - 50, {lower, diag, upper}, this->exec); + size, {lower, diag, upper}, this->exec); auto result = Dense::create(this->exec, mtx->get_size()); inv_mtx->apply(mtx, result); diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 000cb7b215f..ad7de1b6d9c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -54,6 +54,7 @@ if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") target_compile_options(ginkgo_cuda PRIVATE $<$:--extended-lambda>) + target_compile_options(ginkgo_cuda PRIVATE -Xcompiler="/bigobj") else() target_compile_options(ginkgo_cuda PRIVATE diff --git a/cuda/matrix/fft_kernels.cu b/cuda/matrix/fft_kernels.cu index ba84c8a8d3c..a4db546bcae 100644 --- a/cuda/matrix/fft_kernels.cu +++ b/cuda/matrix/fft_kernels.cu @@ -136,7 +136,8 @@ void fft2(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -152,7 +153,8 @@ void fft3(std::shared_ptr exec, handle.execute(b->get_const_values(), x->get_values(), inverse); } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 
5d72e767693..4ecb2547d7c 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -267,7 +267,7 @@ void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = alpha[0]; + const arithmetic_type scale_factor = static_cast(alpha[0]); spmv_kernel( nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { @@ -480,8 +480,8 @@ void abstract_merge_path_spmv( sycl::nd_item<3> item_ct1, IndexType* shared_row_ptrs) { using type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); merge_path_spmv( num_rows, val, col_idxs, row_ptrs, srow, b, c, row_out, val_out, [&alpha_val](const type& x) { return alpha_val * x; }, @@ -567,7 +567,7 @@ void abstract_reduce( uninitialized_array& tmp_ind, uninitialized_array& tmp_val) { - const arithmetic_type alpha_val = alpha[0]; + const arithmetic_type alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }, @@ -664,13 +664,13 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, - c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, c, item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -695,8 +695,8 @@ void abstract_classical_spmv( acc::range c, sycl::nd_item<3> item_ct1) { using 
type = typename output_accessor::arithmetic_type; - const type alpha_val = alpha[0]; - const type beta_val = beta[0]; + const type alpha_val = static_cast(alpha[0]); + const type beta_val = static_cast(beta[0]); device_classical_spmv( num_rows, val, col_idxs, row_ptrs, b, c, [&alpha_val, &beta_val](const type& x, const type& y) { @@ -718,13 +718,14 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) - [[sycl::reqd_sub_group_size(subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, - row_ptrs, b, beta, c, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, + item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index b8950ed2d2a..36179402262 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -62,7 +62,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -70,8 +70,7 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution(-1, 1), - engine)); + size, cond, std::normal_distribution<>(-1, 1), engine)); } mtx = Mtx::create(ref); 
mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -107,11 +106,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution(0.0, 1.0), engine, ref); + std::normal_distribution<>(0.0, 1.0), engine, ref); d_x = gko::clone(dpcpp, x); } @@ -409,7 +408,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref)); + std::normal_distribution<>(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -453,7 +452,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution(1.0, 2.0), engine); + std::normal_distribution<>(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -461,12 +460,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution(0.0, 1.0), engine, ref, - gko::dim<2>(dim, 3), 4)); + 
std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), + 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); diff --git a/hip/test/matrix/fbcsr_kernels.cpp b/hip/test/matrix/fbcsr_kernels.cpp index 0b4b16086ca..0bed7e7c13e 100644 --- a/hip/test/matrix/fbcsr_kernels.cpp +++ b/hip/test/matrix/fbcsr_kernels.cpp @@ -8,6 +8,7 @@ #include +#include #include #include @@ -41,7 +42,7 @@ class Fbcsr : public HipTestFixture { std::unique_ptr rsorted_ref; - std::normal_distribution> distb; + std::normal_distribution<> distb; std::default_random_engine engine; value_type get_random_value() @@ -145,11 +146,15 @@ TYPED_TEST(Fbcsr, SpmvIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 1)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -169,11 +174,15 @@ TYPED_TEST(Fbcsr, SpmvMultiIsEquivalentToRefSorted) this->ref, gko::dim<2>(this->rsorted_ref->get_size()[0], 3)); auto prod_hip = Dense::create(this->exec, prod_ref->get_size()); - rand_hip->apply(x_hip, prod_hip); - this->rsorted_ref->apply(x_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(x_hip, prod_hip), gko::NotImplemented); + } else { + rand_hip->apply(x_hip, prod_hip); + this->rsorted_ref->apply(x_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -205,11 +214,16 @@ TYPED_TEST(Fbcsr, 
AdvancedSpmvIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } @@ -241,11 +255,16 @@ TYPED_TEST(Fbcsr, AdvancedSpmvMultiIsEquivalentToRefSorted) auto beta = Dense::create(this->exec); beta->copy_from(beta_ref); - rand_hip->apply(alpha, x_hip, beta, prod_hip); - this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); + if (std::is_same::value) { + ASSERT_THROW(rand_hip->apply(alpha, x_hip, beta, prod_hip), + gko::NotImplemented); + } else { + rand_hip->apply(alpha, x_hip, beta, prod_hip); + this->rsorted_ref->apply(alpha_ref, x_ref, beta_ref, prod_ref); - const double tol = r::value; - GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + const double tol = r::value; + GKO_ASSERT_MTX_NEAR(prod_ref, prod_hip, 5 * tol); + } } diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 6b21646c66c..de749d74222 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -522,17 +522,22 @@ class complex { typename = std::enable_if_t::value && std::is_scalar::value>> explicit complex(const T& real, const U& imag) - : complex(static_cast(real), static_cast(imag)) + : real_(static_cast(real)), + imag_(static_cast(imag)) {} template ::value>> - complex(const T& real) : complex(static_cast(real)) + complex(const T& real) + : real_(static_cast(real)), + imag_(static_cast(0.f)) {} + // When using complex(real, imag), MSVC with CUDA try to recognize the + // 
complex is a member not constructor. template ::value>> explicit complex(const complex& other) - : complex(static_cast(other.real()), - static_cast(other.imag())) + : real_(static_cast(other.real())), + imag_(static_cast(other.imag())) {} // explicit complex(const complex& other) = default; diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index 9d8ec1c9cb3..c677f6269a4 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -49,7 +49,8 @@ make_temporary_conversion(Ptr&& matrix) using Pointee = detail::pointee; using Dense = matrix::Dense; using NextDense = matrix::Dense>; - using NextNextDense = matrix::Dense>>; + using NextNextDense = + matrix::Dense>>; using MaybeConstDense = std::conditional_t::value, const Dense, Dense>; auto result = detail::temporary_conversion< diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp index 8e47caef520..9aa21b3efdb 100644 --- a/omp/matrix/csr_kernels.cpp +++ b/omp/matrix/csr_kernels.cpp @@ -95,8 +95,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/omp/matrix/fft_kernels.cpp b/omp/matrix/fft_kernels.cpp index ca1f21c36b1..13242920bac 100644 --- a/omp/matrix/fft_kernels.cpp +++ b/omp/matrix/fft_kernels.cpp @@ -190,7 +190,8 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -295,7 +296,8 @@ void fft3(std::shared_ptr exec, } } 
-GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/omp/solver/cb_gmres_kernels.cpp b/omp/solver/cb_gmres_kernels.cpp index c60e848d501..e295e72a3b4 100644 --- a/omp/solver/cb_gmres_kernels.cpp +++ b/omp/solver/cb_gmres_kernels.cpp @@ -330,7 +330,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp index be97da442a1..556862df791 100644 --- a/reference/matrix/csr_kernels.cpp +++ b/reference/matrix/csr_kernels.cpp @@ -94,8 +94,8 @@ void advanced_spmv(std::shared_ptr exec, auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - arithmetic_type valpha = alpha->at(0, 0); - arithmetic_type vbeta = beta->at(0, 0); + arithmetic_type valpha = static_cast(alpha->at(0, 0)); + arithmetic_type vbeta = static_cast(beta->at(0, 0)); const auto a_vals = acc::helper::build_const_rrm_accessor(a); diff --git a/reference/matrix/fft_kernels.cpp b/reference/matrix/fft_kernels.cpp index e8617592265..6ea8ccfa812 100644 --- a/reference/matrix/fft_kernels.cpp +++ b/reference/matrix/fft_kernels.cpp @@ -183,7 +183,8 @@ void fft2(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT2_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT2_KERNEL); template @@ -283,7 +284,8 @@ void fft3(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF(GKO_DECLARE_FFT3_KERNEL); +GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE_NO_HALF( + GKO_DECLARE_FFT3_KERNEL); } // namespace fft diff --git a/reference/solver/cb_gmres_kernels.cpp b/reference/solver/cb_gmres_kernels.cpp index 
e0c5ea22b1c..53ec0024064 100644 --- a/reference/solver/cb_gmres_kernels.cpp +++ b/reference/solver/cb_gmres_kernels.cpp @@ -294,7 +294,8 @@ void initialize(std::shared_ptr exec, } } -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF(GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_NO_HALF( + GKO_DECLARE_CB_GMRES_INITIALIZE_KERNEL); template diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index f4a8b240b38..0c31a37ed45 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ b/reference/test/factorization/lu_kernels.cpp @@ -219,7 +219,7 @@ TYPED_TEST(Lu, KernelFactorizeWorks) diag_idxs.get_const_data(), this->mtx_lu.get(), tmp); GKO_ASSERT_MTX_NEAR(this->mtx_lu, mtx_lu_ref, - 15 * r::value); + 30 * r::value); }); } @@ -294,7 +294,7 @@ TYPED_TEST(Lu, FactorizeNearSymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); @@ -321,7 +321,7 @@ TYPED_TEST(Lu, FactorizeWithKnownSparsityWorks) auto lu = factory->generate(this->mtx); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); diff --git a/reference/test/preconditioner/isai_kernels.cpp b/reference/test/preconditioner/isai_kernels.cpp index e989125c61d..a72213179aa 100644 --- a/reference/test/preconditioner/isai_kernels.cpp +++ b/reference/test/preconditioner/isai_kernels.cpp @@ -1005,6 +1005,8 @@ TYPED_TEST(Isai, ReturnsCorrectInverseA) TYPED_TEST(Isai, ReturnsCorrectInverseALongrow) { using value_type = typename TestFixture::value_type; + // TODO: figure out whether relaxed residual norm works in half or not. 
+ SKIP_IF_HALF(value_type); const auto isai = this->general_isai_factory->generate(this->a_csr_longrow); auto a_inv = isai->get_approximate_inverse(); @@ -1021,6 +1023,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseALongrowWithExcessSolver) { using value_type = typename TestFixture::value_type; using GeneralIsai = typename TestFixture::GeneralIsai; + SKIP_IF_HALF(value_type); auto general_isai_factory = GeneralIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1068,6 +1071,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseLLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using LowerIsai = typename TestFixture::LowerIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto lower_isai_factory = LowerIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1115,6 +1119,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseULongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using UpperIsai = typename TestFixture::UpperIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto upper_isai_factory = UpperIsai::build() .with_excess_solver_factory(this->excess_solver_factory) @@ -1215,6 +1220,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrow) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto isai = this->spd_isai_factory->generate(this->spd_csr_longrow); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); @@ -1238,6 +1244,7 @@ TYPED_TEST(Isai, ReturnsCorrectInverseSpdLongrowWithExcessSolver) using Csr = typename TestFixture::Csr; using SpdIsai = typename TestFixture::SpdIsai; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); const auto expected_transpose = gko::as(this->spd_csr_longrow_inv->transpose()); auto spd_isai_factory = diff --git a/reference/test/preconditioner/jacobi_kernels.cpp 
b/reference/test/preconditioner/jacobi_kernels.cpp index c038dc475b4..1344dad7166 100644 --- a/reference/test/preconditioner/jacobi_kernels.cpp +++ b/reference/test/preconditioner/jacobi_kernels.cpp @@ -561,11 +561,14 @@ TYPED_TEST(Jacobi, SelectsCorrectBlockPrecisions) auto prec = bj->get_parameters().storage_optimization.block_wise.get_const_data(); - auto precision2 = std::is_same, float>::value - ? gko::precision_reduction(0, 0) // float - : gko::precision_reduction(0, 1); // double - EXPECT_EQ(prec[0], gko::precision_reduction(0, 2)); // u * cond = ~1.2e-3 - ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 + auto precision1 = std::is_same, gko::half>::value + ? gko::precision_reduction(2, 0) + : gko::precision_reduction(0, 2); + auto precision2 = std::is_same, double>::value + ? gko::precision_reduction(0, 1) // double + : gko::precision_reduction(0, 0); // float, half + EXPECT_EQ(prec[0], precision1); // u * cond = ~1.2e-3 + ASSERT_EQ(prec[1], precision2); // u * cond = ~2.0e-3 } @@ -606,6 +609,9 @@ TYPED_TEST(Jacobi, AvoidsPrecisionsThatOverflow) auto precision = std::is_same, float>::value ? 
gko::precision_reduction(0, 2) // float : gko::precision_reduction(1, 1); // double + if (std::is_same, gko::half>::value) { + precision = gko::precision_reduction(2, 0); + } EXPECT_EQ(prec[0], precision); ASSERT_EQ(prec[1], precision); } diff --git a/reference/test/reorder/scaled_reordered.cpp b/reference/test/reorder/scaled_reordered.cpp index aa6e963e6b1..57f9bfde4bb 100644 --- a/reference/test/reorder/scaled_reordered.cpp +++ b/reference/test/reorder/scaled_reordered.cpp @@ -364,6 +364,8 @@ TYPED_TEST(ScaledReordered, AppliesWithRcmReordering) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithOnlyInnerOperator) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build().with_inner_operator(this->solver_factory).on(this->exec); auto scaled_reordered = scaled_reordered_fact->generate(this->rcm_mtx); @@ -410,6 +412,8 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithColScaling) TYPED_TEST(ScaledReordered, SolvesSingleRhsWithRcmReordering) { using SR = typename TestFixture::SR; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto scaled_reordered_fact = SR::build() .with_reordering(this->rcm_factory) .with_inner_operator(this->solver_factory) @@ -446,6 +450,7 @@ TYPED_TEST(ScaledReordered, SolvesSingleRhsWithScalingAndRcmReorderingMixed) using SR = typename TestFixture::SR; using T = typename TestFixture::value_type; using Vec = gko::matrix::Dense>; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) @@ -467,6 +472,8 @@ TYPED_TEST(ScaledReordered, AdvancedSolvesSingleRhsWithScalingAndRcmReordering) { using SR = typename TestFixture::SR; using Vec = typename TestFixture::Vec; + using T = typename TestFixture::value_type; + SKIP_IF_HALF(T); const auto alpha = gko::initialize({2.0}, this->exec); const auto beta = gko::initialize({-1.0}, this->exec); auto 
scaled_reordered_fact = SR::build() @@ -491,6 +498,7 @@ TYPED_TEST(ScaledReordered, using T = typename TestFixture::value_type; using value_type = next_precision; using Vec = gko::matrix::Dense; + SKIP_IF_HALF(T); auto scaled_reordered_fact = SR::build() .with_row_scaling(this->diag2) .with_col_scaling(this->diag3) diff --git a/reference/test/solver/bicgstab_kernels.cpp b/reference/test/solver/bicgstab_kernels.cpp index e2974d664c4..1813f916488 100644 --- a/reference/test/solver/bicgstab_kernels.cpp +++ b/reference/test/solver/bicgstab_kernels.cpp @@ -489,7 +489,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApply) solver->apply(alpha, b, beta, x); - GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), r::value); + GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), 2 * r::value); } @@ -506,7 +506,7 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixed) solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, l({-8.5, -3.0, 6.0}), - (r_mixed())); + (2 * r_mixed())); } @@ -522,14 +522,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), r::value); } @@ -547,14 +547,14 @@ TYPED_TEST(Bicgstab, SolvesDenseSystemUsingAdvancedApplyMixedComplex) {value_type{-1.0, 2.0}, value_type{3.0, -6.0}, value_type{1.0, -2.0}}, this->exec); auto x = gko::initialize( - {value_type{0.5, -1.0}, value_type{1.0, -2.0}, value_type{2.0, -4.0}}, + {value_type{0.5, -0.5}, value_type{1.0, 0.5}, value_type{2.0, -1.0}}, this->exec); solver->apply(alpha, b, beta, x); GKO_ASSERT_MTX_NEAR(x, - 
l({value_type{-8.5, 17.0}, value_type{-3.0, 6.0}, - value_type{6.0, -12.0}}), + l({value_type{-8.5, 16.5}, value_type{-3.0, 3.5}, + value_type{6.0, -15.0}}), (r_mixed())); } @@ -585,6 +585,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -613,6 +614,7 @@ TYPED_TEST(Bicgstab, SolvesBigDenseSystemForDivergenceCheck2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, @@ -642,6 +644,7 @@ TYPED_TEST(Bicgstab, SolvesMultipleDenseSystemsDivergenceCheck) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(value_type); std::shared_ptr locmtx = gko::initialize({{-19.0, 47.0, -41.0, 35.0, -21.0, 71.0}, {-8.0, -66.0, 29.0, -96.0, -95.0, -14.0}, diff --git a/reference/test/solver/cgs_kernels.cpp b/reference/test/solver/cgs_kernels.cpp index 7df07a770bd..f850f7bd202 100644 --- a/reference/test/solver/cgs_kernels.cpp +++ b/reference/test/solver/cgs_kernels.cpp @@ -406,6 +406,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyComplex) using Scalar = typename TestFixture::Mtx; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. 
+ SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -431,6 +433,8 @@ TYPED_TEST(Cgs, SolvesDenseSystemUsingAdvancedApplyMixedComplex) gko::matrix::Dense>; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; + // different initial guess leads complex divergent. + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -475,6 +479,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -491,6 +496,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystemWithImplicitResNormCrit) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big2->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -507,6 +513,7 @@ TYPED_TEST(Cgs, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b = gko::initialize( {17356.0, 5466.0, 748.0, -456.0, 3434.0, -7020.0}, this->exec); @@ -523,6 +530,7 @@ TYPED_TEST(Cgs, SolvesMultipleDenseSystems) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -589,6 +597,7 @@ TYPED_TEST(Cgs, 
SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {764.0, -4032.0, -11855.0, 7111.0, -12765.0, -4589}, this->exec); @@ -605,6 +614,7 @@ TYPED_TEST(Cgs, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->cgs_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index a81c3ce4285..d3f29fffee9 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -234,7 +234,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixed) solver->apply(b.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.0, 3.0, 2.0}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -279,7 +279,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemMixedComplex) GKO_ASSERT_MTX_NEAR(x, l({value_type{1.0, -2.0}, value_type{3.0, -6.0}, value_type{2.0, -4.0}}), - (r_mixed())); + (r_mixed() * 1e1)); } @@ -330,7 +330,7 @@ TYPED_TEST(Gcr, SolvesStencilSystemUsingAdvancedApplyMixed) solver->apply(alpha.get(), b.get(), beta.get(), x.get()); GKO_ASSERT_MTX_NEAR(x, l({1.5, 5.0, 2.0}), - (r_mixed()) * 1e1); + (r_mixed()) * 1e2); } @@ -409,6 +409,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -426,6 +427,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem2) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = 
this->gcr_factory_big->generate(this->mtx_big); auto b = gko::initialize( {175352.10, 313410.50, 131114.10, -134116.30, 179529.30, -43564.90}, @@ -471,6 +473,7 @@ TYPED_TEST(Gcr, SolvesMultipleDenseSystemForDivergenceCheck) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big); auto b1 = gko::initialize( {1300083.0, 1018120.5, 906410.0, -42679.5, 846779.5, 1176858.5}, @@ -537,6 +540,7 @@ TYPED_TEST(Gcr, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gcr_factory_restart = Solver::build() @@ -562,6 +566,7 @@ TYPED_TEST(Gcr, SolvesWithPreconditioner) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto gcr_factory_preconditioner = Solver::build() .with_criteria(gko::stop::Iteration::build().with_max_iters(100u), @@ -588,6 +593,7 @@ TYPED_TEST(Gcr, SolvesTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->transpose()); auto b = gko::initialize( {72748.36, 297469.88, 347229.24, 36290.66, 82958.82, -80192.15}, @@ -605,6 +611,7 @@ TYPED_TEST(Gcr, SolvesConjTransposedBigDenseSystem) { using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto solver = this->gcr_factory_big->generate(this->mtx_big->conj_transpose()); auto b = gko::initialize( diff --git a/reference/test/solver/gmres_kernels.cpp b/reference/test/solver/gmres_kernels.cpp index 7d30109af1b..0b013719040 100644 --- a/reference/test/solver/gmres_kernels.cpp +++ 
b/reference/test/solver/gmres_kernels.cpp @@ -736,6 +736,7 @@ TYPED_TEST(Gmres, SolvesBigDenseSystem1WithRestart) using Mtx = typename TestFixture::Mtx; using Solver = typename TestFixture::Solver; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto half_tol = std::sqrt(r::value); auto gmres_factory_restart = Solver::build() diff --git a/reference/test/solver/idr_kernels.cpp b/reference/test/solver/idr_kernels.cpp index c8a2e45d14e..ff4831a5712 100644 --- a/reference/test/solver/idr_kernels.cpp +++ b/reference/test/solver/idr_kernels.cpp @@ -57,7 +57,10 @@ class Idr : public ::testing::Test { std::unique_ptr idr_factory_precision; }; -TYPED_TEST_SUITE(Idr, gko::test::ValueTypes, TypenameNameGenerator); +// Solves((Conj)Trans)DenseSystem((Mixed)Complex) does not work in some default +// random generator from different environments. All tests will SKIP half, so we +// do not test half here. +TYPED_TEST_SUITE(Idr, gko::test::ValueTypesNoHalf, TypenameNameGenerator); TYPED_TEST(Idr, SolvesDenseSystem) @@ -76,7 +79,8 @@ TYPED_TEST(Idr, SolvesDenseSystem) TYPED_TEST(Idr, SolvesDenseSystemMixed) { - using value_type = next_precision; + using T = typename TestFixture::value_type; + using value_type = next_precision; using Mtx = gko::matrix::Dense; auto solver = this->idr_factory->generate(this->mtx); auto b = gko::initialize({-1.0, 3.0, 1.0}, this->exec); @@ -91,6 +95,7 @@ TYPED_TEST(Idr, SolvesDenseSystemMixed) TYPED_TEST(Idr, SolvesDenseSystemComplex) { + using T = typename TestFixture::value_type; using Mtx = gko::to_complex; using value_type = typename Mtx::value_type; auto solver = this->idr_factory->generate(this->mtx); @@ -112,8 +117,8 @@ TYPED_TEST(Idr, SolvesDenseSystemComplex) TYPED_TEST(Idr, SolvesDenseSystemMixedComplex) { - using value_type = - gko::to_complex>; + using T = typename TestFixture::value_type; + using value_type = gko::to_complex>; using Mtx = gko::matrix::Dense; auto solver = 
this->idr_factory->generate(this->mtx); auto b = gko::initialize( @@ -137,6 +142,7 @@ TYPED_TEST(Idr, SolvesDenseSystemWithComplexSubSpace) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using Solver = typename TestFixture::Solver; + // intermediate value is too small to represent in half auto half_tol = std::sqrt(r::value); auto solver_factory = Solver::build() @@ -233,6 +239,7 @@ TYPED_TEST(Idr, SolvesDenseSystemUsingAdvancedApplyMixed) { using value_type = next_precision; using Mtx = gko::matrix::Dense; + SKIP_IF_HALF(typename TestFixture::value_type); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); auto beta = gko::initialize({-1.0}, this->exec); @@ -300,6 +307,7 @@ TYPED_TEST(Idr, SolvesMultipleDenseSystemsUsingAdvancedApply) using Mtx = typename TestFixture::Mtx; using value_type = typename TestFixture::value_type; using T = value_type; + SKIP_IF_HALF(T); auto half_tol = std::sqrt(r::value); auto solver = this->idr_factory->generate(this->mtx); auto alpha = gko::initialize({2.0}, this->exec); diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 5783274f211..9f2abd2730e 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -35,8 +35,7 @@ class DeviceMatrixData : public CommonTestFixture { 0, host_data.size[0] - 1); std::uniform_int_distribution col_distr( 0, host_data.size[1] - 1); - std::uniform_real_distribution<> - val_distr(1.0, 2.0); + std::uniform_real_distribution<> val_distr(1.0, 2.0); // add random entries for (int i = 0; i < 1000; i++) { host_data.nonzeros.emplace_back( diff --git a/test/factorization/par_ic_kernels.cpp b/test/factorization/par_ic_kernels.cpp index 08bca8be499..7cb421e94b0 100644 --- a/test/factorization/par_ic_kernels.cpp +++ b/test/factorization/par_ic_kernels.cpp @@ -41,8 +41,7 @@ class ParIc : public CommonTestFixture { mtx_l = 
gko::test::generate_random_lower_triangular_matrix( mtx_size[0], false, std::uniform_int_distribution(10, mtx_size[0]), - std::normal_distribution<>(0, 10.0), - rand_engine, ref); + std::normal_distribution<>(0, 10.0), rand_engine, ref); dmtx_ani = Csr::create(exec); dmtx_l_ani = Csr::create(exec); dmtx_l_ani_init = Csr::create(exec); @@ -107,6 +106,8 @@ TYPED_TEST(ParIc, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ict_kernels.cpp b/test/factorization/par_ict_kernels.cpp index 62949c243c2..5d7400c6814 100644 --- a/test/factorization/par_ict_kernels.cpp +++ b/test/factorization/par_ict_kernels.cpp @@ -123,6 +123,8 @@ TYPED_TEST(ParIct, KernelComputeFactorIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); this->mtx_l_ani->convert_to(mtx_l_coo); diff --git a/test/factorization/par_ilu_kernels.cpp b/test/factorization/par_ilu_kernels.cpp index 508cd911f56..27bbdcc480c 100644 --- a/test/factorization/par_ilu_kernels.cpp +++ b/test/factorization/par_ilu_kernels.cpp @@ -59,8 +59,7 @@ class ParIlu : public CommonTestFixture { return gko::test::generate_random_matrix( num_rows, num_cols, std::uniform_int_distribution(0, num_cols - 1), - std::normal_distribution<>(0.0, 1.0), - rand_engine, ref); + std::normal_distribution<>(0.0, 1.0), rand_engine, ref); } std::unique_ptr gen_unsorted_mtx(index_type num_rows, @@ -237,6 +236,8 @@ TYPED_TEST(ParIlu, KernelInitializeParILUIsEquivalentToRef) TYPED_TEST(ParIlu, 
KernelComputeParILUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; @@ -255,6 +256,7 @@ TYPED_TEST(ParIlu, KernelComputeParILUWithMoreIterationsIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); std::unique_ptr l_mtx{}; std::unique_ptr u_mtx{}; std::unique_ptr dl_mtx{}; diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index 4c4167a5efa..0aaac36e4b3 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -365,6 +365,8 @@ TYPED_TEST(ParIlut, KernelAddCandidatesIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using value_type = typename TestFixture::value_type; + // there's one value larger than half range + SKIP_IF_HALF(value_type); auto square_size = this->mtx_square->get_size(); auto mtx_lu = Csr::create(this->ref, square_size); this->mtx_l2->apply(this->mtx_u, mtx_lu); @@ -393,6 +395,8 @@ TYPED_TEST(ParIlut, KernelComputeLUIsEquivalentToRef) { using Csr = typename TestFixture::Csr; using Coo = typename TestFixture::Coo; + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); auto square_size = this->mtx_ani->get_size(); auto mtx_l_coo = Coo::create(this->ref, square_size); auto mtx_u_coo = Coo::create(this->ref, square_size); diff --git a/test/matrix/fft_kernels.cpp b/test/matrix/fft_kernels.cpp index b2dc9f7f672..b6d832b7a38 100644 --- a/test/matrix/fft_kernels.cpp +++ b/test/matrix/fft_kernels.cpp @@ -91,7 +91,8 @@ class Fft : public CommonTestFixture { }; -TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, TypenameNameGenerator); +TYPED_TEST_SUITE(Fft, gko::test::ComplexValueTypesNoHalf, + TypenameNameGenerator); TYPED_TEST(Fft, Apply1DIsEqualToReference) diff --git 
a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index d1b03fce8a1..759de58905d 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -682,7 +682,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); @@ -708,7 +708,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) using csr = typename TestFixture::local_matrix_type; using local_index_type = typename TestFixture::local_index_type; using global_index_type = typename TestFixture::global_index_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherDist = typename gko::experimental::distributed::Matrix< OtherT, local_index_type, global_index_type>; auto tmp = OtherDist::create(this->ref, this->comm); diff --git a/test/mpi/vector.cpp b/test/mpi/vector.cpp index 1e3cb1b5fce..b91fc5ca3b0 100644 --- a/test/mpi/vector.cpp +++ b/test/mpi/vector.cpp @@ -513,7 +513,7 @@ class VectorReductions : public CommonMpiTestFixture { std::default_random_engine engine; }; -TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypes, +TYPED_TEST_SUITE(VectorReductions, gko::test::ValueTypesNoHalf, TypenameNameGenerator); @@ -762,8 +762,7 @@ class VectorLocalOps : public CommonMpiTestFixture { local_size[0], local_size[1], std::uniform_int_distribution(local_size[1], local_size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); dist = DistVectorType::create(exec, comm, size, gko::clone(local)); } @@ -775,8 +774,7 @@ class VectorLocalOps : public CommonMpiTestFixture { alpha = gko::test::generate_random_matrix( 1, size[1], std::uniform_int_distribution(size[1], 
size[1]), - std::normal_distribution>(), engine, - exec); + std::normal_distribution<>(), engine, exec); } void init_complex_vectors() @@ -839,7 +837,7 @@ TYPED_TEST(VectorLocalOps, AdvancedApplyNotSupported) TYPED_TEST(VectorLocalOps, ConvertsToPrecision) { using T = typename TestFixture::value_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -855,7 +853,7 @@ TYPED_TEST(VectorLocalOps, ConvertsToPrecision) TYPED_TEST(VectorLocalOps, MovesToPrecision) { using T = typename TestFixture::value_type; - using OtherT = next_precision; + using OtherT = next_precision; using OtherVector = typename gko::experimental::distributed::Vector; auto local_tmp = OtherVector::local_vector_type::create(this->exec); auto tmp = OtherVector::create(this->exec, this->comm); @@ -970,8 +968,7 @@ TYPED_TEST(VectorLocalOps, FillSameAsLocal) { using value_type = typename TestFixture::value_type; auto value = gko::test::detail::get_rand_value( - std::normal_distribution>(), - this->engine); + std::normal_distribution<>(), this->engine); this->init_vectors(); this->x->fill(value); From 60767ed954b83a6462358703669d30b2824cfcc0 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. 
Tsai" Date: Wed, 21 Jun 2023 22:28:16 +0200 Subject: [PATCH 28/62] mkl csr does not support half --- dpcpp/matrix/csr_kernels.dp.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 4ecb2547d7c..61ff76325c5 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -1395,8 +1395,9 @@ bool try_general_sparselib_spmv(std::shared_ptr exec, const ValueType host_beta, matrix::Dense* c) { - bool try_sparselib = !is_complex(); - if (try_sparselib) { + constexpr bool try_sparselib = + !is_complex() && !std::is_same::value; + if constexpr (try_sparselib) { oneapi::mkl::sparse::matrix_handle_t mat_handle; oneapi::mkl::sparse::init_matrix_handle(&mat_handle); oneapi::mkl::sparse::set_csr_data( From d65255af495f591203c9ac5b9ce438b5e9d55eaf Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 7 Sep 2023 10:49:59 +0200 Subject: [PATCH 29/62] add half to batch_vector --- core/base/batch_multi_vector.cpp | 19 ++++++++++++++++ .../ginkgo/core/base/batch_multi_vector.hpp | 22 +++++++++++++++++-- .../test/base/batch_multi_vector_kernels.cpp | 4 ++-- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/core/base/batch_multi_vector.cpp b/core/base/batch_multi_vector.cpp index f4485377f25..4fb9eec6845 100644 --- a/core/base/batch_multi_vector.cpp +++ b/core/base/batch_multi_vector.cpp @@ -296,6 +296,25 @@ void MultiVector::move_to( } +#if GINKGO_ENABLE_HALF +template +void MultiVector::convert_to( + MultiVector>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void MultiVector::move_to( + MultiVector>>* result) +{ + this->convert_to(result); +} +#endif + + #define GKO_DECLARE_BATCH_MULTI_VECTOR(_type) class MultiVector<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR); diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp 
b/include/ginkgo/core/base/batch_multi_vector.hpp index d04e9562fce..7ccee45ebd3 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -52,10 +52,14 @@ template class MultiVector : public EnablePolymorphicObject>, public EnablePolymorphicAssignment>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + MultiVector>>>, +#endif public ConvertibleTo>> { friend class EnablePolymorphicObject; friend class MultiVector>; - friend class MultiVector>; + friend class MultiVector>; public: using EnablePolymorphicAssignment::convert_to; @@ -83,6 +87,20 @@ class MultiVector void move_to(MultiVector>* result) override; +#if GINKGO_ENABLE_HALF + friend class MultiVector>>; + using ConvertibleTo< + MultiVector>>>::convert_to; + using ConvertibleTo< + MultiVector>>>::move_to; + + void convert_to(MultiVector>>* + result) const override; + + void move_to(MultiVector>>* result) + override; +#endif + /** * Creates a mutable view (of matrix::Dense type) of one item of the Batch * MultiVector object. Does not perform any deep copies, but only returns a @@ -429,7 +447,7 @@ class MultiVector private: batch_dim<2> batch_size_; array values_; -}; +}; // namespace batch } // namespace batch diff --git a/reference/test/base/batch_multi_vector_kernels.cpp b/reference/test/base/batch_multi_vector_kernels.cpp index e673046a490..694ae491ef4 100644 --- a/reference/test/base/batch_multi_vector_kernels.cpp +++ b/reference/test/base/batch_multi_vector_kernels.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MultiVector, ConvertsToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->convert_to(tmp.get()); tmp->convert_to(res.get()); @@ -373,7 +373,7 @@ TYPED_TEST(MultiVector, MovesToPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? 
gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->mtx_1->move_to(tmp.get()); tmp->move_to(res.get()); From 5c0454f86270f58dca3d3639f3565042ae700faa Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 12 Sep 2023 17:02:42 +0200 Subject: [PATCH 30/62] fix hip thrust complex op, avoid const in nvhpc, reduce job in windows --- accessor/reference_helper.hpp | 6 ++++-- hip/base/types.hip.hpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/accessor/reference_helper.hpp b/accessor/reference_helper.hpp index a3a77352f8f..2581556e140 100644 --- a/accessor/reference_helper.hpp +++ b/accessor/reference_helper.hpp @@ -14,8 +14,10 @@ // CUDA TOOLKIT < 11 does not support constexpr in combination with // thrust::complex, which is why constexpr is only present in later versions -#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ - (__CUDACC_VER_MAJOR__ < 11) +// TODO: NVC++ constexpr +#if (defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \ + (__CUDACC_VER_MAJOR__ < 11)) || \ + (defined(__NVCOMPILER) && GINKGO_ENABLE_HALF) #define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index 4f31bafaac8..a52dfe0b239 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -78,7 +78,7 @@ GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ { \ - return thrust::complex{lhs} + thrust::complex(rhs); \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ } THRUST_HALF_FRIEND_OPERATOR(+, +=) From da15916daeb3eb1915ce1d95d543728c7f28c474 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 13 Sep 2023 16:21:12 +0200 Subject: [PATCH 31/62] fix nvc++ atomic, dpcpp half jacobi need to use value_type generator --- .../test/preconditioner/jacobi_kernels.dp.cpp | 23 ++++++++++--------- include/ginkgo/core/base/half.hpp | 16 ------------- include/ginkgo/core/base/math.hpp | 7 ------ include/ginkgo/core/base/types.hpp | 15 +++++------- omp/components/atomic.hpp | 9 ++++++++ 5 files changed, 27 insertions(+), 43 deletions(-) diff --git a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp index 36179402262..833d210c91a 100644 --- a/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp +++ b/dpcpp/test/preconditioner/jacobi_kernels.dp.cpp @@ -62,7 +62,7 @@ class Jacobi : public ::testing::Test { if (condition_numbers.size() == 0) { mtx = gko::test::generate_random_matrix( dim, dim, std::uniform_int_distribution<>(min_nnz, max_nnz), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); } else { std::vector blocks; for (gko::size_type i = 0; i < block_pointers.size() - 1; ++i) { @@ -70,7 +70,8 @@ class Jacobi : public ::testing::Test { begin(block_pointers)[i + 1] - begin(block_pointers)[i]; const auto cond = begin(condition_numbers)[i]; blocks.push_back(mtx_data::cond( - size, cond, std::normal_distribution<>(-1, 1), engine)); + size, cond, std::normal_distribution(-1, 1), + engine)); } mtx = Mtx::create(ref); mtx->read(mtx_data::diag(begin(blocks), end(blocks))); @@ -106,11 +107,11 @@ class Jacobi : public ::testing::Test { } b = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), engine, ref); d_b = gko::clone(dpcpp, b); x = gko::test::generate_random_matrix( dim, num_rhs, std::uniform_int_distribution<>(num_rhs, num_rhs), - std::normal_distribution<>(0.0, 1.0), engine, ref); + std::normal_distribution(0.0, 1.0), 
engine, ref); d_x = gko::clone(dpcpp, x); } @@ -400,7 +401,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -408,7 +409,7 @@ TEST_F(Jacobi, DpcppScalarApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref)); + std::normal_distribution(0.0, 1.0), engine, ref)); auto sx = Vec::create(ref, sb->get_size()); auto d_smtx = gko::share(Mtx::create(dpcpp)); @@ -452,7 +453,7 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) auto dense_data = gko::test::generate_random_matrix_data( dim, dim, std::uniform_int_distribution<>(1, dim), - std::normal_distribution<>(1.0, 2.0), engine); + std::normal_distribution(1.0, 2.0), engine); gko::utils::make_diag_dominant(dense_data); auto dense_smtx = gko::share(Vec::create(ref)); dense_smtx->read(dense_data); @@ -460,12 +461,12 @@ TEST_F(Jacobi, DpcppScalarLinearCombinationApplyEquivalentToRef) smtx->copy_from(dense_smtx); auto sb = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto sx = gko::share(gko::test::generate_random_matrix( dim, 3, std::uniform_int_distribution<>(1, 1), - std::normal_distribution<>(0.0, 1.0), engine, ref, gko::dim<2>(dim, 3), - 4)); + std::normal_distribution(0.0, 1.0), engine, ref, + gko::dim<2>(dim, 3), 4)); auto d_smtx = gko::share(gko::clone(dpcpp, smtx)); auto d_sb = gko::share(gko::clone(dpcpp, sb)); diff --git 
a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index de749d74222..218a487e1a4 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -42,9 +42,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include -#ifdef SYCL_LANGUAGE_VERSION -#include -#endif #ifdef __CUDA_ARCH__ @@ -322,14 +319,6 @@ struct precision_converter { } // namespace detail -// sycl::half miss the arithmetic operator to result float not half before 5.7 -// (2022-06). It leads ? half : half/half ambiguous The same issue is reported -// in https://github.com/intel/llvm/issues/6028 -#if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) -using half = sycl::half; -#else /** * A class providing basic support for half precision floating point types. * @@ -500,7 +489,6 @@ class half { uint16 data_; }; -#endif } // namespace gko @@ -662,9 +650,6 @@ class complex { value_type imag_; }; -#if !(defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7))) template <> struct numeric_limits { static constexpr bool is_specialized{true}; @@ -700,7 +685,6 @@ struct numeric_limits { } }; -#endif // complex using a template on operator= for any kind of complex, so we can // do full specialization for half diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index c4cc82acc08..3e9925b8a34 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -19,13 +19,6 @@ #include -// Using SYCL_LANGUAGE_VERSION will lead the mismatch sycl namespace from 6.0.0 -// when using dpcpp compiler without dpcpp module -#if GINKGO_DPCPP_MAJOR_VERSION -#include -#endif - - class __half; diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 49888907410..b5f87f1c96f 100644 --- 
a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -25,9 +25,6 @@ #include #endif // __HIPCC__ -#ifdef SYCL_LANGUAGE_VERSION -#include -#endif // Macros for handling different compilers / architectures uniformly #if defined(__CUDACC__) || defined(__HIPCC__) @@ -143,13 +140,13 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; -#if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || \ - (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) -using half = sycl::half; -#else +// #if defined(SYCL_LANGUAGE_VERSION) && \ +// (__LIBSYCL_MAJOR_VERSION > 5 || \ +// (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) +// using half = sycl::half; +// #else class half; -#endif +// #endif /** diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp index 1de9d298fa1..f5f0984e418 100644 --- a/omp/components/atomic.hpp +++ b/omp/components/atomic.hpp @@ -51,6 +51,14 @@ inline ResultType reinterpret(ValueType val) template <> void atomic_add(half& out, half val) { +#ifdef __NVCOMPILER +// With NVC++, using atomic capture on uint16 leads to the following error: +// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !* +#pragma omp critical + { + out += val; + } +#else // UB? uint16_t* address_as_converter = reinterpret_cast(&out); uint16_t old = *address_as_converter; @@ -64,6 +72,7 @@ void atomic_add(half& out, half val) *address_as_converter = (old == assumed) ? answer : old; } } while (assumed != old); +#endif } From 5f9e3ff94b77c54cdd5cbdf129b7b188aa426283 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M.
Tsai" Date: Thu, 14 Sep 2023 15:04:20 +0200 Subject: [PATCH 32/62] make half test optional --- core/test/utils.hpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 62ab95e9656..f7683bcbd28 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -321,12 +321,14 @@ using add_inner_wrapper_t = typename detail::add_inner_wrapper::type; -using RealValueTypes = +using RealValueTypes = ::testing::Types< +#if GINKGO_ENABLE_HALF + gko::half, +#endif #if GINKGO_DPCPP_SINGLE_MODE - ::testing::Types; -#else - ::testing::Types; + double, #endif + float>; using RealValueTypesNoHalf = #if GINKGO_DPCPP_SINGLE_MODE From fe455601115d987956bc536693d7f382c1bfa0b1 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Sat, 16 Sep 2023 00:58:00 +0200 Subject: [PATCH 33/62] nvhpc optimization/computation error workaround --- core/test/utils/matrix_generator.hpp | 8 ++++++-- include/ginkgo/core/base/half.hpp | 6 ++++-- omp/factorization/par_ilut_kernels.cpp | 7 ++++++- reference/factorization/par_ilut_kernels.cpp | 7 ++++++- reference/test/stop/residual_norm_kernels.cpp | 4 +++- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index 56ff38c520d..11a4729b95d 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -659,9 +659,13 @@ gko::matrix_data generate_tridiag_inverse_matrix_data( auto off_diag = i < j ? upper : lower; auto min_idx = std::min(i, j); auto max_idx = std::max(i, j); + // TODO: NVHPC requires explicitly casting to single precision + // from half. 
auto val = sign * - static_cast( - std::pow(off_diag, max_idx - min_idx)) * + static_cast(std::pow( + typename gko::detail::arth_type::type{ + off_diag}, + max_idx - min_idx)) * alpha[min_idx] * beta[max_idx + 1] / alpha.back(); md.nonzeros.emplace_back(i, j, val); } diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 218a487e1a4..8df7b14fec9 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -38,7 +38,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include #include #include @@ -327,7 +326,10 @@ struct precision_converter { */ class half { public: - GKO_ATTRIBUTES half() noexcept = default; + // TODO: NVHPC (host side) may not use zero initialization for the data + // member by default constructor in some cases. Not sure whether it is + // caused by something else in jacobi or isai. + GKO_ATTRIBUTES half() noexcept : data_(0){}; template ::value>> GKO_ATTRIBUTES half(const T val) diff --git a/omp/factorization/par_ilut_kernels.cpp b/omp/factorization/par_ilut_kernels.cpp index a24709e4f1a..4ba9449ccc1 100644 --- a/omp/factorization/par_ilut_kernels.cpp +++ b/omp/factorization/par_ilut_kernels.cpp @@ -181,7 +181,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it.
+ // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto total_histogram = reinterpret_cast(sample + bucket_count); diff --git a/reference/factorization/par_ilut_kernels.cpp b/reference/factorization/par_ilut_kernels.cpp index abef6e9b5f2..69e6e99ddb2 100644 --- a/reference/factorization/par_ilut_kernels.cpp +++ b/reference/factorization/par_ilut_kernels.cpp @@ -191,7 +191,12 @@ void threshold_filter_approx(std::shared_ptr exec, // pick splitters for (IndexType i = 0; i < bucket_count - 1; ++i) { // shift by one so we get upper bounds for the buckets - sample[i] = sample[(i + 1) * sampleselect_oversampling]; + // TODO FIXME: NVHPC 23.3 seems to handle assignment index with + // optimization wrongly on a custom class when IndexType is long. We set + // the index explicitly with volatile to solve it. + // https://godbolt.org/z/srYhGndKn + volatile auto index = (i + 1) * sampleselect_oversampling; + sample[i] = sample[index]; } // count elements per bucket auto histogram = reinterpret_cast(sample + bucket_count); diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 43b865796b7..443bd8afef8 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ b/reference/test/stop/residual_norm_kernels.cpp @@ -399,7 +399,9 @@ TYPED_TEST(ResidualNorm, SelfCalculatesAndWaitsTillResidualGoal) ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); - solution->at(0) = rhs_val - r::value * T{1.2}; + // TODO FIXME: NVHPC calculates different result of rhs - r*1.2 from + // rhs - tmp = rhs - (r * 1.2). 
https://godbolt.org/z/GrGE9PE67 + solution->at(0) = rhs_val - r::value * T{1.4}; ASSERT_FALSE(abs_criterion->update().solution(solution).check( RelativeStoppingId, true, &stop_status, &one_changed)); ASSERT_EQ(stop_status.get_data()[0].has_converged(), false); From c7f0d2aaeae800029d785fd4c13484b4aeee1a47 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Fri, 29 Sep 2023 15:57:48 +0200 Subject: [PATCH 34/62] some math func is not defined if nvhpc is for host --- cuda/base/types.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index c45f940815f..c7fe79b5a6f 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -62,6 +62,9 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { +// Required by NVHPC 23.3: isnan is undefined when NVHPC is used only as the +// host compiler. +#ifdef __CUDACC__ // from the cuda_fp16.hpp #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 @@ -94,10 +97,14 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) } +#endif + + namespace kernels { namespace cuda { +#ifdef __CUDACC__ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 #if CUDA_VERSION >= 10020 @@ -129,7 +136,7 @@ __device__ __forceinline__ __half sqrt(const __half& val) #endif - +#endif namespace detail { From 710e037bdc3b2aeb358ac1a30d4f55f6b2c9c9df Mon Sep 17 00:00:00 2001 From: "Yuhsiang M.
Tsai" Date: Sat, 30 Sep 2023 20:33:02 +0200 Subject: [PATCH 35/62] add half spmv benchmark (with cusparse for cuda) --- benchmark/CMakeLists.txt | 33 ++++++++++++++++++--------- benchmark/run_all_benchmarks.sh | 15 ++++++++++--- benchmark/spmv/CMakeLists.txt | 5 +++++ benchmark/spmv/spmv_common.hpp | 4 +++- benchmark/utils/cuda_linops.cpp | 40 ++++++++++++++++++++------------- benchmark/utils/generator.hpp | 5 +---- benchmark/utils/types.hpp | 7 +++++- 7 files changed, 74 insertions(+), 35 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 55ed76d1613..c9c5e0e64f0 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -77,17 +77,25 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE "${macro_def}") ginkgo_benchmark_add_tuning_maybe("${name}") if("${use_lib_linops}") - if (GINKGO_BUILD_CUDA) - target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) - target_link_libraries("${name}" cusparse_linops_${type}) - endif() - if (GINKGO_BUILD_HIP) - target_compile_definitions("${name}" PRIVATE HAS_HIP=1) - target_link_libraries("${name}" hipsparse_linops_${type}) - endif() - if (GINKGO_BUILD_SYCL) - target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) - target_link_libraries("${name}" onemkl_linops_${type}) + if ("${type}" STREQUAL "h") + # only cuda supports half currently + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + else() + if (GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + if (GINKGO_BUILD_HIP) + target_compile_definitions("${name}" PRIVATE HAS_HIP=1) + target_link_libraries("${name}" hipsparse_linops_${type}) + endif() + if (GINKGO_BUILD_SYCL) + target_compile_definitions("${name}" PRIVATE HAS_DPCPP=1) + 
target_link_libraries("${name}" onemkl_linops_${type}) + endif() endif() endif() endfunction(ginkgo_add_single_benchmark_executable) @@ -117,6 +125,9 @@ if (GINKGO_BUILD_CUDA) ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + if (GINKGO_ENABLE_HALF) + ginkgo_benchmark_cusparse_linops(h GKO_BENCHMARK_USE_HALF_PRECISION) + endif() add_library(cuda_timer utils/cuda_timer.cpp) target_link_libraries(cuda_timer PRIVATE ginkgo CUDA::cudart) ginkgo_compile_features(cuda_timer) diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 2a614a87904..0efc0f0b3c2 100755 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -110,6 +110,8 @@ elif [ "${BENCHMARK_PRECISION}" == "dcomplex" ]; then BENCH_SUFFIX="_dcomplex" elif [ "${BENCHMARK_PRECISION}" == "scomplex" ]; then BENCH_SUFFIX="_scomplex" +elif [ "${BENCHMARK_PRECISION}" == "half" ]; then + BENCH_SUFFIX="_half" else echo "BENCHMARK_PRECISION is set to the not supported \"${BENCHMARK_PRECISION}\"." 
1>&2 echo "Currently supported values: \"double\", \"single\", \"dcomplex\" and \"scomplex\"" 1>&2 @@ -216,9 +218,16 @@ keep_latest() { compute_matrix_statistics() { [ "${DRY_RUN}" == "true" ] && return cp "$1" "$1.imd" # make sure we're not loosing the original input - ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ - --backup="$1.bkp" --double_buffer="$1.bkp2" \ - <"$1.imd" 2>&1 >"$1" + if [ "${BENCH_SUFFIX}" == "_half" ]; then + # half precision benchmark still uses single for statistics + ./matrix_statistics/matrix_statistics_single \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + else + ./matrix_statistics/matrix_statistics${BENCH_SUFFIX} \ + --backup="$1.bkp" --double_buffer="$1.bkp2" \ + <"$1.imd" 2>&1 >"$1" + fi keep_latest "$1" "$1.bkp" "$1.bkp2" "$1.imd" } diff --git a/benchmark/spmv/CMakeLists.txt b/benchmark/spmv/CMakeLists.txt index 1e3bab1c884..0165d96a264 100644 --- a/benchmark/spmv/CMakeLists.txt +++ b/benchmark/spmv/CMakeLists.txt @@ -1,4 +1,9 @@ ginkgo_add_typed_benchmark_executables(spmv "YES" spmv.cpp) +# TODO: move to all benchmark +if (GINKGO_ENABLE_HALF) + ginkgo_add_single_benchmark_executable( + "spmv_half" "YES" "GKO_BENCHMARK_USE_HALF_PRECISION" "h" spmv.cpp) +endif() if(GINKGO_BUILD_MPI) add_subdirectory(distributed) endif() diff --git a/benchmark/spmv/spmv_common.hpp b/benchmark/spmv/spmv_common.hpp index 4ac777479b2..8e2b568c976 100644 --- a/benchmark/spmv/spmv_common.hpp +++ b/benchmark/spmv/spmv_common.hpp @@ -118,7 +118,9 @@ struct SpmvBenchmark : Benchmark> { exec->synchronize(); auto max_relative_norm2 = compute_max_relative_norm2(x_clone.get(), state.answer.get()); - format_case["max_relative_norm2"] = max_relative_norm2; + format_case["max_relative_norm2"] = + static_cast::type>( + max_relative_norm2); } IterationControl ic{timer}; diff --git a/benchmark/utils/cuda_linops.cpp b/benchmark/utils/cuda_linops.cpp index 4683d6086e1..bbacec084d1 100644 --- a/benchmark/utils/cuda_linops.cpp +++ 
b/benchmark/utils/cuda_linops.cpp @@ -527,14 +527,19 @@ class CusparseHybrid ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))) +// cuSPARSE does not support 16 bit compute for full 16 bit floating point +// input. Also, the scalar must be the compute type, i.e. float. template -void cusparse_generic_spmv(std::shared_ptr gpu_exec, - const cusparseSpMatDescr_t mat, - const gko::array& scalars, - const gko::LinOp* b, gko::LinOp* x, - cusparseOperation_t trans, cusparseSpMVAlg_t alg) +void cusparse_generic_spmv( + std::shared_ptr gpu_exec, + const cusparseSpMatDescr_t mat, + const gko::array::type>& scalars, + const gko::LinOp* b, gko::LinOp* x, cusparseOperation_t trans, + cusparseSpMVAlg_t alg) { cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); + cudaDataType_t compute_value = gko::kernels::cuda::cuda_data_type< + typename gko::detail::arth_type::type>(); using gko::kernels::cuda::as_culibs_type; auto dense_b = gko::as>(b); auto dense_x = gko::as>(x); @@ -553,13 +558,14 @@ void cusparse_generic_spmv(std::shared_ptr gpu_exec, gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, + mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg, &buffer_size)); gko::array buffer_array(gpu_exec, buffer_size); auto dbuffer = buffer_array.get_data(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV( gpu_exec->get_sparselib_handle(), trans, &scalars.get_const_data()[0], - mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, dbuffer)); + mat, vecb, &scalars.get_const_data()[1], vecx, compute_value, alg, + dbuffer)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecx)); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(vecb)); } @@ -638,8 +644,8 @@ class CusparseGenericCsr protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - 
cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - Alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, Alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -655,9 +661,11 @@ class CusparseGenericCsr {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr csr_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; @@ -730,8 +738,8 @@ class CusparseGenericCoo protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - default_csr_alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, + x, trans_, default_csr_alg); } void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, @@ -746,9 +754,11 @@ class CusparseGenericCoo {} private: + using compute_type = typename gko::detail::arth_type::type; // Contains {alpha, beta} - gko::array scalars{ - this->get_executor(), {gko::one(), gko::zero()}}; + gko::array scalars{ + this->get_executor(), + {gko::one(), gko::zero()}}; std::shared_ptr coo_; cusparseOperation_t trans_; cusparseSpMatDescr_t mat_; diff --git a/benchmark/utils/generator.hpp b/benchmark/utils/generator.hpp index 9ec22a33d1b..b6b40207e7b 100644 --- a/benchmark/utils/generator.hpp +++ b/benchmark/utils/generator.hpp @@ -132,10 +132,7 @@ struct DefaultSystemGenerator { { auto res = Vec::create(exec); res->read(gko::matrix_data( - size, - std::uniform_real_distribution>(-1.0, - 1.0), - get_engine())); + size, std::uniform_real_distribution<>(-1.0, 1.0), get_engine())); return res; } diff --git a/benchmark/utils/types.hpp b/benchmark/utils/types.hpp index eadb8650463..2bfe21bd368 100644 --- a/benchmark/utils/types.hpp +++ b/benchmark/utils/types.hpp @@ -17,7 +17,8 @@ 
using itype = gko::int32; #if defined(GKO_BENCHMARK_USE_DOUBLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_SINGLE_PRECISION) || \ defined(GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) || \ - defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + defined(GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) || \ + defined(GKO_BENCHMARK_USE_HALF_PRECISION) // separate ifdefs to catch duplicate definitions #ifdef GKO_BENCHMARK_USE_DOUBLE_PRECISION using etype = double; @@ -31,6 +32,10 @@ using etype = std::complex; #ifdef GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION using etype = std::complex; #endif +#ifdef GKO_BENCHMARK_USE_HALF_PRECISION +#include +using etype = gko::half; +#endif #else // default to double precision using etype = double; #endif From 34845f3891d657006dc8df486fbd4eb024f97c7b Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 24 Oct 2023 17:20:20 +0200 Subject: [PATCH 36/62] fixes batched support for half --- core/matrix/batch_dense.cpp | 19 +++++++++ core/matrix/batch_ell.cpp | 21 ++++++++++ core/solver/batch_dispatch.hpp | 40 +++++++++++++++++- cuda/solver/batch_bicgstab_kernels.cu | 47 +++++++++++---------- hip/solver/batch_bicgstab_kernels.hip.cpp | 48 +++++++++++----------- include/ginkgo/core/base/types.hpp | 2 + include/ginkgo/core/log/logger.hpp | 11 +++++ include/ginkgo/core/matrix/batch_dense.hpp | 24 +++++++++-- include/ginkgo/core/matrix/batch_ell.hpp | 21 +++++++++- 9 files changed, 178 insertions(+), 55 deletions(-) diff --git a/core/matrix/batch_dense.cpp b/core/matrix/batch_dense.cpp index 6390a4c7ad0..f56e512d41e 100644 --- a/core/matrix/batch_dense.cpp +++ b/core/matrix/batch_dense.cpp @@ -259,6 +259,25 @@ void Dense::move_to(Dense>* result) } +#if GINKGO_ENABLE_HALF +template +void Dense::convert_to( + Dense>>* result) const +{ + result->values_ = this->values_; + result->set_size(this->get_size()); +} + + +template +void Dense::move_to( + Dense>>* result) +{ + this->convert_to(result); +} +#endif + + #define 
GKO_DECLARE_BATCH_DENSE_MATRIX(_type) class Dense<_type> GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_DENSE_MATRIX); diff --git a/core/matrix/batch_ell.cpp b/core/matrix/batch_ell.cpp index 3722c41de60..288d053e219 100644 --- a/core/matrix/batch_ell.cpp +++ b/core/matrix/batch_ell.cpp @@ -283,6 +283,27 @@ void Ell::move_to( } +#if GINKGO_ENABLE_HALF +template +void Ell::convert_to( + Ell>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->num_elems_per_row_ = this->num_elems_per_row_; + result->set_size(this->get_size()); +} + + +template +void Ell::move_to( + Ell>, IndexType>* result) +{ + this->convert_to(result); +} +#endif + + #define GKO_DECLARE_BATCH_ELL_MATRIX(ValueType) class Ell GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_ELL_MATRIX); diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 018a6674df5..7374e4cdec1 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -85,6 +85,23 @@ using DeviceValueType = gko::kernels::hip::hip_type; #include "dpcpp/stop/batch_criteria.hpp" +namespace gko { +namespace kernels { +namespace host { + + +template +inline std::decay_t as_device_type(T val) +{ + return val; +} + + +} // namespace host +} // namespace kernels +} // namespace gko + + namespace gko { namespace batch { namespace solver { @@ -114,6 +131,23 @@ using DeviceValueType = ValueType; #include "reference/stop/batch_criteria.hpp" +namespace gko { +namespace kernels { +namespace host { + + +template +inline std::decay_t as_device_type(T val) +{ + return val; +} + + +} // namespace host +} // namespace kernels +} // namespace gko + + namespace gko { namespace batch { namespace solver { @@ -181,6 +215,7 @@ class batch_solver_dispatch { using value_type = ValueType; using device_value_type = DeviceValueType; using real_type = remove_complex; + using device_real_type = DeviceValueType; batch_solver_dispatch( const 
KernelCaller& kernel_caller, const SettingsType& settings, @@ -270,8 +305,9 @@ class batch_solver_dispatch { { if (logger_type_ == log::detail::log_type::simple_convergence_completion) { - device::batch_log::SimpleFinalLogger logger( - log_data.res_norms.get_data(), log_data.iter_counts.get_data()); + device::batch_log::SimpleFinalLogger logger( + device::as_device_type(log_data.res_norms.get_data()), + log_data.iter_counts.get_data()); dispatch_on_preconditioner(logger, amat, b_item, x_item); } else { GKO_NOT_IMPLEMENTED; diff --git a/cuda/solver/batch_bicgstab_kernels.cu b/cuda/solver/batch_bicgstab_kernels.cu index 8a5eee6b196..6b9755fae39 100644 --- a/cuda/solver/batch_bicgstab_kernels.cu +++ b/cuda/solver/batch_bicgstab_kernels.cu @@ -75,13 +75,13 @@ template using settings = gko::kernels::batch_bicgstab::settings; -template +template class kernel_caller { public: - using value_type = CuValueType; + using cu_value_type = cuda_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} @@ -91,16 +91,17 @@ public: void launch_apply_kernel( const gko::kernels::batch_bicgstab::storage_config& sconf, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, + const cu_value_type* const __restrict__ b_values, + cu_value_type* const __restrict__ x_values, + cu_value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); + as_cuda_type(settings_.residual_tol), + logger, prec, mat, b_values, x_values, + workspace_data); } @@ -108,21 +109,20 @@ public: typename LogType> void call_kernel( LogType logger, const 
BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; - const int shmem_per_blk = - get_max_dynamic_shared_memory(exec_); + const int shmem_per_blk = get_max_dynamic_shared_memory< + StopType, PrecType, LogType, BatchMatrixType, cu_value_type>(exec_); // TODO const int block_size = 256; // get_num_threads_per_block( + // BatchMatrixType, cu_value_type>( // exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -130,18 +130,18 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage( + cu_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cu_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + sconf.gmem_stride_bytes * num_batch_items / sizeof(cu_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cu_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cu_value_type* const workspace_data = workspace.get_data(); // TODO: split compilation // Template parameters launch_apply_kernel exec_; - const settings> settings_; + const settings> settings_; }; @@ -223,9 +223,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/hip/solver/batch_bicgstab_kernels.hip.cpp b/hip/solver/batch_bicgstab_kernels.hip.cpp index 17199d2cd19..a51feeb5b6c 100644 --- a/hip/solver/batch_bicgstab_kernels.hip.cpp +++ b/hip/solver/batch_bicgstab_kernels.hip.cpp @@ -55,13 +55,13 @@ template using settings = gko::kernels::batch_bicgstab::settings; -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} @@ -71,16 +71,17 @@ class kernel_caller { void launch_apply_kernel( const gko::kernels::batch_bicgstab::storage_config& sconf, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, - const size_t& shared_size) const + const hip_value_type* const __restrict__ b_values, + 
hip_value_type* const __restrict__ x_values, + hip_value_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) const { batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); + as_hip_type(settings_.residual_tol), + logger, prec, mat, b_values, x_values, + workspace_data); } @@ -88,10 +89,10 @@ class kernel_caller { typename LogType> void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -109,22 +110,20 @@ class kernel_caller { // Returns amount required in bytes const size_t prec_size = PrecType::dynamic_work_size( padded_num_rows, mat.get_single_item_num_nnz()); - const auto sconf = - gko::kernels::batch_bicgstab::compute_shared_storage( - shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), - b.num_rhs); + const auto sconf = gko::kernels::batch_bicgstab::compute_shared_storage< + PrecType, hip_value_type>(shmem_per_blk, padded_num_rows, + mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; + sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0; GKO_ASSERT(is_stride_aligned); - value_type* const workspace_data = workspace.get_data(); + hip_value_type* const workspace_data = workspace.get_data(); // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. @@ -194,7 +193,7 @@ class kernel_caller { private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -207,9 +206,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index b5f87f1c96f..41a1048ba40 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -564,8 +564,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _macro(std::complex, int32) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ + GKO_ADAPT_HF(_macro(half, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ + GKO_ADAPT_HF(_macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32) #endif diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp index 7a75fe0d111..a2d49b6ec3e 100644 --- a/include/ginkgo/core/log/logger.hpp +++ b/include/ginkgo/core/log/logger.hpp @@ -580,6 +580,17 
@@ public: \ const array& iters, const array& residual_norms) const {} + /** + * Batch solver's event that records the iteration count and the residual + * norm. + * + * @param iters the array of iteration counts. + * @param residual_norms the array storing the residual norms. + */ + virtual void on_batch_solver_completed( + const array& iters, const array& residual_norms) const + {} + public: #undef GKO_LOGGER_REGISTER_EVENT diff --git a/include/ginkgo/core/matrix/batch_dense.hpp b/include/ginkgo/core/matrix/batch_dense.hpp index 5ea7c3ee128..ad4db6d0a84 100644 --- a/include/ginkgo/core/matrix/batch_dense.hpp +++ b/include/ginkgo/core/matrix/batch_dense.hpp @@ -45,11 +45,15 @@ namespace matrix { * @ingroup BatchLinOp */ template -class Dense final : public EnableBatchLinOp>, - public ConvertibleTo>> { +class Dense final + : public EnableBatchLinOp>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo>>>, +#endif + public ConvertibleTo>> { friend class EnablePolymorphicObject; friend class Dense>; - friend class Dense>; + friend class Dense>; public: using EnableBatchLinOp::convert_to; @@ -66,6 +70,20 @@ class Dense final : public EnableBatchLinOp>, void move_to(Dense>* result) override; +#if GINKGO_ENABLE_HALF + friend class Dense>>; + using ConvertibleTo< + Dense>>>::convert_to; + using ConvertibleTo< + Dense>>>::move_to; + + void convert_to(Dense>>* result) + const override; + + void move_to( + Dense>>* result) override; +#endif + /** * Creates a mutable view (of gko::matrix::Dense type) of one item of the * batch::matrix::Dense object. 
Does not perform any deep diff --git a/include/ginkgo/core/matrix/batch_ell.hpp b/include/ginkgo/core/matrix/batch_ell.hpp index b760cee795a..c52da8f8f9d 100644 --- a/include/ginkgo/core/matrix/batch_ell.hpp +++ b/include/ginkgo/core/matrix/batch_ell.hpp @@ -51,10 +51,14 @@ namespace matrix { template class Ell final : public EnableBatchLinOp>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Ell>, IndexType>>, +#endif public ConvertibleTo, IndexType>> { friend class EnablePolymorphicObject; friend class Ell, IndexType>; - friend class Ell, IndexType>; + friend class Ell, IndexType>; static_assert(std::is_same::value, "IndexType must be a 32 bit integer"); @@ -73,6 +77,21 @@ class Ell final void move_to(Ell, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Ell>, + IndexType>; + using ConvertibleTo< + Ell>, IndexType>>::convert_to; + using ConvertibleTo< + Ell>, IndexType>>::move_to; + + void convert_to(Ell>, IndexType>* + result) const override; + + void move_to(Ell>, IndexType>* + result) override; +#endif + /** * Creates a mutable view (of matrix::Ell type) of one item of the * batch::matrix::Ell object. 
Does not perform any deep From 48afbb59eebed4b95d290a9561ec9f87320699e9 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Fri, 3 Nov 2023 18:13:00 +0100 Subject: [PATCH 37/62] generate PTX load/stores for half --- .../cuda_hip/components/memory.nvidia.hpp.inc | 243 ++++++++++++++++++ dev_tools/scripts/generate_cuda_memory_ptx.py | 96 +++++++ 2 files changed, 339 insertions(+) diff --git a/common/cuda_hip/components/memory.nvidia.hpp.inc b/common/cuda_hip/components/memory.nvidia.hpp.inc index a695904e82a..a284bdab708 100644 --- a/common/cuda_hip/components/memory.nvidia.hpp.inc +++ b/common/cuda_hip/components/memory.nvidia.hpp.inc @@ -1031,3 +1031,246 @@ __device__ __forceinline__ void store_relaxed(thrust::complex* ptr, "d"(real_result), "d"(imag_result) : "memory"); } + + +__device__ __forceinline__ __half load_relaxed_shared(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.shared.b16 t, [%1];\n\t" +#else + " ld.relaxed.cta.shared.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) + : "memory"); + + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_relaxed_shared(__half* ptr, __half result) +{ + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.shared.b16 [%0], t;\n\t" +#else + " st.relaxed.cta.shared.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_acquire_shared(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.shared.b16 t, [%1];\n\t" +#else + " ld.acquire.cta.shared.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : 
"r"(convert_generic_ptr_to_smem_ptr(const_cast<__half*>(ptr))) + : "memory"); + membar_acq_rel_shared(); + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_release_shared(__half* ptr, __half result) +{ + membar_acq_rel_shared(); + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.shared.b16 [%0], t;\n\t" +#else + " st.release.cta.shared.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_relaxed(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.relaxed.gpu.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_relaxed(__half* ptr, __half result) +{ + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.relaxed.gpu.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ __half load_acquire(const __half* ptr) +{ + float result; + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + " ld.volatile.b16 t, [%1];\n\t" +#else + " ld.acquire.gpu.b16 t, [%1];\n\t" +#endif + " cvt.f32.f16 %0, t;\n\t" + "}" + : "=f"(result) + : "l"(const_cast<__half*>(ptr)) + : "memory"); + membar_acq_rel(); + return static_cast<__half>(result); +} + + +__device__ __forceinline__ void store_release(__half* ptr, __half result) +{ + membar_acq_rel(); + asm volatile( + "{\n\t" + " .reg .f16 t;\n\t" + " cvt.rn.f16.f32 t, %1;\n\t" +#if __CUDA_ARCH__ < 700 + " st.volatile.b16 [%0], t;\n\t" +#else + " st.release.gpu.b16 [%0], t;\n\t" +#endif + "}" 
::"l"(ptr), + "f"(static_cast(result)) + : "memory"); +} + + +__device__ __forceinline__ thrust::complex<__half> load_relaxed_shared( + const thrust::complex<__half>* ptr) +{ + float real_result; + float imag_result; + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile.shared.v2.b16 {%0, %1}, [%2];\n\t" +#else + "ld.relaxed.cta.shared.v2.b16 {%0, %1}, [%2];\n\t" +#endif + " cvt.f32.f16 %0, t.x;\n\t" + " cvt.f32.f16 %1, t.y;\n\t" + "}" + : "=f"(real_result), "=f"(imag_result) + : "r"(convert_generic_ptr_to_smem_ptr( + const_cast*>(ptr))) + : "memory"); + return thrust::complex<__half>{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed_shared( + thrust::complex<__half>* ptr, thrust::complex<__half> result) +{ + auto real_result = static_cast(result.real()); + auto imag_result = static_cast(result.imag()); + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" + " cvt.rn.f16.f32 t.x, %1;\n\t" + " cvt.rn.f16.f32 t.y, %2;\n\t" +#if __CUDA_ARCH__ < 700 + "st.volatile.shared.v2.b16 [%0], t;\n\t" +#else + "st.relaxed.cta.shared.v2.b16 [%0], t;\n\t" +#endif + "}" ::"r"(convert_generic_ptr_to_smem_ptr(ptr)), + "f"(real_result), "f"(imag_result) + : "memory"); +} + + +__device__ __forceinline__ thrust::complex<__half> load_relaxed( + const thrust::complex<__half>* ptr) +{ + float real_result; + float imag_result; + asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile.v2.b16 {%0, %1}, [%2];\n\t" +#else + "ld.relaxed.gpu.v2.b16 {%0, %1}, [%2];\n\t" +#endif + " cvt.f32.f16 %0, t.x;\n\t" + " cvt.f32.f16 %1, t.y;\n\t" + "}" + : "=f"(real_result), "=f"(imag_result) + : "l"(const_cast*>(ptr)) + : "memory"); + return thrust::complex<__half>{real_result, imag_result}; +} + + +__device__ __forceinline__ void store_relaxed(thrust::complex<__half>* ptr, + thrust::complex<__half> result) +{ + auto real_result = static_cast(result.real()); + auto imag_result = static_cast(result.imag()); 
+ asm volatile( + "{\n\t" + " .reg .v2 .f16 t;\n\t" + " cvt.rn.f16.f32 t.x, %1;\n\t" + " cvt.rn.f16.f32 t.y, %2;\n\t" +#if __CUDA_ARCH__ < 700 + "st.volatile.v2.b16 [%0], t;\n\t" +#else + "st.relaxed.gpu.v2.b16 [%0], t;\n\t" +#endif + "}" ::"l"(ptr), + "f"(real_result), "f"(imag_result) + : "memory"); +} diff --git a/dev_tools/scripts/generate_cuda_memory_ptx.py b/dev_tools/scripts/generate_cuda_memory_ptx.py index 49f99d4d96f..a7ed2670819 100755 --- a/dev_tools/scripts/generate_cuda_memory_ptx.py +++ b/dev_tools/scripts/generate_cuda_memory_ptx.py @@ -191,3 +191,99 @@ class type_desc: : "memory"); }} """) + +# since there are no constraints for f16 register an intermediate conversion needs to happen +t = type_desc(ptx_type_suffix='.f16', val_constraint='f', name='__half') +t.parent_name = "float" +t.ptx_parent_type_suffix = '.f32' +t.ptx_mem_type_suffix = '.b16' +for s in memory_spaces: + for o in memory_orderings: + membar_expression = "" if o.is_relaxed else f"membar_acq_rel{s.fn_suffix}();" + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast<{t.name}*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") + print(f""" +__device__ __forceinline__ {t.name} load{o.fn_load_suffix}{s.fn_suffix}(const {t.name}* ptr) +{{ + {t.parent_name} result; + asm volatile("{{\\n\\t" + " .reg {t.ptx_type_suffix} t;\\n\\t" + #if __CUDA_ARCH__ < 700 + " ld.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + #else + " ld{o.ptx_load_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} t, [%1];\\n\\t" + #endif + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t;\\n\\t" + "}}" + : "={t.val_constraint}"(result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); + {membar_expression} + return static_cast<{t.name}>(result); +}} + + +__device__ __forceinline__ void store{o.fn_store_suffix}{s.fn_suffix}({t.name}* ptr, {t.name} result) +{{ + {membar_expression} + asm volatile("{{\\n\\t" + " .reg {t.ptx_type_suffix} t;\\n\\t" + " 
cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t, %1;\\n\\t" + #if __CUDA_ARCH__ < 700 + " st.volatile{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + #else + " st{o.ptx_store_suffix}{s.ptx_scope_suffix}{s.ptx_space_suffix}{t.ptx_mem_type_suffix} [%0], t;\\n\\t" + #endif + "}}" + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(static_cast<{t.parent_name}>(result)) + : "memory"); +}} +""") + +for s in memory_spaces: + o = ordering(ptx_load_suffix=".relaxed", fn_load_suffix="_relaxed", + ptx_store_suffix=".relaxed", fn_store_suffix="_relaxed", is_relaxed=True) + const_ptr_expr = s.ptr_expr.format( + ptr=f"const_cast*>(ptr)") + mut_ptr_expr = s.ptr_expr.format(ptr="ptr") + print(f""" +__device__ __forceinline__ thrust::complex<{t.name}> load_relaxed{s.fn_suffix}(const thrust::complex<{t.name}>* ptr) +{{ + {t.parent_name} real_result; + {t.parent_name} imag_result; + asm volatile("{{\\n\\t" + " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" +#if __CUDA_ARCH__ < 700 + "ld.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{%0, %1}}, [%2];\\n\\t" +#else + "ld.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} {{%0, %1}}, [%2];\\n\\t" +#endif + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %0, t.x;\\n\\t" + " cvt{t.ptx_parent_type_suffix}{t.ptx_type_suffix} %1, t.y;\\n\\t" + "}}" + : "={t.val_constraint}"(real_result), "={t.val_constraint}"(imag_result) + : "{s.ptr_constraint}"({const_ptr_expr}) + : "memory"); + return thrust::complex<{t.name}>{{real_result, imag_result}}; +}} + + +__device__ __forceinline__ void store_relaxed{s.fn_suffix}(thrust::complex<{t.name}>* ptr, thrust::complex<{t.name}> result) +{{ + auto real_result = static_cast<{t.parent_name}>(result.real()); + auto imag_result = static_cast<{t.parent_name}>(result.imag()); + asm volatile("{{\\n\\t" + " .reg .v2 {t.ptx_type_suffix} t;\\n\\t" + " cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.x, %1;\\n\\t" + " 
cvt.rn{t.ptx_type_suffix}{t.ptx_parent_type_suffix} t.y, %2;\\n\\t" +#if __CUDA_ARCH__ < 700 + "st.volatile{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" +#else + "st.relaxed{s.ptx_scope_suffix}{s.ptx_space_suffix}.v2{t.ptx_mem_type_suffix} [%0], t;\\n\\t" +#endif + "}}" + :: "{s.ptr_constraint}"({mut_ptr_expr}), "{t.val_constraint}"(real_result), "{t.val_constraint}"(imag_result) + : "memory"); +}} +""") From a51f1365b5173a862c106d45a8fc9a88e077754d Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 12 Dec 2023 15:30:04 +0000 Subject: [PATCH 38/62] fix mc64 for half Note: the issue is that numerical_limits::infinite returns float instead of half. Maybe changing that would be a better solution --- core/reorder/mc64.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp index 97dd37b90fc..aed164f7bdf 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -50,7 +50,7 @@ void initialize_weights(const matrix::Csr* host_mtx, for (IndexType row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; - auto row_max = -inf; + auto row_max = static_cast>(-inf); for (IndexType idx = row_begin; idx < row_end; idx++) { const auto weight = calculate_weight(values[idx]); weights[idx] = weight; @@ -67,11 +67,13 @@ void initialize_weights(const matrix::Csr* host_mtx, } } }; - if (strategy == - gko::experimental::reorder::mc64_strategy::max_diagonal_sum) { - run_computation([](ValueType a) { return abs(a); }); + if (strategy == mc64_strategy::max_diagonal_sum) { + run_computation( + [](ValueType a) -> remove_complex { return abs(a); }); } else { - run_computation([](ValueType a) { return std::log2(abs(a)); }); + run_computation([](ValueType a) -> remove_complex { + return std::log2(abs(a)); + }); } } From 60123dc486ca2981f18e4cfe3ecf1e26b6e9a9ec Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 19 Dec 2023 08:15:20 +0000 
Subject: [PATCH 39/62] fix hip memory.hip.hpp for half --- hip/components/memory.hip.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hip/components/memory.hip.hpp b/hip/components/memory.hip.hpp index d8238c11795..cba8dd3c29d 100644 --- a/hip/components/memory.hip.hpp +++ b/hip/components/memory.hip.hpp @@ -60,6 +60,12 @@ struct gcc_atomic_intrinsic_type_map { }; +template <> +struct gcc_atomic_intrinsic_type_map<__half> { + using type = int16; +}; + + #if HIP_VERSION >= 50100000 // These intrinsics can be found used in clang/test/SemaCUDA/atomic-ops.cu // in the LLVM source code From 8f1e28fb7fcde8867e018a19b5e4d04f316a907b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Sat, 20 Apr 2024 23:59:22 +0200 Subject: [PATCH 40/62] WIP: can compile but three tests are still failed --- core/matrix/batch_csr.cpp | 21 +++++++++++ core/matrix/permutation.cpp | 7 ++-- core/test/utils/batch_helpers.hpp | 2 +- include/ginkgo/core/base/half.hpp | 36 +++---------------- include/ginkgo/core/matrix/batch_csr.hpp | 21 ++++++++++- reference/test/factorization/lu_kernels.cpp | 2 +- reference/test/matrix/scaled_permutation.cpp | 3 +- reference/test/reorder/mc64_kernels.cpp | 2 +- reference/test/stop/residual_norm_kernels.cpp | 6 ++-- test/stop/residual_norm_kernels.cpp | 6 ++-- 10 files changed, 62 insertions(+), 44 deletions(-) diff --git a/core/matrix/batch_csr.cpp b/core/matrix/batch_csr.cpp index 1b1dc22a6c4..50ccc0a13d8 100644 --- a/core/matrix/batch_csr.cpp +++ b/core/matrix/batch_csr.cpp @@ -263,6 +263,27 @@ void Csr::move_to( } +#if GINKGO_ENABLE_HALF +template +void Csr::convert_to( + Csr>, IndexType>* result) const +{ + result->values_ = this->values_; + result->col_idxs_ = this->col_idxs_; + result->row_ptrs_ = this->row_ptrs_; + result->set_size(this->get_size()); +} + + +template +void Csr::move_to( + Csr>, IndexType>* result) +{ + this->convert_to(result); +} +#endif + + #define GKO_DECLARE_BATCH_CSR_MATRIX(ValueType) class Csr 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_CSR_MATRIX); diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 0fe7ba2b2ce..127a1edfd63 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -267,8 +267,11 @@ void dispatch_dense(const LinOp* op, Functor fn) { using matrix::Dense; using std::complex; - run, std::complex>(op, - fn); + run, +#endif + double, float, std::complex, std::complex>(op, fn); } diff --git a/core/test/utils/batch_helpers.hpp b/core/test/utils/batch_helpers.hpp index eff6626de31..efff46cfbaa 100644 --- a/core/test/utils/batch_helpers.hpp +++ b/core/test/utils/batch_helpers.hpp @@ -136,7 +136,7 @@ std::unique_ptr generate_diag_dominant_batch_matrix( static_cast(num_cols)}, {}}; auto engine = std::default_random_engine(42); - auto rand_diag_dist = std::normal_distribution(20.0, 1.0); + auto rand_diag_dist = std::normal_distribution<>(20.0, 1.0); for (int row = 0; row < num_rows; ++row) { std::uniform_int_distribution rand_nnz_dist{1, row + 1}; const auto k = rand_nnz_dist(engine); diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 8df7b14fec9..f388ca7fcad 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -1,34 +1,6 @@ -/************************************************************* -Copyright (c) 2017-2023, the Ginkgo authors -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions -are met: - -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the -documentation and/or other materials provided with the distribution. - -3. 
Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*************************************************************/ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause #ifndef GKO_PUBLIC_CORE_BASE_HALF_HPP_ #define GKO_PUBLIC_CORE_BASE_HALF_HPP_ @@ -326,7 +298,7 @@ struct precision_converter { */ class half { public: - // TODO: NVHPC (host side) may not use zero initialzation for the data + // TODO: NVHPC (host side) may not use zero initialization for the data // member by default constructor in some cases. Not sure whether it is // caused by something else in jacobi or isai. 
GKO_ATTRIBUTES half() noexcept : data_(0){}; diff --git a/include/ginkgo/core/matrix/batch_csr.hpp b/include/ginkgo/core/matrix/batch_csr.hpp index e431454063d..766ad1facb1 100644 --- a/include/ginkgo/core/matrix/batch_csr.hpp +++ b/include/ginkgo/core/matrix/batch_csr.hpp @@ -46,10 +46,14 @@ namespace matrix { template class Csr final : public EnableBatchLinOp>, +#if GINKGO_ENABLE_HALF + public ConvertibleTo< + Csr>, IndexType>>, +#endif public ConvertibleTo, IndexType>> { friend class EnablePolymorphicObject; friend class Csr, IndexType>; - friend class Csr, IndexType>; + friend class Csr, IndexType>; static_assert(std::is_same::value, "IndexType must be a 32 bit integer"); @@ -68,6 +72,21 @@ class Csr final void move_to(Csr, IndexType>* result) override; +#if GINKGO_ENABLE_HALF + friend class Csr>, + IndexType>; + using ConvertibleTo< + Csr>, IndexType>>::convert_to; + using ConvertibleTo< + Csr>, IndexType>>::move_to; + + void convert_to(Csr>, IndexType>* + result) const override; + + void move_to(Csr>, IndexType>* + result) override; +#endif + /** * Creates a mutable view (of matrix::Csr type) of one item of the * batch::matrix::Csr object. 
Does not perform any deep diff --git a/reference/test/factorization/lu_kernels.cpp b/reference/test/factorization/lu_kernels.cpp index 0c31a37ed45..2e48b440dbe 100644 --- a/reference/test/factorization/lu_kernels.cpp +++ b/reference/test/factorization/lu_kernels.cpp @@ -268,7 +268,7 @@ TYPED_TEST(Lu, FactorizeNonsymmetricWorks) GKO_ASSERT_MTX_EQ_SPARSITY(lu->get_combined(), this->mtx_lu); GKO_ASSERT_MTX_NEAR(lu->get_combined(), this->mtx_lu, - 15 * r::value); + 30 * r::value); ASSERT_EQ(lu->get_storage_type(), gko::experimental::factorization::storage_type::combined_lu); ASSERT_EQ(lu->get_lower_factor(), nullptr); diff --git a/reference/test/matrix/scaled_permutation.cpp b/reference/test/matrix/scaled_permutation.cpp index ba65705bf29..6d8d49f5662 100644 --- a/reference/test/matrix/scaled_permutation.cpp +++ b/reference/test/matrix/scaled_permutation.cpp @@ -145,8 +145,7 @@ TYPED_TEST(ScaledPermutation, CombineWithInverse) using index_type = typename TestFixture::index_type; const gko::size_type size = 20; auto rng = std::default_random_engine{3754}; - auto dist = std::uniform_real_distribution>{ - 1.0, 2.0}; + auto dist = std::uniform_real_distribution<>{1.0, 2.0}; auto perm = gko::matrix::ScaledPermutation::create( this->exec, size); std::iota(perm->get_permutation(), perm->get_permutation() + size, 0); diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 15f90839e1b..e61994c42fe 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -118,7 +118,7 @@ class Mc64 : public ::testing::Test { 0., 2., 3., 0.}}, final_dual_u{ref, I{0., 1., -1., -2., 0., 0.}}, final_distance{ref, I{inf(), inf(), 1., 0., inf(), 1.}}, - zero_tol{1e-14} + zero_tol{1e-4} {} std::pair, diff --git a/reference/test/stop/residual_norm_kernels.cpp b/reference/test/stop/residual_norm_kernels.cpp index 443bd8afef8..e7eef0565d2 100644 --- a/reference/test/stop/residual_norm_kernels.cpp +++ 
b/reference/test/stop/residual_norm_kernels.cpp @@ -85,7 +85,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -838,7 +839,8 @@ TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ImplicitResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp index a0a144bcf3b..93df4d69b72 100644 --- a/test/stop/residual_norm_kernels.cpp +++ b/test/stop/residual_norm_kernels.cpp @@ -96,7 +96,8 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) @@ -558,7 +559,8 @@ TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges) for (auto baseline : {mode::rhs_norm, mode::initial_resnorm, mode::absolute}) { gko::remove_complex factor = - (baseline == mode::absolute) ? 0.0 : r::value; + (baseline == mode::absolute) ? gko::zero>() + : r::value; auto criterion = gko::stop::ImplicitResidualNorm::build() .with_reduction_factor(factor) .with_baseline(baseline) From 6dbd616f6b7f4523af7b621216cd9fc9ed85eccd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 3 Jul 2024 15:29:37 +0200 Subject: [PATCH 41/62] fix config, ambiguous namespace, and batch --- core/config/config_helper.hpp | 4 ++- core/test/utils.hpp | 6 ++-- cuda/solver/batch_cg_kernels.cu | 48 ++++++++++++++--------------- hip/solver/batch_cg_kernels.hip.cpp | 40 ++++++++++++------------ 4 files changed, 50 insertions(+), 48 deletions(-) diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp index 555bb75c2a8..1b3a8b676c5 100644 --- a/core/config/config_helper.hpp +++ b/core/config/config_helper.hpp @@ -200,7 +200,9 @@ get_value(const pnode& config) * This is specialization for floating point type */ template -inline std::enable_if_t::value, ValueType> +inline std::enable_if_t::value || + std::is_same::value, + ValueType> get_value(const pnode& config) { auto val = config.get_real(); diff --git a/core/test/utils.hpp b/core/test/utils.hpp index f7683bcbd28..1be20f44ce8 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -468,7 +468,7 @@ struct TupleTypenameNameGenerator { }; -namespace detail { +namespace temporary_test { // singly linked list of all our supported precisions @@ -497,10 +497,10 @@ struct next_precision_impl> { }; -} // namespace detail +} // namespace temporary_test template -using next_precision = typename detail::next_precision_impl::type; +using next_precision = typename temporary_test::next_precision_impl::type; #define SKIP_IF_HALF(type) \ diff --git a/cuda/solver/batch_cg_kernels.cu b/cuda/solver/batch_cg_kernels.cu index 32e66d7ee54..5f6fde93f96 100644 --- a/cuda/solver/batch_cg_kernels.cu +++ b/cuda/solver/batch_cg_kernels.cu @@ -75,13 +75,14 @@ template using settings = gko::kernels::batch_cg::settings; -template +template class kernel_caller { public: - using value_type = CuValueType; + using cu_value_type = cuda_type; + ; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{std::move(exec)}, settings_{settings} {} @@ -91,36 +92,36 @@ 
public: void launch_apply_kernel( const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ workspace_data, const int& block_size, + const cu_value_type* const __restrict__ b_values, + cu_value_type* const __restrict__ x_values, + cu_value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const { batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); + as_cuda_type(settings_.residual_tol), + logger, prec, mat, b_values, x_values, + workspace_data); } template void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = ceildiv(mat.num_rows, align_multiple) * align_multiple; - const int shmem_per_blk = - get_max_dynamic_shared_memory(exec_); + const int shmem_per_blk = get_max_dynamic_shared_memory< + StopType, PrecType, LogType, BatchMatrixType, cu_value_type>(exec_); const int block_size = get_num_threads_per_block( + BatchMatrixType, cu_value_type>( exec_, mat.num_rows); GKO_ASSERT(block_size >= 2 * config::warp_size); @@ -128,18 +129,18 @@ public: padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + cu_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * 
padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(cu_value_type) + (sconf.prec_shared ? prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); - GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(value_type) == 0); + sconf.gmem_stride_bytes * num_batch_items / sizeof(cu_value_type)); + GKO_ASSERT(sconf.gmem_stride_bytes % sizeof(cu_value_type) == 0); - value_type* const workspace_data = workspace.get_data(); + cu_value_type* const workspace_data = workspace.get_data(); // TODO: split compilation // Only instantiate when full optimizations has been enabled. Otherwise, @@ -190,7 +191,7 @@ public: private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -203,9 +204,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using cu_value_type = cuda_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index 6d5e3bff3b3..b6b10c5f35b 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -54,13 +54,13 @@ template using settings = gko::kernels::batch_cg::settings; -template +template class kernel_caller { public: - using value_type = HipValueType; + using hip_value_type = hip_type; kernel_caller(std::shared_ptr exec, - const settings> settings) + const settings> settings) : exec_{exec}, settings_{settings} {} @@ -71,15 +71,16 @@ class kernel_caller { const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, PrecType& prec, const BatchMatrixType& mat, const value_type* const __restrict__ b_values, - value_type* const __restrict__ x_values, - value_type* const __restrict__ 
workspace_data, const int& block_size, - const size_t& shared_size) const + hip_value_type* const __restrict__ x_values, + hip_value_type* const __restrict__ workspace_data, + const int& block_size, const size_t& shared_size) const { batch_single_kernels::apply_kernel <<get_stream()>>>(sconf, settings_.max_iterations, - settings_.residual_tol, logger, prec, mat, - b_values, x_values, workspace_data); + as_hip_type(settings_.residual_tol), + logger, prec, mat, b_values, x_values, + workspace_data); } @@ -87,10 +88,10 @@ class kernel_caller { typename LogType> void call_kernel( LogType logger, const BatchMatrixType& mat, PrecType prec, - const gko::batch::multi_vector::uniform_batch& b, - const gko::batch::multi_vector::uniform_batch& x) const + const gko::batch::multi_vector::uniform_batch& b, + const gko::batch::multi_vector::uniform_batch& x) const { - using real_type = gko::remove_complex; + using real_type = gko::remove_complex; const size_type num_batch_items = mat.num_batch_items; constexpr int align_multiple = 8; const int padded_num_rows = @@ -110,20 +111,20 @@ class kernel_caller { padded_num_rows, mat.get_single_item_num_nnz()); const auto sconf = gko::kernels::batch_cg::compute_shared_storage( + hip_value_type>( shmem_per_blk, padded_num_rows, mat.get_single_item_num_nnz(), b.num_rhs); const size_t shared_size = - sconf.n_shared * padded_num_rows * sizeof(value_type) + + sconf.n_shared * padded_num_rows * sizeof(hip_value_type) + (sconf.prec_shared ? 
prec_size : 0); - auto workspace = gko::array( + auto workspace = gko::array( exec_, - sconf.gmem_stride_bytes * num_batch_items / sizeof(value_type)); + sconf.gmem_stride_bytes * num_batch_items / sizeof(hip_value_type)); bool is_stride_aligned = - sconf.gmem_stride_bytes % sizeof(value_type) == 0; + sconf.gmem_stride_bytes % sizeof(hip_value_type) == 0; GKO_ASSERT(is_stride_aligned); - value_type* const workspace_data = workspace.get_data(); + hip_value_type* const workspace_data = workspace.get_data(); // Only instantiate when full optimizations has been enabled. Otherwise, // just use the default one with no shared memory. @@ -173,7 +174,7 @@ class kernel_caller { private: std::shared_ptr exec_; - const settings> settings_; + const settings> settings_; }; @@ -186,9 +187,8 @@ void apply(std::shared_ptr exec, batch::MultiVector* const x, batch::log::detail::log_data>& logdata) { - using hip_value_type = hip_type; auto dispatcher = batch::solver::create_dispatcher( - kernel_caller(exec, settings), settings, mat, precon); + kernel_caller(exec, settings), settings, mat, precon); dispatcher.apply(b, x, logdata); } From cd270e1c4f28baebd628376a50232e7e7efda5d6 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 3 Jul 2024 16:01:14 +0200 Subject: [PATCH 42/62] update format --- core/matrix/dense.cpp | 3 ++- core/matrix/dense_kernels.hpp | 1 - core/test/base/extended_float.cpp | 1 - dpcpp/matrix/csr_kernels.dp.cpp | 29 ++++++++++++++--------------- include/ginkgo/core/base/half.hpp | 1 - include/ginkgo/core/base/types.hpp | 1 - 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 3bc94f04011..c9872a94b05 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -1545,7 +1545,8 @@ template void gather_mixed_real_complex(Function fn, LinOp* out) { #ifdef GINKGO_MIXED_PRECISION - run, next_precision>>(out, fn); + run, + next_precision>>(out, fn); #else precision_dispatch(fn, out); #endif diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index 95b1ca754f5..d785fc4a45e 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -8,7 +8,6 @@ #include - #include #include #include diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 1828b0c027b..7bc1d312ac2 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -10,7 +10,6 @@ #include - #include namespace { diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 61ff76325c5..468360bd72a 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -664,13 +664,13 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv( - num_rows, val, col_idxs, row_ptrs, b, c, item_ct1); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + abstract_classical_spmv( + num_rows, val, col_idxs, row_ptrs, b, + c, 
item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { @@ -718,14 +718,13 @@ void abstract_classical_spmv( { if (subgroup_size > 1) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, beta, c, - item_ct1); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, + row_ptrs, b, beta, c, item_ct1); + }); }); } else { queue->submit([&](sycl::handler& cgh) { diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index f388ca7fcad..4be38838091 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -9,7 +9,6 @@ #include #include - #include #include diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 41a1048ba40..317471d8c93 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -17,7 +17,6 @@ #include #include - #include From 69d5b5949c19b37dd2889e6f1783987a9788d67c Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 17 Sep 2024 23:51:49 +0200 Subject: [PATCH 43/62] check the failed tests - use Csr in residual norm for half apply support - use higher tolerance for mc64 due to half range - some example can not finish in half precision for mc64 - skip some test in half due to half range - fix the half limit value Co-authored-by: Marcel Koch --- core/reorder/mc64.cpp | 15 +++++++------ include/ginkgo/core/base/half.hpp | 12 ++++++++-- reference/test/reorder/mc64.cpp | 3 ++- reference/test/reorder/mc64_kernels.cpp | 7 +++++- .../test/solver/batch_bicgstab_kernels.cpp | 22 ++++++++++++------- reference/test/solver/batch_cg_kernels.cpp | 13 ++++++++--- test/stop/residual_norm_kernels.cpp | 13 ++++++++--- 7 files changed, 60 insertions(+), 25 deletions(-) diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp index aed164f7bdf..4aa53fcde86 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -37,8 +37,8 @@ void initialize_weights(const matrix::Csr* host_mtx, array>& row_maxima_array, gko::experimental::reorder::mc64_strategy strategy) { - constexpr auto inf = - std::numeric_limits>::infinity(); + auto inf = static_cast>( + std::numeric_limits>::infinity()); const auto num_rows = host_mtx->get_size()[0]; const auto row_ptrs = host_mtx->get_const_row_ptrs(); const auto col_idxs = host_mtx->get_const_col_idxs(); @@ -181,7 +181,8 @@ void shortest_augmenting_path( addressable_priority_queue& queue, std::vector& q_j, ValueType tolerance) { - constexpr auto inf = std::numeric_limits::infinity(); + auto inf = + static_cast(std::numeric_limits::infinity()); auto weights = weights_array.get_data(); auto dual_u = dual_u_array.get_data(); auto distance = distance_array.get_data(); @@ -435,8 +436,8 @@ void compute_scaling(const matrix::Csr* host_mtx, mc64_strategy strategy, ValueType* row_scaling, ValueType* col_scaling) { - constexpr auto inf = - std::numeric_limits>::infinity(); + auto inf = static_cast>( + std::numeric_limits>::infinity()); const auto num_rows 
= host_mtx->get_size()[0]; const auto weights = weights_array.get_const_data(); const auto dual_u = dual_u_array.get_const_data(); @@ -540,8 +541,8 @@ std::unique_ptr Mc64::generate_impl( marked_cols.fill(0); matched_idxs.fill(0); unmatched_rows.fill(0); - constexpr auto inf = - std::numeric_limits>::infinity(); + auto inf = static_cast>( + std::numeric_limits>::infinity()); dual_u.fill(inf); distance.fill(inf); diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 4be38838091..1bb76be7741 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -648,9 +648,17 @@ struct numeric_limits { return numeric_limits::infinity(); } - static constexpr float min() { return numeric_limits::min(); } + static constexpr float min() { return 1.0f / (1ll << 14); } - static constexpr float max() { return numeric_limits::max(); } + // The maximal exponent is 15, and the maximal significant is + // 1 + (2^-10 - 1) / 2^-10 + static constexpr float max() + { + return (1ll << 15) * + (1.0f + static_cast((1ll << 10) - 1) / (1ll << 10)); + } + + static constexpr float lowest() { return -max(); }; static constexpr float quiet_NaN() { diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp index 2c64538e9b2..8b5c8d4aa38 100644 --- a/reference/test/reorder/mc64.cpp +++ b/reference/test/reorder/mc64.cpp @@ -91,6 +91,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy) reorder_type::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) + .with_tolerance(1e-4) .on(this->exec) ->generate(this->not_id3_mtx); @@ -123,7 +124,7 @@ TYPED_TEST(Mc64, CanBeCreatedWithTolerance) using real_type = typename TestFixture::real_type; auto mc64 = reorder_type::build() - .with_tolerance(real_type{1e-10}) + .with_tolerance(real_type{1e-4}) .on(this->exec) ->generate(this->id3_mtx); diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 
e61994c42fe..937fe87daa4 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -284,6 +284,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) + .with_tolerance(1e-4) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -307,6 +308,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) + .with_tolerance(1e-4) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -354,6 +356,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) + .with_tolerance(1e-4) .on(this->ref); auto mc64 = mc64_factory->generate(mtx); // get components @@ -362,7 +365,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) mtx = mtx->scale_permute(row_perm, col_perm); - GKO_ASSERT_MTX_NEAR(mtx, expected_result, r::value); + GKO_ASSERT_MTX_NEAR(mtx, expected_result, 20 * r::value); } @@ -373,6 +376,8 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeExampleProduct) using value_type = typename TestFixture::value_type; using matrix_type = typename TestFixture::matrix_type; using perm_type = typename TestFixture::perm_type; + // this example can not be finished in half precision + SKIP_IF_HALF(value_type); // read input data std::ifstream mtx_stream{gko::matrices::location_nontrivial_mc64_example}; auto mtx = gko::share(gko::read(mtx_stream, this->ref)); diff --git a/reference/test/solver/batch_bicgstab_kernels.cpp b/reference/test/solver/batch_bicgstab_kernels.cpp index ddb6d09e12a..62e4e9cf6fe 100644 --- a/reference/test/solver/batch_bicgstab_kernels.cpp +++ 
b/reference/test/solver/batch_bicgstab_kernels.cpp @@ -105,8 +105,13 @@ TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsResidual) ASSERT_LE( res_log_array[i] / this->linear_system.host_rhs_norm->at(i, 0, 0), this->solver_settings.residual_tol); - ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i], - 10 * this->eps); + if (!std::is_same::value) { + // There is no guarantee of this condition. We disable this check in + // half. + ASSERT_NEAR(res_log_array[i], + res.host_res_norm->get_const_values()[i], + 10 * this->eps); + } } } @@ -125,7 +130,7 @@ TYPED_TEST(BatchBicgstab, StencilSystemLoggerLogsIterations) auto iter_array = res.log_data->iter_counts.get_const_data(); for (size_t i = 0; i < this->num_batch_items; i++) { - ASSERT_EQ(iter_array[i], ref_iters); + ASSERT_LE(iter_array[i], ref_iters); } } @@ -136,7 +141,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseSystem) using real_type = gko::remove_complex; using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::Mtx; - const real_type tol = 1e-5; + const real_type tol = 1e-4; const int max_iters = 1000; auto solver_factory = Solver::build() @@ -161,7 +166,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseSystem) for (size_t i = 0; i < num_batch_items; i++) { ASSERT_LE(res.host_res_norm->get_const_values()[i] / linear_system.host_rhs_norm->get_const_values()[i], - tol); + tol * 10); } } @@ -173,7 +178,7 @@ TYPED_TEST(BatchBicgstab, ApplyLogsResAndIters) using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::Mtx; using Logger = gko::batch::log::BatchConvergence; - const real_type tol = 1e-5; + const real_type tol = 1e-4; const int max_iters = 1000; auto solver_factory = Solver::build() @@ -216,7 +221,7 @@ TYPED_TEST(BatchBicgstab, CanSolveEllSystem) using real_type = gko::remove_complex; using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::EllMtx; - const real_type tol = 1e-5; + const real_type tol = 1e-4; const int 
max_iters = 1000; auto solver_factory = Solver::build() @@ -252,7 +257,7 @@ TYPED_TEST(BatchBicgstab, CanSolveCsrSystem) using real_type = gko::remove_complex; using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::CsrMtx; - const real_type tol = 1e-5; + const real_type tol = 1e-4; const int max_iters = 1000; auto solver_factory = Solver::build() @@ -288,6 +293,7 @@ TYPED_TEST(BatchBicgstab, CanSolveDenseHpdSystem) using real_type = gko::remove_complex; using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::Mtx; + SKIP_IF_HALF(value_type); const real_type tol = 1e-5; const int max_iters = 1000; auto solver_factory = diff --git a/reference/test/solver/batch_cg_kernels.cpp b/reference/test/solver/batch_cg_kernels.cpp index 4ccabfb8849..c7f29ba132c 100644 --- a/reference/test/solver/batch_cg_kernels.cpp +++ b/reference/test/solver/batch_cg_kernels.cpp @@ -80,7 +80,7 @@ TYPED_TEST(BatchCg, SolvesStencilSystem) for (size_t i = 0; i < this->num_batch_items; i++) { ASSERT_LE(res.host_res_norm->get_const_values()[i] / this->linear_system.host_rhs_norm->get_const_values()[i], - this->solver_settings.residual_tol); + 5 * this->solver_settings.residual_tol); } GKO_ASSERT_BATCH_MTX_NEAR(res.x, this->linear_system.exact_sol, this->eps * 10); @@ -101,8 +101,13 @@ TYPED_TEST(BatchCg, StencilSystemLoggerLogsResidual) ASSERT_LE( res_log_array[i] / this->linear_system.host_rhs_norm->at(i, 0, 0), this->solver_settings.residual_tol); - ASSERT_NEAR(res_log_array[i], res.host_res_norm->get_const_values()[i], - 10 * this->eps); + if (!std::is_same::value) { + // There is no guarantee of this condition. We disable this check in + // half. 
+ ASSERT_NEAR(res_log_array[i], + res.host_res_norm->get_const_values()[i], + 10 * this->eps); + } } } @@ -133,6 +138,7 @@ TYPED_TEST(BatchCg, ApplyLogsResAndIters) using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::Mtx; using Logger = gko::batch::log::BatchConvergence; + SKIP_IF_HALF(value_type); const real_type tol = 1e-6; const int max_iters = 1000; auto solver_factory = @@ -174,6 +180,7 @@ TYPED_TEST(BatchCg, CanSolveHpdSystem) using real_type = gko::remove_complex; using Solver = typename TestFixture::solver_type; using Mtx = typename TestFixture::Mtx; + SKIP_IF_HALF(value_type); const real_type tol = 1e-6; const int max_iters = 1000; auto solver_factory = diff --git a/test/stop/residual_norm_kernels.cpp b/test/stop/residual_norm_kernels.cpp index 93df4d69b72..7be3e7fde48 100644 --- a/test/stop/residual_norm_kernels.cpp +++ b/test/stop/residual_norm_kernels.cpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "core/test/utils.hpp" @@ -81,13 +82,16 @@ TYPED_TEST(ResidualNorm, CanIgorneResidualNorm) gko::NotSupported); } + TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) { using Mtx = typename TestFixture::Mtx; using NormVector = typename TestFixture::NormVector; using T = typename TestFixture::ValueType; + // use csr to use half apply + using Csr = gko::matrix::Csr; using mode = gko::stop::mode; - std::shared_ptr mtx = gko::initialize({1.0}, this->exec); + std::shared_ptr mtx = gko::initialize({1.0}, this->exec); std::shared_ptr rhs = gko::initialize({0.0}, this->exec); std::shared_ptr x = gko::initialize({0.0}, this->exec); std::shared_ptr res_norm = @@ -117,6 +121,7 @@ TYPED_TEST(ResidualNorm, CheckIfResZeroConverges) } } + TYPED_TEST(ResidualNorm, WaitsTillResidualGoal) { using Mtx = typename TestFixture::Mtx; @@ -547,10 +552,12 @@ TYPED_TEST_SUITE(ImplicitResidualNorm, gko::test::ValueTypes, TYPED_TEST(ImplicitResidualNorm, CheckIfResZeroConverges) { - using Mtx = typename TestFixture::Mtx; using T = typename 
TestFixture::ValueType; + using Mtx = typename TestFixture::Mtx; + // use csr to use half apply + using Csr = gko::matrix::Csr; using gko::stop::mode; - std::shared_ptr mtx = gko::initialize({1.0}, this->exec); + std::shared_ptr mtx = gko::initialize({1.0}, this->exec); std::shared_ptr rhs = gko::initialize({0.0}, this->exec); std::shared_ptr x = gko::initialize({0.0}, this->exec); std::shared_ptr implicit_sq_res_norm = From 57fc170dd3587f58e0b0608d1955668679517277 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 18 Sep 2024 03:37:28 +0200 Subject: [PATCH 44/62] fix windows and icpx --- include/ginkgo/core/preconditioner/ic.hpp | 2 +- reference/test/reorder/mc64.cpp | 3 ++- reference/test/reorder/mc64_kernels.cpp | 7 ++++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/ginkgo/core/preconditioner/ic.hpp b/include/ginkgo/core/preconditioner/ic.hpp index aea43af3cf1..9260bfbb891 100644 --- a/include/ginkgo/core/preconditioner/ic.hpp +++ b/include/ginkgo/core/preconditioner/ic.hpp @@ -441,7 +441,7 @@ class Ic : public EnableLinOp>, public Transposable { generate_default_solver(const std::shared_ptr& exec, const std::shared_ptr& mtx) { - constexpr gko::remove_complex default_reduce_residual{1e-4}; + const gko::remove_complex default_reduce_residual{1e-4}; const unsigned int default_max_iters{ static_cast(mtx->get_size()[0])}; diff --git a/reference/test/reorder/mc64.cpp b/reference/test/reorder/mc64.cpp index 8b5c8d4aa38..2ba8735292d 100644 --- a/reference/test/reorder/mc64.cpp +++ b/reference/test/reorder/mc64.cpp @@ -86,12 +86,13 @@ TYPED_TEST(Mc64, HasSensibleDefaults) TYPED_TEST(Mc64, CanBeCreatedWithReorderingStrategy) { using reorder_type = typename TestFixture::reorder_type; + using real_type = typename TestFixture::real_type; auto mc64 = reorder_type::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) - .with_tolerance(1e-4) + .with_tolerance(real_type{1e-4}) .on(this->exec) 
->generate(this->not_id3_mtx); diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 937fe87daa4..56126ae1fb8 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -284,7 +284,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleSum) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_sum) - .with_tolerance(1e-4) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -304,11 +304,12 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingExampleProduct) { using index_type = typename TestFixture::index_type; using value_type = typename TestFixture::value_type; + using real_type = typename TestFixture::real_type; auto mc64_factory = gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) - .with_tolerance(1e-4) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(this->mtx); @@ -356,7 +357,7 @@ TYPED_TEST(Mc64, CreatesCorrectPermutationAndScalingLargeTrivialExampleProduct) gko::experimental::reorder::Mc64::build() .with_strategy( gko::experimental::reorder::mc64_strategy::max_diagonal_product) - .with_tolerance(1e-4) + .with_tolerance(real_type{1e-4}) .on(this->ref); auto mc64 = mc64_factory->generate(mtx); // get components From 18e825f7d2137c6d458547a3a0970bea890b5db1 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 18 Sep 2024 21:09:02 +0200 Subject: [PATCH 45/62] hip does not support atomic on 16 bits --- .../cuda_hip/factorization/par_ic_kernels.cpp | 25 ++++++++++----- .../factorization/par_ict_kernels.cpp | 22 +++++++++---- .../factorization/par_ilu_kernels.cpp | 30 +++++++++++------ .../factorization/par_ilut_sweep_kernels.cpp | 32 ++++++++++++------- 4 files changed, 72 insertions(+), 37 deletions(-) diff --git a/common/cuda_hip/factorization/par_ic_kernels.cpp b/common/cuda_hip/factorization/par_ic_kernels.cpp index ee8b7c97f64..f3656ac8a29 100644 --- a/common/cuda_hip/factorization/par_ic_kernels.cpp +++ b/common/cuda_hip/factorization/par_ic_kernels.cpp @@ -123,14 +123,23 @@ void compute_factor(std::shared_ptr exec, auto nnz = l->get_num_stored_elements(); auto num_blocks = ceildiv(nnz, default_block_size); if (num_blocks > 0) { - for (size_type i = 0; i < iterations; ++i) { - kernel::ic_sweep<<get_stream()>>>( - a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(), - as_device_type(a_lower->get_const_values()), - l->get_const_row_ptrs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(a_lower); + } else +#endif + { + for (size_type i = 0; i < iterations; ++i) { + kernel::ic_sweep<<get_stream()>>>( + a_lower->get_const_row_idxs(), + a_lower->get_const_col_idxs(), + as_device_type(a_lower->get_const_values()), + l->get_const_row_ptrs(), l->get_const_col_idxs(), + as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } } diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp index 94aa5e5124e..be1866b256e 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.cpp +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -390,13 +390,21 @@ void 
compute_factor(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); if (num_blocks > 0) { - kernel::ict_sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(l); + } else +#endif + { + kernel::ict_sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements())); + } } } diff --git a/common/cuda_hip/factorization/par_ilu_kernels.cpp b/common/cuda_hip/factorization/par_ilu_kernels.cpp index 8bf71c471a8..5238fcf19c7 100644 --- a/common/cuda_hip/factorization/par_ilu_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilu_kernels.cpp @@ -94,16 +94,26 @@ void compute_l_u_factors(std::shared_ptr exec, const auto grid_dim = static_cast( ceildiv(num_elements, static_cast(block_size))); if (grid_dim > 0) { - for (size_type i = 0; i < iterations; ++i) { - kernel::compute_l_u_factors<<get_stream()>>>( - num_elements, system_matrix->get_const_row_idxs(), - system_matrix->get_const_col_idxs(), - as_device_type(system_matrix->get_const_values()), - l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(), - as_device_type(l_factor->get_values()), - u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(), - as_device_type(u_factor->get_values())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(system_matrix); + } else +#endif + { + for 
(size_type i = 0; i < iterations; ++i) { + kernel::compute_l_u_factors<<get_stream()>>>( + num_elements, system_matrix->get_const_row_idxs(), + system_matrix->get_const_col_idxs(), + as_device_type(system_matrix->get_const_values()), + l_factor->get_const_row_ptrs(), + l_factor->get_const_col_idxs(), + as_device_type(l_factor->get_values()), + u_factor->get_const_row_ptrs(), + u_factor->get_const_col_idxs(), + as_device_type(u_factor->get_values())); + } } } } diff --git a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp index 52f62b50e6a..7ac64270b69 100644 --- a/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_sweep_kernels.cpp @@ -154,18 +154,26 @@ void compute_l_u_factors(syn::value_list, auto block_size = default_block_size / subwarp_size; auto num_blocks = ceildiv(total_nnz, block_size); if (num_blocks > 0) { - kernel::sweep - <<get_stream()>>>( - a->get_const_row_ptrs(), a->get_const_col_idxs(), - as_device_type(a->get_const_values()), l->get_const_row_ptrs(), - l_coo->get_const_row_idxs(), l->get_const_col_idxs(), - as_device_type(l->get_values()), - static_cast(l->get_num_stored_elements()), - u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), - as_device_type(u->get_values()), u_csc->get_const_row_ptrs(), - u_csc->get_const_col_idxs(), - as_device_type(u_csc->get_values()), - static_cast(u->get_num_stored_elements())); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(a); + } else +#endif + { + kernel::sweep + <<get_stream()>>>( + a->get_const_row_ptrs(), a->get_const_col_idxs(), + as_device_type(a->get_const_values()), + l->get_const_row_ptrs(), l_coo->get_const_row_idxs(), + l->get_const_col_idxs(), as_device_type(l->get_values()), + static_cast(l->get_num_stored_elements()), + u_coo->get_const_row_idxs(), u_coo->get_const_col_idxs(), + 
as_device_type(u->get_values()), + u_csc->get_const_row_ptrs(), u_csc->get_const_col_idxs(), + as_device_type(u_csc->get_values()), + static_cast(u->get_num_stored_elements())); + } } } From 825f76f6320e2cd1b0f682ef1c080e67b47311bd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Wed, 18 Sep 2024 22:41:52 +0200 Subject: [PATCH 46/62] fix batch --- core/solver/batch_dispatch.hpp | 4 +- dpcpp/base/batch_multi_vector_kernels.hpp | 41 --------------------- dpcpp/preconditioner/batch_block_jacobi.hpp | 7 +++- hip/solver/batch_cg_kernels.hip.cpp | 2 +- 4 files changed, 8 insertions(+), 46 deletions(-) diff --git a/core/solver/batch_dispatch.hpp b/core/solver/batch_dispatch.hpp index 7374e4cdec1..72873fb91ec 100644 --- a/core/solver/batch_dispatch.hpp +++ b/core/solver/batch_dispatch.hpp @@ -87,7 +87,7 @@ using DeviceValueType = gko::kernels::hip::hip_type; namespace gko { namespace kernels { -namespace host { +namespace dpcpp { template @@ -97,7 +97,7 @@ inline std::decay_t as_device_type(T val) } -} // namespace host +} // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/base/batch_multi_vector_kernels.hpp b/dpcpp/base/batch_multi_vector_kernels.hpp index 74abaeda86f..96ada23f42c 100644 --- a/dpcpp/base/batch_multi_vector_kernels.hpp +++ b/dpcpp/base/batch_multi_vector_kernels.hpp @@ -65,25 +65,6 @@ __dpct_inline__ void add_scaled_kernel( } -template -__dpct_inline__ void single_rhs_compute_conj_dot( - const int num_rows, const ValueType* const __restrict__ x, - const ValueType* const __restrict__ y, ValueType& result, - sycl::nd_item<3> item_ct1) -{ - const auto group = item_ct1.get_group(); - const auto group_size = item_ct1.get_local_range().size(); - const auto tid = item_ct1.get_local_linear_id(); - - ValueType val = zero(); - - for (int r = tid; r < num_rows; r += group_size) { - val += conj(x[r]) * y[r]; - } - result = sycl::reduce_over_group(group, val, sycl::plus<>()); -} - - template __dpct_inline__ void 
single_rhs_compute_conj_dot_sg( const int num_rows, const ValueType* const __restrict__ x, @@ -174,28 +155,6 @@ __dpct_inline__ void single_rhs_compute_norm2_sg( } -template -__dpct_inline__ void single_rhs_compute_norm2( - const int num_rows, const ValueType* const __restrict__ x, - gko::remove_complex& result, sycl::nd_item<3> item_ct1) -{ - const auto group = item_ct1.get_group(); - const auto group_size = item_ct1.get_local_range().size(); - const auto tid = item_ct1.get_local_linear_id(); - - using real_type = typename gko::remove_complex; - real_type val = zero(); - - for (int r = tid; r < num_rows; r += group_size) { - val += squared_norm(x[r]); - } - - val = sycl::reduce_over_group(group, val, sycl::plus<>()); - - result = sqrt(val); -} - - template __dpct_inline__ void compute_norm2_kernel( const gko::batch::multi_vector::batch_item& x, diff --git a/dpcpp/preconditioner/batch_block_jacobi.hpp b/dpcpp/preconditioner/batch_block_jacobi.hpp index a7431f919a5..04c21f97991 100644 --- a/dpcpp/preconditioner/batch_block_jacobi.hpp +++ b/dpcpp/preconditioner/batch_block_jacobi.hpp @@ -129,8 +129,11 @@ class BlockJacobi final { sum += block_val * r[dense_block_col + idx_start]; } - // reduction - sum = sycl::reduce_over_group(sg, sum, sycl::plus<>()); + // reduction (it does not support half) + // sum = sycl::reduce_over_group(sg, sum, sycl::plus<>()); + for (int i = sg_size / 2; i > 0; i /= 2) { + sum += sg.shuffle_down(sum, i); + } if (sg_tid == 0) { z[row_idx] = sum; diff --git a/hip/solver/batch_cg_kernels.hip.cpp b/hip/solver/batch_cg_kernels.hip.cpp index b6b10c5f35b..22a4f416e06 100644 --- a/hip/solver/batch_cg_kernels.hip.cpp +++ b/hip/solver/batch_cg_kernels.hip.cpp @@ -70,7 +70,7 @@ class kernel_caller { void launch_apply_kernel( const gko::kernels::batch_cg::storage_config& sconf, LogType& logger, PrecType& prec, const BatchMatrixType& mat, - const value_type* const __restrict__ b_values, + const hip_value_type* const __restrict__ b_values, 
hip_value_type* const __restrict__ x_values, hip_value_type* const __restrict__ workspace_data, const int& block_size, const size_t& shared_size) const From 81d63ac97bd908b076d8bb501ad5e8f367ba9fba Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 19 Sep 2024 15:06:03 +0200 Subject: [PATCH 47/62] add miss instantiation --- core/base/mixed_precision_types.hpp | 190 +++++++++++----------- include/ginkgo/core/base/types.hpp | 236 ++++++++++++++-------------- 2 files changed, 214 insertions(+), 212 deletions(-) diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index 0f1f9869f91..ebf4291c053 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -15,97 +15,97 @@ #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT1(_macro, ...) \ - GKO_ADAPT_HF(_macro(float, half, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(float, half, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(float, half, double, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(float, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, float, half, __VA_ARGS__)); \ template _macro(float, float, float, __VA_ARGS__); \ template _macro(float, float, double, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(float, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, double, half, __VA_ARGS__)); \ template _macro(float, double, float, __VA_ARGS__); \ template _macro(float, double, double, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT2(_macro, ...) 
\ - GKO_ADAPT_HF(_macro(double, half, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(double, half, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(double, half, double, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(double, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, float, half, __VA_ARGS__)); \ template _macro(double, float, float, __VA_ARGS__); \ template _macro(double, float, double, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(double, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, double, half, __VA_ARGS__)); \ template _macro(double, double, float, __VA_ARGS__); \ template _macro(double, double, double, __VA_ARGS__) -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT3(_macro, ...) 
\ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT4(_macro, ...) 
\ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + template _macro(std::complex, std::complex, \ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ - GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, half, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, half, double, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, float, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, float, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, float, double, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, double, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, double, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, double, double, __VA_ARGS__)) - -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) + GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, double, __VA_ARGS__)) + +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) 
\ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else @@ -125,11 +125,11 @@ std::complex, __VA_ARGS__) #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT5(_macro, ...) \ - GKO_ADAPT_HF(_macro(half, half, half, __VA_ARGS__)) + GKO_ADAPT_HF(template _macro(half, half, half, __VA_ARGS__)) -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__)) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_SPLIT6(_macro, ...) \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -151,34 +151,36 @@ #ifdef GINKGO_MIXED_PRECISION #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ - GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, float, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(half, double, __VA_ARGS__)); \ - GKO_ADAPT_HF(_macro(float, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, float, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, double, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(float, half, __VA_ARGS__)); \ template _macro(float, float, __VA_ARGS__); \ template _macro(float, double, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(double, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(double, half, __VA_ARGS__)); \ template _macro(double, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ - GKO_ADAPT_HF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ GKO_ADAPT_HF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ - GKO_ADAPT_HF( \ - _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex, \ + __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ - GKO_ADAPT_HF(_macro(half, half, __VA_ARGS__)); \ + GKO_ADAPT_HF(template _macro(half, half, __VA_ARGS__)); \ template _macro(float, float, __VA_ARGS__); \ template _macro(double, double, __VA_ARGS__); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_HF( \ + template _macro(std::complex, std::complex, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ template _macro(std::complex, std::complex, __VA_ARGS__) #endif diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 317471d8c93..68cde9c6548 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -409,7 +409,7 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, // cuda half operation is supported from arch 5.3 #if GINKGO_ENABLE_HALF && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 530) -#define GKO_ADAPT_HF(_macro) template _macro +#define GKO_ADAPT_HF(_macro) _macro #else #define GKO_ADAPT_HF(_macro) \ static_assert(true, \ @@ -428,13 +428,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(template _macro(half)); \ template _macro(float); \ template <> \ _macro(double) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half)); \ + GKO_ADAPT_HF(template _macro(half)); \ template _macro(float); \ template _macro(double) #endif @@ -455,14 +455,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex)); \ template _macro(std::complex); \ template <> \ _macro(std::complex) GKO_NOT_IMPLEMENTED #else #define 
GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - GKO_ADAPT_HF(_macro(std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex)); \ template _macro(std::complex); \ template _macro(std::complex) #endif @@ -485,28 +485,28 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, half)); \ + GKO_ADAPT_HF(template _macro(half, half)); \ template _macro(float, float); \ template <> \ _macro(double, double) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ template _macro(std::complex, std::complex); \ template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, half)); \ + GKO_ADAPT_HF(template _macro(std::complex, half)); \ template _macro(std::complex, float); \ template <> \ _macro(std::complex, double) GKO_NOT_IMPLEMENTED; #else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - GKO_ADAPT_HF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + GKO_ADAPT_HF(template _macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_HF(template _macro(std::complex, half)); \ + template _macro(std::complex, float); \ template _macro(std::complex, double) #endif @@ -535,38 +535,40 @@ 
GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, */ #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(template _macro(half, int64)); \ template _macro(float, int64); \ template <> \ _macro(double, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ - GKO_ADAPT_HF(_macro(half, int64)); \ + GKO_ADAPT_HF(template _macro(half, int64)); \ template _macro(float, int64); \ template _macro(double, int64) #endif #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ + GKO_ADAPT_HF(template _macro(half, int32)); \ template _macro(float, int32); \ template <> \ _macro(double, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INT32_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32)); \ template _macro(float, int32); \ template _macro(double, int32); \ - GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32) #endif @@ -583,21 +585,21 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - 
GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template <> \ _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template <> \ _macro(std::complex, int64) GKO_NOT_IMPLEMENTED #else #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - GKO_ADAPT_HF(_macro(std::complex, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32)); \ template _macro(std::complex, int32); \ template _macro(std::complex, int32); \ - GKO_ADAPT_HF(_macro(std::complex, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64)); \ template _macro(std::complex, int64); \ template _macro(std::complex, int64) #endif @@ -615,9 +617,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - GKO_ADAPT_HF(_macro(half, int32, int32)); \ - GKO_ADAPT_HF(_macro(half, int32, int64)); \ - GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(template _macro(half, int32, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32, int64)); \ + GKO_ADAPT_HF(template _macro(half, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -630,9 +632,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro) \ - GKO_ADAPT_HF(_macro(half, int32, int32)); \ - GKO_ADAPT_HF(_macro(half, int32, int64)); \ - GKO_ADAPT_HF(_macro(half, int64, int64)); \ + GKO_ADAPT_HF(template _macro(half, int32, int32)); \ + GKO_ADAPT_HF(template _macro(half, int32, int64)); \ + 
GKO_ADAPT_HF(template _macro(half, int64, int64)); \ template _macro(float, int32, int32); \ template _macro(float, int32, int64); \ template _macro(float, int64, int64); \ @@ -654,9 +656,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ - GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ - GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -670,9 +672,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ _macro); \ - GKO_ADAPT_HF(_macro(std::complex, int32, int32)); \ - GKO_ADAPT_HF(_macro(std::complex, int32, int64)); \ - GKO_ADAPT_HF(_macro(std::complex, int64, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int32)); \ + GKO_ADAPT_HF(template _macro(std::complex, int32, int64)); \ + GKO_ADAPT_HF(template _macro(std::complex, int64, int64)); \ template _macro(std::complex, int32, int32); \ template _macro(std::complex, int32, int64); \ template _macro(std::complex, int64, int64); \ @@ -683,38 +685,36 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(half, double) 
GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, half) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(float, half)); \ - GKO_ADAPT_HF(_macro(half, float)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template <> \ + _macro(float, double) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, float) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(template <> _macro(half, double) GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template <> _macro(double, half) GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template _macro(float, half)); \ + GKO_ADAPT_HF(template _macro(half, float)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template <> _macro(std::complex, std::complex) \ + GKO_NOT_IMPLEMENTED); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template <> _macro(std::complex, std::complex) \ + GKO_NOT_IMPLEMENTED); \ + template <> \ + _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ - template <> \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(template _macro(half, half)); \ + template _macro(float, float); \ + template <> \ + 
_macro(double, double) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ + template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED #else /** @@ -726,18 +726,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - GKO_ADAPT_HF(_macro(half, double)); \ - GKO_ADAPT_HF(_macro(double, half)); \ - GKO_ADAPT_HF(_macro(float, half)); \ - GKO_ADAPT_HF(_macro(half, float)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + template _macro(float, double); \ + template _macro(double, float); \ + GKO_ADAPT_HF(template _macro(half, double)); \ + GKO_ADAPT_HF(template _macro(double, half)); \ + GKO_ADAPT_HF(template _macro(float, half)); \ + GKO_ADAPT_HF(template _macro(half, float)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -750,13 +750,13 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ + GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ + GKO_ADAPT_HF(template _macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) #endif @@ -769,15 +769,15 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - GKO_ADAPT_HF(_macro(std::complex, half)); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE_PAIR(_macro) \ + GKO_ADAPT_HF(template _macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + GKO_ADAPT_HF(template _macro(std::complex, half)); \ + template _macro(std::complex, float); \ + template _macro(std::complex, double); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) @@ -790,18 +790,18 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(char, char); \ - template _macro(int32, int32); \ - template _macro(int64, int64); \ - template _macro(unsigned int, unsigned int); \ - template _macro(unsigned long, unsigned long); \ - GKO_ADAPT_HF(_macro(half, half)); \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(long double, long double); \ - GKO_ADAPT_HF(_macro(std::complex, std::complex)); \ - template _macro(std::complex, std::complex); \ +#define GKO_INSTANTIATE_FOR_EACH_COMBINED_VALUE_AND_INDEX_TYPE(_macro) \ + template _macro(char, char); \ + template _macro(int32, int32); \ + template _macro(int64, int64); \ + template _macro(unsigned int, unsigned int); \ + template _macro(unsigned long, unsigned long); \ + GKO_ADAPT_HF(template _macro(half, half)); \ + template _macro(float, float); \ + template _macro(double, double); \ + template _macro(long double, long double); \ + GKO_ADAPT_HF(template _macro(std::complex, std::complex)); \ + template _macro(std::complex, std::complex); \ template _macro(std::complex, std::complex) /** @@ -812,16 +812,16 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ - GKO_ADAPT_HF(_macro(half)); \ - template _macro(float); \ - template _macro(double); \ - GKO_ADAPT_HF(_macro(std::complex)); \ - template _macro(std::complex); \ - template _macro(std::complex); \ - template _macro(size_type); \ - template _macro(bool); \ - template _macro(int32); \ +#define GKO_INSTANTIATE_FOR_EACH_POD_TYPE(_macro) \ + GKO_ADAPT_HF(template _macro(half)); \ + template _macro(float); \ + template _macro(double); \ + GKO_ADAPT_HF(template _macro(std::complex)); \ + template _macro(std::complex); \ + template _macro(std::complex); \ + template _macro(size_type); \ + template _macro(bool); \ + template _macro(int32); \ template _macro(int64) From 2a6d382d39cbd8da83fb73d811794892e94b9bd0 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 24 Sep 2024 10:55:19 +0200 Subject: [PATCH 48/62] update documentation, remove half.hpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Marcel Koch Co-authored-by: Thomas Grützmacher --- CMakeLists.txt | 2 +- accessor/cuda_helper.hpp | 5 +++++ accessor/hip_helper.hpp | 5 +++++ benchmark/run_all_benchmarks.sh | 2 +- common/cuda_hip/components/reduction.hpp | 2 +- common/cuda_hip/matrix/csr_kernels.template.cpp | 4 ++-- core/base/device_matrix_data_kernels.hpp | 1 - core/base/utils.hpp | 1 - core/components/absolute_array_kernels.hpp | 1 - core/components/fill_array_kernels.hpp | 1 - core/components/format_conversion_kernels.hpp | 1 - core/components/precision_conversion_kernels.hpp | 1 - core/components/prefix_sum_kernels.hpp | 1 - core/components/reduce_array_kernels.hpp | 1 - core/distributed/matrix_kernels.hpp | 1 - core/factorization/cholesky_kernels.hpp | 1 - core/factorization/factorization_kernels.hpp | 1 - core/factorization/ilu_kernels.hpp | 1 - core/factorization/lu_kernels.hpp | 1 - core/factorization/par_ic_kernels.hpp | 1 - core/factorization/par_ict_kernels.hpp | 1 - 
core/factorization/par_ilu_kernels.hpp | 1 - core/factorization/par_ilut_kernels.hpp | 1 - core/matrix/coo_kernels.hpp | 1 - core/matrix/csr_kernels.hpp | 1 - core/matrix/csr_lookup.hpp | 1 - core/matrix/dense_kernels.hpp | 1 - core/matrix/diagonal_kernels.hpp | 1 - core/matrix/fbcsr_kernels.hpp | 1 - core/matrix/fft_kernels.hpp | 1 - core/matrix/sparsity_csr_kernels.hpp | 1 - core/preconditioner/jacobi_utils.hpp | 1 - core/reorder/rcm_kernels.hpp | 1 - core/solver/bicg_kernels.hpp | 1 - core/solver/bicgstab_kernels.hpp | 1 - core/solver/cb_gmres_accessor.hpp | 1 - core/solver/cb_gmres_kernels.hpp | 1 - core/solver/cg_kernels.hpp | 1 - core/solver/cgs_kernels.hpp | 1 - core/solver/common_gmres_kernels.hpp | 1 - core/solver/gmres_kernels.hpp | 1 - core/solver/idr_kernels.hpp | 1 - core/solver/ir_kernels.hpp | 1 - core/solver/multigrid_kernels.hpp | 1 - core/stop/criterion_kernels.hpp | 1 - core/stop/residual_norm_kernels.hpp | 1 - include/ginkgo/core/base/array.hpp | 1 - include/ginkgo/core/base/batch_multi_vector.hpp | 2 +- include/ginkgo/core/base/dim.hpp | 1 - include/ginkgo/core/base/exception.hpp | 1 - include/ginkgo/core/base/executor.hpp | 1 - include/ginkgo/core/base/half.hpp | 13 +++++-------- include/ginkgo/core/base/index_set.hpp | 1 - include/ginkgo/core/base/intrinsics.hpp | 1 - include/ginkgo/core/base/lin_op.hpp | 1 - include/ginkgo/core/base/matrix_assembly_data.hpp | 1 - include/ginkgo/core/base/matrix_data.hpp | 1 - include/ginkgo/core/base/range.hpp | 1 - include/ginkgo/core/base/range_accessors.hpp | 1 - include/ginkgo/core/base/utils_helper.hpp | 1 - include/ginkgo/core/base/version.hpp | 1 - include/ginkgo/core/distributed/partition.hpp | 1 - include/ginkgo/core/factorization/factorization.hpp | 1 - include/ginkgo/core/factorization/ic.hpp | 1 - include/ginkgo/core/factorization/ilu.hpp | 1 - include/ginkgo/core/factorization/par_ic.hpp | 1 - include/ginkgo/core/factorization/par_ict.hpp | 1 - include/ginkgo/core/factorization/par_ilu.hpp | 1 - 
include/ginkgo/core/factorization/par_ilut.hpp | 1 - include/ginkgo/core/matrix/dense.hpp | 1 - include/ginkgo/core/matrix/permutation.hpp | 1 - include/ginkgo/core/multigrid/fixed_coarsening.hpp | 1 - include/ginkgo/core/multigrid/pgm.hpp | 1 - include/ginkgo/core/reorder/rcm.hpp | 1 - include/ginkgo/core/reorder/scaled_reordered.hpp | 1 - include/ginkgo/core/solver/bicg.hpp | 1 - include/ginkgo/core/solver/bicgstab.hpp | 1 - include/ginkgo/core/solver/cb_gmres.hpp | 1 - include/ginkgo/core/solver/cg.hpp | 1 - include/ginkgo/core/solver/cgs.hpp | 1 - include/ginkgo/core/solver/fcg.hpp | 1 - include/ginkgo/core/solver/gmres.hpp | 1 - include/ginkgo/core/solver/idr.hpp | 1 - include/ginkgo/core/solver/ir.hpp | 1 - include/ginkgo/core/solver/multigrid.hpp | 1 - include/ginkgo/core/solver/triangular.hpp | 1 - include/ginkgo/core/stop/stopping_status.hpp | 1 - 87 files changed, 21 insertions(+), 93 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d77f1c99cbc..c6a9ef0f817 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF) option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF) option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) -option(GINKGO_ENABLE_HALF "Enable the half operation" ON) +option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON) option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." 
OFF) diff --git a/accessor/cuda_helper.hpp b/accessor/cuda_helper.hpp index 6ea7b6881d9..3efc6eb22b7 100644 --- a/accessor/cuda_helper.hpp +++ b/accessor/cuda_helper.hpp @@ -21,6 +21,11 @@ struct __half; namespace gko { + + +class half; + + namespace acc { namespace detail { diff --git a/accessor/hip_helper.hpp b/accessor/hip_helper.hpp index cd2f4f67a13..8827fd6eb11 100644 --- a/accessor/hip_helper.hpp +++ b/accessor/hip_helper.hpp @@ -21,6 +21,11 @@ struct __half; namespace gko { + + +class half; + + namespace acc { namespace detail { diff --git a/benchmark/run_all_benchmarks.sh b/benchmark/run_all_benchmarks.sh index 0efc0f0b3c2..36018045541 100755 --- a/benchmark/run_all_benchmarks.sh +++ b/benchmark/run_all_benchmarks.sh @@ -114,7 +114,7 @@ elif [ "${BENCHMARK_PRECISION}" == "half" ]; then BENCH_SUFFIX="_half" else echo "BENCHMARK_PRECISION is set to the not supported \"${BENCHMARK_PRECISION}\"." 1>&2 - echo "Currently supported values: \"double\", \"single\", \"dcomplex\" and \"scomplex\"" 1>&2 + echo "Currently supported values: \"double\", \"single\", \"half\", \"dcomplex\" and \"scomplex\"" 1>&2 exit 1 fi diff --git a/common/cuda_hip/components/reduction.hpp b/common/cuda_hip/components/reduction.hpp index b2f74fd8598..cdf599179a5 100644 --- a/common/cuda_hip/components/reduction.hpp +++ b/common/cuda_hip/components/reduction.hpp @@ -73,7 +73,7 @@ __device__ __forceinline__ int choose_pivot(const Group& group, bool is_pivoted) { using real = remove_complex; - real lmag = real(is_pivoted ? -one() : abs(local_data)); + auto lmag = static_cast(is_pivoted ? 
-one() : abs(local_data)); const auto pivot = reduce(group, group.thread_rank(), [&](int lidx, int ridx) { const auto rmag = group.shfl(lmag, ridx); diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index 76e77884d8b..cd0cd84ea29 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -278,7 +278,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmv( { using arithmetic_type = typename output_accessor::arithmetic_type; using output_type = typename output_accessor::storage_type; - const arithmetic_type scale_factor = static_cast(alpha[0]); + const auto scale_factor = static_cast(alpha[0]); spmv_kernel(nwarps, num_rows, val, col_idxs, row_ptrs, srow, b, c, [&scale_factor](const arithmetic_type& x) { return static_cast(scale_factor * x); @@ -486,7 +486,7 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_reduce( const IndexType* __restrict__ last_row, const MatrixValueType* __restrict__ alpha, acc::range c) { - const arithmetic_type alpha_val = static_cast(alpha[0]); + const auto alpha_val = static_cast(alpha[0]); merge_path_reduce( nwarps, last_val, last_row, c, [&alpha_val](const arithmetic_type& x) { return alpha_val * x; }); diff --git a/core/base/device_matrix_data_kernels.hpp b/core/base/device_matrix_data_kernels.hpp index 019427b4a83..bcaeebdf0cb 100644 --- a/core/base/device_matrix_data_kernels.hpp +++ b/core/base/device_matrix_data_kernels.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/base/utils.hpp b/core/base/utils.hpp index 157a82b1a8f..061c6e303ed 100644 --- a/core/base/utils.hpp +++ b/core/base/utils.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/components/absolute_array_kernels.hpp b/core/components/absolute_array_kernels.hpp index 8965215fb96..7617883cd1c 100644 --- 
a/core/components/absolute_array_kernels.hpp +++ b/core/components/absolute_array_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/core/components/fill_array_kernels.hpp b/core/components/fill_array_kernels.hpp index 5bb18ada799..2608cabe409 100644 --- a/core/components/fill_array_kernels.hpp +++ b/core/components/fill_array_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/format_conversion_kernels.hpp b/core/components/format_conversion_kernels.hpp index 5f4ad5e519e..10be3a10232 100644 --- a/core/components/format_conversion_kernels.hpp +++ b/core/components/format_conversion_kernels.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/precision_conversion_kernels.hpp b/core/components/precision_conversion_kernels.hpp index 3157a04c703..8443a657502 100644 --- a/core/components/precision_conversion_kernels.hpp +++ b/core/components/precision_conversion_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/core/components/prefix_sum_kernels.hpp b/core/components/prefix_sum_kernels.hpp index aa4a812cc73..8b68b54e29f 100644 --- a/core/components/prefix_sum_kernels.hpp +++ b/core/components/prefix_sum_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/components/reduce_array_kernels.hpp b/core/components/reduce_array_kernels.hpp index ef79c3b18be..b124e6ec2e3 100644 --- a/core/components/reduce_array_kernels.hpp +++ b/core/components/reduce_array_kernels.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include "core/base/kernel_declaration.hpp" diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp index e3f5801ad03..f24e8c9945e 100644 --- a/core/distributed/matrix_kernels.hpp +++ b/core/distributed/matrix_kernels.hpp 
@@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/core/factorization/cholesky_kernels.hpp b/core/factorization/cholesky_kernels.hpp index 630707cdd18..db889ce1162 100644 --- a/core/factorization/cholesky_kernels.hpp +++ b/core/factorization/cholesky_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/core/factorization/factorization_kernels.hpp b/core/factorization/factorization_kernels.hpp index c73856fd44a..bab3dd16bd2 100644 --- a/core/factorization/factorization_kernels.hpp +++ b/core/factorization/factorization_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/core/factorization/ilu_kernels.hpp b/core/factorization/ilu_kernels.hpp index 1b9fd4cb590..2371c17fda4 100644 --- a/core/factorization/ilu_kernels.hpp +++ b/core/factorization/ilu_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/factorization/lu_kernels.hpp b/core/factorization/lu_kernels.hpp index 9c26cc95736..f497398cb90 100644 --- a/core/factorization/lu_kernels.hpp +++ b/core/factorization/lu_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/core/factorization/par_ic_kernels.hpp b/core/factorization/par_ic_kernels.hpp index 8827c55a2ee..59d2d97ffce 100644 --- a/core/factorization/par_ic_kernels.hpp +++ b/core/factorization/par_ic_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/factorization/par_ict_kernels.hpp b/core/factorization/par_ict_kernels.hpp index 29ac0def3e1..25172c0d649 100644 --- a/core/factorization/par_ict_kernels.hpp +++ b/core/factorization/par_ict_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp index 1fde2d7abab..16d20859c3e 100644 --- a/core/factorization/par_ilu_kernels.hpp +++ 
b/core/factorization/par_ilu_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/factorization/par_ilut_kernels.hpp b/core/factorization/par_ilut_kernels.hpp index ede8d858792..2d8ac7b4f88 100644 --- a/core/factorization/par_ilut_kernels.hpp +++ b/core/factorization/par_ilut_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/matrix/coo_kernels.hpp b/core/matrix/coo_kernels.hpp index 50833621675..a2cc44b74d9 100644 --- a/core/matrix/coo_kernels.hpp +++ b/core/matrix/coo_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_MATRIX_COO_KERNELS_HPP_ -#include #include #include #include diff --git a/core/matrix/csr_kernels.hpp b/core/matrix/csr_kernels.hpp index 23676c1810e..6013e014c8a 100644 --- a/core/matrix/csr_kernels.hpp +++ b/core/matrix/csr_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/matrix/csr_lookup.hpp b/core/matrix/csr_lookup.hpp index 129736841c6..a7b687c3618 100644 --- a/core/matrix/csr_lookup.hpp +++ b/core/matrix/csr_lookup.hpp @@ -8,7 +8,6 @@ #include -#include #include #include #include diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp index d785fc4a45e..7422b431aa0 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -8,7 +8,6 @@ #include -#include #include #include #include diff --git a/core/matrix/diagonal_kernels.hpp b/core/matrix/diagonal_kernels.hpp index 4baf2fa8bc5..630c76e43ad 100644 --- a/core/matrix/diagonal_kernels.hpp +++ b/core/matrix/diagonal_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_MATRIX_DIAGONAL_KERNELS_HPP_ -#include #include #include #include diff --git a/core/matrix/fbcsr_kernels.hpp b/core/matrix/fbcsr_kernels.hpp index 9f8d10d5be9..7a644d48d78 100644 --- a/core/matrix/fbcsr_kernels.hpp +++ b/core/matrix/fbcsr_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/matrix/fft_kernels.hpp 
b/core/matrix/fft_kernels.hpp index b843f65521c..bd0e231c394 100644 --- a/core/matrix/fft_kernels.hpp +++ b/core/matrix/fft_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include diff --git a/core/matrix/sparsity_csr_kernels.hpp b/core/matrix/sparsity_csr_kernels.hpp index 655c5a76dde..e07bb980dce 100644 --- a/core/matrix/sparsity_csr_kernels.hpp +++ b/core/matrix/sparsity_csr_kernels.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_MATRIX_SPARSITY_CSR_KERNELS_HPP_ -#include #include #include #include diff --git a/core/preconditioner/jacobi_utils.hpp b/core/preconditioner/jacobi_utils.hpp index 1320524a1a7..b0aa8b5f38a 100644 --- a/core/preconditioner/jacobi_utils.hpp +++ b/core/preconditioner/jacobi_utils.hpp @@ -6,7 +6,6 @@ #define GKO_CORE_PRECONDITIONER_JACOBI_UTILS_HPP_ -#include #include #include diff --git a/core/reorder/rcm_kernels.hpp b/core/reorder/rcm_kernels.hpp index 737182a63d3..a89b2732cb0 100644 --- a/core/reorder/rcm_kernels.hpp +++ b/core/reorder/rcm_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/bicg_kernels.hpp b/core/solver/bicg_kernels.hpp index 46fadb8ccc4..5e94d8ca350 100644 --- a/core/solver/bicg_kernels.hpp +++ b/core/solver/bicg_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/bicgstab_kernels.hpp b/core/solver/bicgstab_kernels.hpp index cc2ebada4c1..e3bfbdcdcb6 100644 --- a/core/solver/bicgstab_kernels.hpp +++ b/core/solver/bicgstab_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/cb_gmres_accessor.hpp b/core/solver/cb_gmres_accessor.hpp index 72e9618cc0e..a5d95793d15 100644 --- a/core/solver/cb_gmres_accessor.hpp +++ b/core/solver/cb_gmres_accessor.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index 1f012416a39..29a84f25ba1 100644 --- 
a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/cg_kernels.hpp b/core/solver/cg_kernels.hpp index dc05acd169c..bec5f04d0e5 100644 --- a/core/solver/cg_kernels.hpp +++ b/core/solver/cg_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/cgs_kernels.hpp b/core/solver/cgs_kernels.hpp index 9b0847b858b..d64aeedb549 100644 --- a/core/solver/cgs_kernels.hpp +++ b/core/solver/cgs_kernels.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/core/solver/common_gmres_kernels.hpp b/core/solver/common_gmres_kernels.hpp index cd7eb821d3d..0209284c446 100644 --- a/core/solver/common_gmres_kernels.hpp +++ b/core/solver/common_gmres_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/solver/gmres_kernels.hpp b/core/solver/gmres_kernels.hpp index 31feeee5e84..21bb5854816 100644 --- a/core/solver/gmres_kernels.hpp +++ b/core/solver/gmres_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/solver/idr_kernels.hpp b/core/solver/idr_kernels.hpp index e988febf0ac..3d579bd01af 100644 --- a/core/solver/idr_kernels.hpp +++ b/core/solver/idr_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/solver/ir_kernels.hpp b/core/solver/ir_kernels.hpp index f6fa94cef66..a411c9f375d 100644 --- a/core/solver/ir_kernels.hpp +++ b/core/solver/ir_kernels.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include diff --git a/core/solver/multigrid_kernels.hpp b/core/solver/multigrid_kernels.hpp index c6f5c0abc50..73c660cbefb 100644 --- a/core/solver/multigrid_kernels.hpp +++ b/core/solver/multigrid_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/core/stop/criterion_kernels.hpp b/core/stop/criterion_kernels.hpp index 
014763c6079..62e4135ee37 100644 --- a/core/stop/criterion_kernels.hpp +++ b/core/stop/criterion_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include diff --git a/core/stop/residual_norm_kernels.hpp b/core/stop/residual_norm_kernels.hpp index 665004d37bc..7625dadefeb 100644 --- a/core/stop/residual_norm_kernels.hpp +++ b/core/stop/residual_norm_kernels.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/include/ginkgo/core/base/array.hpp b/include/ginkgo/core/base/array.hpp index a2ba2a394ba..e0cf8c22ab3 100644 --- a/include/ginkgo/core/base/array.hpp +++ b/include/ginkgo/core/base/array.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include diff --git a/include/ginkgo/core/base/batch_multi_vector.hpp b/include/ginkgo/core/base/batch_multi_vector.hpp index 7ccee45ebd3..be47c8a3ee8 100644 --- a/include/ginkgo/core/base/batch_multi_vector.hpp +++ b/include/ginkgo/core/base/batch_multi_vector.hpp @@ -447,7 +447,7 @@ class MultiVector private: batch_dim<2> batch_size_; array values_; -}; // namespace batch +}; } // namespace batch diff --git a/include/ginkgo/core/base/dim.hpp b/include/ginkgo/core/base/dim.hpp index 3ad0ee7f619..ffa38aa6a76 100644 --- a/include/ginkgo/core/base/dim.hpp +++ b/include/ginkgo/core/base/dim.hpp @@ -8,7 +8,6 @@ #include -#include #include diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 74a06c60c7b..17e09e00d18 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -9,7 +9,6 @@ #include #include -#include #include diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index ec3c42eb387..963e30bfddd 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 
1bb76be7741..c4dfaed3cdd 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -128,6 +128,7 @@ constexpr UintType create_ones(int n) static_cast(1); } + template struct float_traits { using type = typename basic_float_traits::type; @@ -354,13 +355,11 @@ class half { template \ GKO_ATTRIBUTES friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ + std::conditional_t::value, T, half>> \ operator _op(const half hf, const T val) \ { \ using type = \ - typename std::conditional::value, T, \ - half>::type; \ + std::conditional_t::value, T, half>; \ auto result = static_cast(hf); \ result _opeq static_cast(val); \ return result; \ @@ -368,13 +367,11 @@ class half { template \ GKO_ATTRIBUTES friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ - typename std::conditional::value, T, \ - half>::type> \ + std::conditional_t::value, T, half>> \ operator _op(const T val, const half hf) \ { \ using type = \ - typename std::conditional::value, T, \ - half>::type; \ + std::conditional_t::value, T, half>; \ auto result = static_cast(val); \ result _opeq static_cast(hf); \ return result; \ diff --git a/include/ginkgo/core/base/index_set.hpp b/include/ginkgo/core/base/index_set.hpp index 7285a3ff880..260896d6b2f 100644 --- a/include/ginkgo/core/base/index_set.hpp +++ b/include/ginkgo/core/base/index_set.hpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/include/ginkgo/core/base/intrinsics.hpp b/include/ginkgo/core/base/intrinsics.hpp index f5220c384a3..37e7f361781 100644 --- a/include/ginkgo/core/base/intrinsics.hpp +++ b/include/ginkgo/core/base/intrinsics.hpp @@ -8,7 +8,6 @@ #include -#include #include diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index 06f874374a6..26e1c1b9baa 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ 
b/include/ginkgo/core/base/lin_op.hpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/base/matrix_assembly_data.hpp b/include/ginkgo/core/base/matrix_assembly_data.hpp index ac3ab91e687..6993f2004f2 100644 --- a/include/ginkgo/core/base/matrix_assembly_data.hpp +++ b/include/ginkgo/core/base/matrix_assembly_data.hpp @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 4c6f146e474..218c79a6fea 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 716f65fa797..680bc47bcb6 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -8,7 +8,6 @@ #include -#include #include #include #include diff --git a/include/ginkgo/core/base/range_accessors.hpp b/include/ginkgo/core/base/range_accessors.hpp index 5401988f963..56335b8dd97 100644 --- a/include/ginkgo/core/base/range_accessors.hpp +++ b/include/ginkgo/core/base/range_accessors.hpp @@ -8,7 +8,6 @@ #include -#include #include #include diff --git a/include/ginkgo/core/base/utils_helper.hpp b/include/ginkgo/core/base/utils_helper.hpp index c19460a2c65..951ea4bbf5d 100644 --- a/include/ginkgo/core/base/utils_helper.hpp +++ b/include/ginkgo/core/base/utils_helper.hpp @@ -11,7 +11,6 @@ #include #include -#include #include #include diff --git a/include/ginkgo/core/base/version.hpp b/include/ginkgo/core/base/version.hpp index 2f8efd1cbce..9fad9430527 100644 --- a/include/ginkgo/core/base/version.hpp +++ b/include/ginkgo/core/base/version.hpp @@ -9,7 +9,6 @@ #include #include -#include #include diff --git a/include/ginkgo/core/distributed/partition.hpp b/include/ginkgo/core/distributed/partition.hpp 
index ba4835a4c3f..89adb22f3e7 100644 --- a/include/ginkgo/core/distributed/partition.hpp +++ b/include/ginkgo/core/distributed/partition.hpp @@ -7,7 +7,6 @@ #include -#include #include #include diff --git a/include/ginkgo/core/factorization/factorization.hpp b/include/ginkgo/core/factorization/factorization.hpp index 01cfa2aec3b..39345f59a44 100644 --- a/include/ginkgo/core/factorization/factorization.hpp +++ b/include/ginkgo/core/factorization/factorization.hpp @@ -7,7 +7,6 @@ #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/ic.hpp b/include/ginkgo/core/factorization/ic.hpp index c430c914207..616360ce039 100644 --- a/include/ginkgo/core/factorization/ic.hpp +++ b/include/ginkgo/core/factorization/ic.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/ilu.hpp b/include/ginkgo/core/factorization/ilu.hpp index 839035c5e0e..80f11ab7b6f 100644 --- a/include/ginkgo/core/factorization/ilu.hpp +++ b/include/ginkgo/core/factorization/ilu.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ic.hpp b/include/ginkgo/core/factorization/par_ic.hpp index 54c0b3eeb66..b5f14a997b4 100644 --- a/include/ginkgo/core/factorization/par_ic.hpp +++ b/include/ginkgo/core/factorization/par_ic.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ict.hpp b/include/ginkgo/core/factorization/par_ict.hpp index d4cf34b137a..bc2e38eadf4 100644 --- a/include/ginkgo/core/factorization/par_ict.hpp +++ b/include/ginkgo/core/factorization/par_ict.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilu.hpp b/include/ginkgo/core/factorization/par_ilu.hpp index 5c97718bc2c..88d183a939c 100644 --- a/include/ginkgo/core/factorization/par_ilu.hpp +++ 
b/include/ginkgo/core/factorization/par_ilu.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/factorization/par_ilut.hpp b/include/ginkgo/core/factorization/par_ilut.hpp index afd2d834ab6..c73e3a1b905 100644 --- a/include/ginkgo/core/factorization/par_ilut.hpp +++ b/include/ginkgo/core/factorization/par_ilut.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp index 232d92c7702..18d4879543d 100644 --- a/include/ginkgo/core/matrix/dense.hpp +++ b/include/ginkgo/core/matrix/dense.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/matrix/permutation.hpp b/include/ginkgo/core/matrix/permutation.hpp index ee5ec427816..5549b75f694 100644 --- a/include/ginkgo/core/matrix/permutation.hpp +++ b/include/ginkgo/core/matrix/permutation.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/multigrid/fixed_coarsening.hpp b/include/ginkgo/core/multigrid/fixed_coarsening.hpp index becc149e433..86c21acba39 100644 --- a/include/ginkgo/core/multigrid/fixed_coarsening.hpp +++ b/include/ginkgo/core/multigrid/fixed_coarsening.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/multigrid/pgm.hpp b/include/ginkgo/core/multigrid/pgm.hpp index ebb19fd2c89..d07001be2f1 100644 --- a/include/ginkgo/core/multigrid/pgm.hpp +++ b/include/ginkgo/core/multigrid/pgm.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/reorder/rcm.hpp b/include/ginkgo/core/reorder/rcm.hpp index 661dea03c55..589d38e29d1 100644 --- a/include/ginkgo/core/reorder/rcm.hpp +++ b/include/ginkgo/core/reorder/rcm.hpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git 
a/include/ginkgo/core/reorder/scaled_reordered.hpp b/include/ginkgo/core/reorder/scaled_reordered.hpp index 65627ec3e4c..862a2135bca 100644 --- a/include/ginkgo/core/reorder/scaled_reordered.hpp +++ b/include/ginkgo/core/reorder/scaled_reordered.hpp @@ -8,7 +8,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/bicg.hpp b/include/ginkgo/core/solver/bicg.hpp index 36188946b96..2a43c1ca3f8 100644 --- a/include/ginkgo/core/solver/bicg.hpp +++ b/include/ginkgo/core/solver/bicg.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/bicgstab.hpp b/include/ginkgo/core/solver/bicgstab.hpp index e89e65387b7..a57a6c27aa4 100644 --- a/include/ginkgo/core/solver/bicgstab.hpp +++ b/include/ginkgo/core/solver/bicgstab.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/cb_gmres.hpp b/include/ginkgo/core/solver/cb_gmres.hpp index 5cab2c466eb..976712cd673 100644 --- a/include/ginkgo/core/solver/cb_gmres.hpp +++ b/include/ginkgo/core/solver/cb_gmres.hpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/cg.hpp b/include/ginkgo/core/solver/cg.hpp index 6ba5efe4226..984d5d1f104 100644 --- a/include/ginkgo/core/solver/cg.hpp +++ b/include/ginkgo/core/solver/cg.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/cgs.hpp b/include/ginkgo/core/solver/cgs.hpp index ef5d0ac5226..bde23d76910 100644 --- a/include/ginkgo/core/solver/cgs.hpp +++ b/include/ginkgo/core/solver/cgs.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/fcg.hpp b/include/ginkgo/core/solver/fcg.hpp index 5d3f60de0ef..dfaf252b557 100644 --- a/include/ginkgo/core/solver/fcg.hpp +++ b/include/ginkgo/core/solver/fcg.hpp @@ -10,7 +10,6 @@ #include #include 
-#include #include #include #include diff --git a/include/ginkgo/core/solver/gmres.hpp b/include/ginkgo/core/solver/gmres.hpp index 19f45303d27..3ba3acf94bb 100644 --- a/include/ginkgo/core/solver/gmres.hpp +++ b/include/ginkgo/core/solver/gmres.hpp @@ -10,7 +10,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/idr.hpp b/include/ginkgo/core/solver/idr.hpp index 66e574b28b9..9f167d9b2eb 100644 --- a/include/ginkgo/core/solver/idr.hpp +++ b/include/ginkgo/core/solver/idr.hpp @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/ir.hpp b/include/ginkgo/core/solver/ir.hpp index dba9b50d901..91949261a79 100644 --- a/include/ginkgo/core/solver/ir.hpp +++ b/include/ginkgo/core/solver/ir.hpp @@ -9,7 +9,6 @@ #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/multigrid.hpp b/include/ginkgo/core/solver/multigrid.hpp index 697f5dbb512..2d0278b538e 100644 --- a/include/ginkgo/core/solver/multigrid.hpp +++ b/include/ginkgo/core/solver/multigrid.hpp @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/solver/triangular.hpp b/include/ginkgo/core/solver/triangular.hpp index 794b4f4fc5c..2d42e3bb97a 100644 --- a/include/ginkgo/core/solver/triangular.hpp +++ b/include/ginkgo/core/solver/triangular.hpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/include/ginkgo/core/stop/stopping_status.hpp b/include/ginkgo/core/stop/stopping_status.hpp index d09404d4a6a..58c2f137c8d 100644 --- a/include/ginkgo/core/stop/stopping_status.hpp +++ b/include/ginkgo/core/stop/stopping_status.hpp @@ -7,7 +7,6 @@ #include -#include #include From 8731fc3d2a814973abecbda02c4bf1d045f2589a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 24 Sep 2024 14:38:29 +0200 Subject: [PATCH 49/62] put function in gko not std MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- core/test/utils/assertions.hpp | 2 +- core/test/utils/matrix_generator_test.cpp | 4 ++-- .../factorization_kernels.dp.cpp | 2 +- dpcpp/factorization/par_ic_kernels.dp.cpp | 4 ++-- dpcpp/factorization/par_ict_kernels.dp.cpp | 2 +- .../par_ilut_filter_kernels.hpp.inc | 4 ++-- .../par_ilut_select_kernels.hpp.inc | 6 ++--- dpcpp/preconditioner/isai_kernels.dp.cpp | 4 ++-- dpcpp/solver/cb_gmres_kernels.dp.cpp | 10 ++++---- dpcpp/solver/common_gmres_kernels.dp.inc | 10 ++++---- dpcpp/solver/idr_kernels.dp.cpp | 6 ++--- dpcpp/stop/residual_norm_kernels.dp.cpp | 2 +- include/ginkgo/core/base/math.hpp | 24 ++++++++++++++----- reference/test/solver/gcr_kernels.cpp | 2 +- test/base/device_matrix_data_kernels.cpp | 2 +- test/solver/cb_gmres_kernels.cpp | 2 +- test/test_install/test_install.cpp | 2 +- 17 files changed, 50 insertions(+), 38 deletions(-) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 3a275b8ee53..174d4536657 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -259,7 +259,7 @@ void save_matrices_to_disk(Ostream& os, const MatrixData1& first, template double get_relative_error(const MatrixData1& first, const MatrixData2& second) { - using std::abs; + using gko::abs; using vt = typename detail::biggest_valuetype< typename MatrixData1::value_type, typename MatrixData2::value_type>::type; diff --git a/core/test/utils/matrix_generator_test.cpp b/core/test/utils/matrix_generator_test.cpp index af268af1471..7c798d2e835 100644 --- a/core/test/utils/matrix_generator_test.cpp +++ b/core/test/utils/matrix_generator_test.cpp @@ -278,8 +278,8 @@ TYPED_TEST(MatrixGenerator, CanGenerateTridiagInverseMatrix) auto lower = gko::test::detail::get_rand_value(dist, engine); auto upper = 
gko::test::detail::get_rand_value(dist, engine); // make diagonally dominant - auto diag = std::abs(gko::test::detail::get_rand_value(dist, engine)) + - std::abs(lower) + std::abs(upper); + auto diag = gko::abs(gko::test::detail::get_rand_value(dist, engine)) + + gko::abs(lower) + gko::abs(upper); gko::size_type size = 50; if (std::is_same>::value) { // half precision can only handle small matrix diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp index 1d9912b4f12..fa4fed8230c 100644 --- a/dpcpp/factorization/factorization_kernels.dp.cpp +++ b/dpcpp/factorization/factorization_kernels.dp.cpp @@ -450,7 +450,7 @@ void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs, l_col_idxs[l_diag_idx] = row; // compute square root with sentinel if (use_sqrt) { - diag_val = std::sqrt(diag_val); + diag_val = gko::sqrt(diag_val); if (!is_finite(diag_val)) { diag_val = one(); } diff --git a/dpcpp/factorization/par_ic_kernels.dp.cpp b/dpcpp/factorization/par_ic_kernels.dp.cpp index 5428460fac5..ee85b103f67 100644 --- a/dpcpp/factorization/par_ic_kernels.dp.cpp +++ b/dpcpp/factorization/par_ic_kernels.dp.cpp @@ -41,7 +41,7 @@ void ic_init(const IndexType* __restrict__ l_row_ptrs, return; } auto l_nz = l_row_ptrs[row + 1] - 1; - auto diag = std::sqrt(l_vals[l_nz]); + auto diag = gko::sqrt(l_vals[l_nz]); if (is_finite(diag)) { l_vals[l_nz] = diag; } else { @@ -93,7 +93,7 @@ void ic_sweep(const IndexType* __restrict__ a_row_idxs, lh_col_begin += l_col >= lh_row; } auto to_write = row == col - ? std::sqrt(a_val - sum) + ? 
gko::sqrt(a_val - sum) : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; if (is_finite(to_write)) { l_vals[l_nz] = to_write; diff --git a/dpcpp/factorization/par_ict_kernels.dp.cpp b/dpcpp/factorization/par_ict_kernels.dp.cpp index fb99b662dec..31759890961 100644 --- a/dpcpp/factorization/par_ict_kernels.dp.cpp +++ b/dpcpp/factorization/par_ict_kernels.dp.cpp @@ -356,7 +356,7 @@ void ict_sweep(const IndexType* __restrict__ a_row_ptrs, if (subwarp.thread_rank() == 0) { auto to_write = row == col - ? std::sqrt(a_val - sum) + ? gko::sqrt(a_val - sum) : (a_val - sum) / l_vals[l_row_ptrs[col + 1] - 1]; if (is_finite(to_write)) { l_vals[l_nz] = to_write; diff --git a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc index d2345848d1f..6081bc0f417 100644 --- a/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_filter_kernels.hpp.inc @@ -102,7 +102,7 @@ void threshold_filter_nnz(const IndexType* __restrict__ row_ptrs, row_ptrs, num_rows, [&](IndexType idx, IndexType row_begin, IndexType row_end) { auto diag_idx = lower ? row_end - 1 : row_begin; - return std::abs(vals[idx]) >= threshold || idx == diag_idx; + return gko::abs(vals[idx]) >= threshold || idx == diag_idx; }, nnz, item_ct1); } @@ -140,7 +140,7 @@ void threshold_filter(const IndexType* __restrict__ old_row_ptrs, old_row_ptrs, old_col_idxs, old_vals, num_rows, [&](IndexType idx, IndexType row_begin, IndexType row_end) { auto diag_idx = lower ? 
row_end - 1 : row_begin; - return std::abs(old_vals[idx]) >= threshold || idx == diag_idx; + return gko::abs(old_vals[idx]) >= threshold || idx == diag_idx; }, new_row_ptrs, new_row_idxs, new_col_idxs, new_vals, item_ct1); } diff --git a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc index 1ebfe6ed320..430bf650e07 100644 --- a/dpcpp/factorization/par_ilut_select_kernels.hpp.inc +++ b/dpcpp/factorization/par_ilut_select_kernels.hpp.inc @@ -38,7 +38,7 @@ void build_searchtree(const ValueType* __restrict__ input, IndexType size, for (int i = 0; i < sampleselect_oversampling; ++i) { auto lidx = idx * sampleselect_oversampling + i; auto val = input[static_cast(lidx * stride)]; - samples[i] = std::abs(val); + samples[i] = gko::abs(val); } bitonic_sort(samples, sh_samples, @@ -113,7 +113,7 @@ void count_buckets(const ValueType* __restrict__ input, IndexType size, auto end = min(block_end, size); for (IndexType i = begin; i < end; i += default_block_size) { // traverse the search tree with the input element - auto el = std::abs(input[i]); + auto el = gko::abs(input[i]); IndexType tree_idx{}; #pragma unroll for (int level = 0; level < sampleselect_searchtree_height; ++level) { @@ -297,7 +297,7 @@ void filter_bucket(const ValueType* __restrict__ input, IndexType size, auto found = bucket == oracles[i]; auto ofs = atomic_add(&*counter, IndexType{found}); if (found) { - output[ofs] = std::abs(input[i]); + output[ofs] = gko::abs(input[i]); } } } diff --git a/dpcpp/preconditioner/isai_kernels.dp.cpp b/dpcpp/preconditioner/isai_kernels.dp.cpp index 4082035ff9f..bb55ab6854f 100644 --- a/dpcpp/preconditioner/isai_kernels.dp.cpp +++ b/dpcpp/preconditioner/isai_kernels.dp.cpp @@ -365,7 +365,7 @@ void generate_general_inverse( if (spd) { auto diag = subwarp.shfl(sol, num_elems - 1); - sol /= std::sqrt(diag); + sol /= gko::sqrt(diag); } return sol; @@ -531,7 +531,7 @@ void scale_excess_solution(const IndexType* __restrict__ 
excess_block_ptrs, return; } const auto diag = excess_solution[block_end - 1]; - const ValueType scal = one() / std::sqrt(diag); + const ValueType scal = one() / gko::sqrt(diag); for (size_type i = block_begin + local_id; i < block_end; i += subwarp_size) { diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index 8747dcb60a7..edceeeb01ab 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -285,9 +285,9 @@ void multinorminf_without_stop_kernel( i += default_dot_dim) { const auto next_krylov_idx = i * stride_next_krylov + col_idx; local_max = - (local_max >= std::abs(next_krylov_basis[next_krylov_idx])) + (local_max >= gko::abs(next_krylov_basis[next_krylov_idx])) ? local_max - : std::abs(next_krylov_basis[next_krylov_idx]); + : gko::abs(next_krylov_basis[next_krylov_idx]); } } reduction_helper[tidx * (default_dot_dim + 1) + tidy] = local_max; @@ -373,7 +373,7 @@ void multinorm2_inf_kernel( local_res += squared_norm(num); if (compute_inf) { local_max = - ((local_max >= std::abs(num)) ? local_max : std::abs(num)); + ((local_max >= gko::abs(num)) ? local_max : gko::abs(num)); } } } @@ -729,8 +729,8 @@ void check_arnoldi_norms( gko::cb_gmres::detail::has_3d_scaled_accessor::value; if (col_idx < num_rhs && !stop_status[col_idx].has_stopped()) { - const auto num0 = (std::sqrt(eta_squared * arnoldi_norm[col_idx])); - const auto num11 = std::sqrt(arnoldi_norm[col_idx + stride_norm]); + const auto num0 = (gko::sqrt(eta_squared * arnoldi_norm[col_idx])); + const auto num11 = gko::sqrt(arnoldi_norm[col_idx + stride_norm]); const auto num2 = has_scalar ? 
(arnoldi_norm[col_idx + 2 * stride_norm]) : remove_complex{}; if (num11 < num0) { diff --git a/dpcpp/solver/common_gmres_kernels.dp.inc b/dpcpp/solver/common_gmres_kernels.dp.inc index 0b5de8188f2..f8a54fe5116 100644 --- a/dpcpp/solver/common_gmres_kernels.dp.inc +++ b/dpcpp/solver/common_gmres_kernels.dp.inc @@ -72,12 +72,12 @@ void calculate_sin_and_cos_kernel(size_type col_idx, size_type num_cols, register_cos = zero(); register_sin = one(); } else { - const auto scale = std::abs(this_hess) + std::abs(next_hess); + const auto scale = gko::abs(this_hess) + gko::abs(next_hess); const auto hypotenuse = scale * - std::sqrt( - std::abs(this_hess / scale) * std::abs(this_hess / scale) + - std::abs(next_hess / scale) * std::abs(next_hess / scale)); + gko::sqrt( + gko::abs(this_hess / scale) * gko::abs(this_hess / scale) + + gko::abs(next_hess / scale) * gko::abs(next_hess / scale)); register_cos = conj(this_hess) / hypotenuse; register_sin = conj(next_hess) / hypotenuse; } @@ -102,7 +102,7 @@ void calculate_residual_norm_kernel(size_type col_idx, size_type num_cols, const auto next_rnc = -conj(register_sin) * this_rnc; residual_norm_collection[iter * stride_residual_norm_collection + col_idx] = register_cos * this_rnc; - residual_norm[col_idx] = std::abs(next_rnc); + residual_norm[col_idx] = gko::abs(next_rnc); residual_norm_collection[(iter + 1) * stride_residual_norm_collection + col_idx] = next_rnc; } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index a5531f2dc40..aff389ccd50 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -127,7 +127,7 @@ void orthonormalize_subspace_vectors_kernel( const remove_complex& b) { return a + b; }); item_ct1.barrier(sycl::access::fence_space::local_space); - norm = std::sqrt(reduction_helper_real[0]); + norm = gko::sqrt(reduction_helper_real[0]); for (size_type j = tidx; j < num_cols; j += block_size) { values[row * stride + j] /= norm; } @@ -542,8 +542,8 @@ void 
compute_omega_kernel( if (!stop_status[global_id].has_stopped()) { auto thr = omega[global_id]; omega[global_id] /= tht[global_id]; - auto absrho = std::abs( - thr / (std::sqrt(real(tht[global_id])) * residual_norm[global_id])); + auto absrho = gko::abs( + thr / (gko::sqrt(real(tht[global_id])) * residual_norm[global_id])); if (absrho < kappa) { omega[global_id] *= kappa / absrho; diff --git a/dpcpp/stop/residual_norm_kernels.dp.cpp b/dpcpp/stop/residual_norm_kernels.dp.cpp index ddb617a1a84..e67b599dbb5 100644 --- a/dpcpp/stop/residual_norm_kernels.dp.cpp +++ b/dpcpp/stop/residual_norm_kernels.dp.cpp @@ -108,7 +108,7 @@ void implicit_residual_norm( cgh.parallel_for( sycl::range<1>{tau->get_size()[1]}, [=](sycl::id<1> idx_id) { const auto tidx = idx_id[0]; - if (std::sqrt(std::abs(tau_val[tidx])) <= + if (gko::sqrt(gko::abs(tau_val[tidx])) <= rel_residual_goal * orig_tau_val[tidx]) { stop_status_val[tidx].converge(stoppingId, setFinalized); device_storage_val[1] = true; diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 3e9925b8a34..5145167d78f 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -30,28 +30,40 @@ class complex; } -namespace std { -inline gko::half abs(gko::half a) { return gko::half((a > 0) ? a : -a); } +// when using gko, abs will be ambiguous. delete that, get_relative_error can +// not find proper half +namespace gko { +using std::abs; +using std::sqrt; -inline gko::half abs(std::complex a) +GKO_ATTRIBUTES GKO_INLINE gko::half abs(gko::half a) +{ + return gko::half((a > 0) ? 
a : -a); +} + +GKO_ATTRIBUTES GKO_INLINE gko::half abs(std::complex a) { // Using float abs not sqrt on norm to avoid overflow return gko::half(abs(std::complex(a))); } -inline gko::half sqrt(gko::half a) { return gko::half(sqrt(float(a))); } +GKO_ATTRIBUTES GKO_INLINE gko::half sqrt(gko::half a) +{ + return gko::half(std::sqrt(float(a))); +} -inline std::complex sqrt(std::complex a) +GKO_ATTRIBUTES GKO_INLINE std::complex sqrt( + std::complex a) { return std::complex(sqrt(std::complex( static_cast(a.real()), static_cast(a.imag())))); } -} // namespace std +} // namespace gko namespace gko { diff --git a/reference/test/solver/gcr_kernels.cpp b/reference/test/solver/gcr_kernels.cpp index d3f29fffee9..6077af49af6 100644 --- a/reference/test/solver/gcr_kernels.cpp +++ b/reference/test/solver/gcr_kernels.cpp @@ -458,7 +458,7 @@ TYPED_TEST(Gcr, SolveWithImplicitResNormCritIsDisabled) template gko::remove_complex infNorm(gko::matrix::Dense* mat, size_t col = 0) { - using std::abs; + using gko::abs; using no_cpx_t = gko::remove_complex; no_cpx_t norm = 0.0; for (size_t i = 0; i < mat->get_size()[0]; ++i) { diff --git a/test/base/device_matrix_data_kernels.cpp b/test/base/device_matrix_data_kernels.cpp index 9f2abd2730e..5f1633aa268 100644 --- a/test/base/device_matrix_data_kernels.cpp +++ b/test/base/device_matrix_data_kernels.cpp @@ -316,7 +316,7 @@ TYPED_TEST(DeviceMatrixData, SumsDuplicates) arrays.values.set_executor(this->exec->get_master()); for (int i = 0; i < arrays.values.get_size(); i++) { max_error = std::max( - max_error, std::abs(arrays.values.get_const_data()[i] - + max_error, gko::abs(arrays.values.get_const_data()[i] - ref_arrays.values.get_const_data()[i])); } // when Hip with GNU < 7, it will give a little difference. 
diff --git a/test/solver/cb_gmres_kernels.cpp b/test/solver/cb_gmres_kernels.cpp index 022899d21e6..98eb295091b 100644 --- a/test/solver/cb_gmres_kernels.cpp +++ b/test/solver/cb_gmres_kernels.cpp @@ -146,7 +146,7 @@ class CbGmres : public CommonTestFixture { auto& krylov_bases = range_helper.get_bases(); d_to_host = d_range_helper.get_bases(); const auto tolerance = r::value; - using std::abs; + using gko::abs; for (gko::size_type i = 0; i < krylov_bases.get_size(); ++i) { const auto ref_value = krylov_bases.get_const_data()[i]; const auto dev_value = d_to_host.get_const_data()[i]; diff --git a/test/test_install/test_install.cpp b/test/test_install/test_install.cpp index 2f4cdeda6e4..3d990d3e5eb 100644 --- a/test/test_install/test_install.cpp +++ b/test/test_install/test_install.cpp @@ -25,7 +25,7 @@ void assert_similar_matrices(gko::ptr_param> m1, assert(m1->get_size()[1] == m2->get_size()[1]); for (gko::size_type i = 0; i < m1->get_size()[0]; ++i) { for (gko::size_type j = 0; j < m2->get_size()[1]; ++j) { - assert(std::abs(m1->at(i, j) - m2->at(i, j)) < prec); + assert(gko::abs(m1->at(i, j) - m2->at(i, j)) < prec); } } } From 64406f30e3a8bb49bef9af1454f5c1451a143b13 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 2 Oct 2024 16:19:11 +0200 Subject: [PATCH 50/62] fix after rebase --- core/config/type_descriptor_helper.hpp | 3 +++ core/log/solver_progress.cpp | 8 ++++++++ cuda/solver/common_trs_kernels.cuh | 4 ++++ 3 files changed, 15 insertions(+) diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp index 0edc4376f1a..63a953e3a1e 100644 --- a/core/config/type_descriptor_helper.hpp +++ b/core/config/type_descriptor_helper.hpp @@ -8,6 +8,7 @@ #include +#include #include #include #include @@ -38,8 +39,10 @@ struct type_string {}; TYPE_STRING_OVERLOAD(void, "void"); TYPE_STRING_OVERLOAD(double, "float64"); TYPE_STRING_OVERLOAD(float, "float32"); +TYPE_STRING_OVERLOAD(half, "float16"); TYPE_STRING_OVERLOAD(std::complex, "complex"); TYPE_STRING_OVERLOAD(std::complex, "complex"); +TYPE_STRING_OVERLOAD(std::complex, "complex"); TYPE_STRING_OVERLOAD(int32, "int32"); TYPE_STRING_OVERLOAD(int64, "int64"); diff --git a/core/log/solver_progress.cpp b/core/log/solver_progress.cpp index effa0279bba..4d1566e159f 100644 --- a/core/log/solver_progress.cpp +++ b/core/log/solver_progress.cpp @@ -247,6 +247,14 @@ class SolverProgressStore : public SolverProgress { run, gko::matrix::Dense, gko::matrix::Dense>, gko::matrix::Dense>, +#if GINKGO_ENABLE_HALF + gko::matrix::Dense, + gko::matrix::Dense>, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int32>, + gko::WritableToMatrixData, + gko::WritableToMatrixData, int64>, +#endif // fallback for other matrix types gko::WritableToMatrixData, gko::WritableToMatrixData, diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 362d22a653c..3dea9bd457c 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -359,6 +359,10 @@ struct float_to_unsigned_impl { using type = uint32; }; +template <> +struct float_to_unsigned_impl<__half> { + using type = uint16; +}; /** * Checks if a floating point number representation matches 
the representation From baa95f7eb9746ef60b19001d93492a54bfe85181 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Thu, 3 Oct 2024 15:41:58 +0200 Subject: [PATCH 51/62] hip does not support 16bit shuffle --- .../factorization/par_ilut_select_common.cpp | 14 +++++++++++--- .../factorization/par_ilut_select_kernels.cpp | 17 +++++++++++++---- test/factorization/par_ilut_kernels.cpp | 19 +++++++++++++++++++ 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/common/cuda_hip/factorization/par_ilut_select_common.cpp b/common/cuda_hip/factorization/par_ilut_select_common.cpp index fccb89fcf5a..6751615ff69 100644 --- a/common/cuda_hip/factorization/par_ilut_select_common.cpp +++ b/common/cuda_hip/factorization/par_ilut_select_common.cpp @@ -43,9 +43,17 @@ void sampleselect_count(std::shared_ptr exec, auto num_threads_total = ceildiv(size, items_per_thread); auto num_blocks = static_cast(ceildiv(num_threads_total, default_block_size)); - // pick sample, build searchtree - kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>( - as_device_type(values), size, as_device_type(tree)); +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(values); + } else +#endif + { + // pick sample, build searchtree + kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>( + as_device_type(values), size, as_device_type(tree)); + } // determine bucket sizes if (num_blocks > 0) { kernel::count_buckets<< exec, // base case auto out_ptr = reinterpret_cast(tmp1.get_data()); - kernel::basecase_select<<<1, kernel::basecase_block_size, 0, - exec->get_stream()>>>( - as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); - threshold = exec->copy_val_to_host(out_ptr); + +#ifdef GKO_COMPILING_HIP + if constexpr (std::is_same, half>::value) { + // HIP does not support 16bit atomic operation + GKO_NOT_SUPPORTED(m); + } else +#endif + { + 
kernel::basecase_select<<<1, kernel::basecase_block_size, 0, + exec->get_stream()>>>( + as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr)); + threshold = exec->copy_val_to_host(out_ptr); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/test/factorization/par_ilut_kernels.cpp b/test/factorization/par_ilut_kernels.cpp index 0aaac36e4b3..b95d32af0f3 100644 --- a/test/factorization/par_ilut_kernels.cpp +++ b/test/factorization/par_ilut_kernels.cpp @@ -239,6 +239,9 @@ TYPED_TEST_SUITE(ParIlut, gko::test::ValueIndexTypes, TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 3); } @@ -246,12 +249,18 @@ TYPED_TEST(ParIlut, KernelThresholdSelectIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdSelectMinIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, 0); } TYPED_TEST(ParIlut, KernelThresholdSelectMaxIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_select(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() - 1); } @@ -318,6 +327,7 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) using Coo = typename TestFixture::Coo; using value_type = typename TestFixture::value_type; using index_type = typename TestFixture::index_type; + SKIP_IF_HALF(value_type); this->test_filter(this->mtx_l, this->dmtx_l, 0.5, true); auto res = Csr::create(this->ref, this->mtx_size); auto dres = Csr::create(this->exec, this->mtx_size); @@ -343,6 +353,9 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxNullptrCooIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + 
SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() / 2); } @@ -350,12 +363,18 @@ TYPED_TEST(ParIlut, KernelThresholdFilterApproxLowerIsEquivalentToRef) TYPED_TEST(ParIlut, KernelThresholdFilterApproxNoneLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, 0); } TYPED_TEST(ParIlut, KernelThresholdFilterApproxAllLowerIsEquivalentToRef) { + using value_type = typename TestFixture::value_type; + SKIP_IF_HALF(value_type); + this->test_filter_approx(this->mtx_l, this->dmtx_l, this->mtx_l->get_num_stored_elements() - 1); } From 4bb8093c33d769b4adfcf0e0a9968f96f3b0ba3a Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Mon, 7 Oct 2024 22:06:43 +0200 Subject: [PATCH 52/62] merge two #if block --- core/solver/multigrid.cpp | 56 ++++++++++----------------------------- 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/core/solver/multigrid.cpp b/core/solver/multigrid.cpp index 2ae444a3d82..f67c5413ff7 100644 --- a/core/solver/multigrid.cpp +++ b/core/solver/multigrid.cpp @@ -317,13 +317,9 @@ void MultigridState::generate(const LinOp* system_matrix_in, auto next_nrows = mg_level_list.at(i)->get_coarse_op()->get_size()[0]; auto mg_level = mg_level_list.at(i); - run, + half, std::complex, #endif std::complex, std::complex>( mg_level, @@ -462,13 +458,9 @@ void MultigridState::run_mg_cycle(multigrid::cycle cycle, size_type level, return; } auto mg_level = multigrid->get_mg_level_list().at(level); - run, + half, std::complex, #endif std::complex, std::complex>( mg_level, [&, this](auto mg_level) { @@ -718,13 +710,9 @@ void Multigrid::generate() break; } - run, + half, std::complex, #endif std::complex, std::complex>( mg_level, @@ -763,13 +751,9 @@ void Multigrid::generate() auto last_mg_level = mg_level_list_.back(); // generate coarsest solver - run, + half, std::complex, #endif 
std::complex, std::complex>( last_mg_level, @@ -887,13 +871,9 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* b, LinOp* x, b, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, + half, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, b, x); @@ -933,13 +913,9 @@ void Multigrid::apply_with_initial_guess_impl(const LinOp* alpha, alpha, b, beta, x); }; auto first_mg_level = this->get_mg_level_list().front(); - run, + half, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, alpha, b, beta, x); @@ -1005,13 +981,9 @@ void Multigrid::apply_dense_impl(const VectorType* b, VectorType* x, auto first_mg_level = this->get_mg_level_list().front(); - run, + half, std::complex, #endif std::complex, std::complex>(first_mg_level, lambda, b, x); From c539398c7e326aef027c6b0a4238a2e2d0d2d989 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 8 Oct 2024 23:38:07 +0200 Subject: [PATCH 53/62] do not use attributes in sqrt and abs --- examples/custom-matrix-format/CMakeLists.txt | 3 +- include/ginkgo/core/base/half.hpp | 34 ++++---------------- include/ginkgo/core/base/math.hpp | 16 ++++----- 3 files changed, 15 insertions(+), 38 deletions(-) diff --git a/examples/custom-matrix-format/CMakeLists.txt b/examples/custom-matrix-format/CMakeLists.txt index 493437f9789..4093f40e51e 100644 --- a/examples/custom-matrix-format/CMakeLists.txt +++ b/examples/custom-matrix-format/CMakeLists.txt @@ -14,6 +14,7 @@ endif() add_executable(custom-matrix-format custom-matrix-format.cpp stencil_kernel.cu) target_link_libraries(custom-matrix-format Ginkgo::ginkgo OpenMP::OpenMP_CXX) - +target_compile_options(custom-matrix-format + PRIVATE $<$:--expt-relaxed-constexpr>) # workaround for clang-cuda/g++ interaction set_target_properties(custom-matrix-format PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index c4dfaed3cdd..27d991cfd7e 100644 --- 
a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -78,7 +78,6 @@ struct basic_float_traits { static constexpr bool rounds_to_nearest = true; }; -// #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) template <> struct basic_float_traits<__half> { using type = __half; @@ -87,7 +86,6 @@ struct basic_float_traits<__half> { static constexpr int exponent_bits = 5; static constexpr bool rounds_to_nearest = true; }; -// #endif template <> struct basic_float_traits { @@ -424,10 +422,6 @@ class half { // Rounding to even const auto result = conv::shift_sign(data_) | exp | conv::shift_significand(data_); - // return result + ((result & 1) && - // ((data_ >> (f32_traits::significand_bits - - // f16_traits::significand_bits - 1)) & - // 1)); const auto tail = data_ & static_cast( (1 << conv::significand_offset) - 1); @@ -504,19 +498,12 @@ class complex { value_type imag() const noexcept { return imag_; } - operator std::complex() const noexcept { return std::complex(static_cast(real_), static_cast(imag_)); } - // operator std::complex() const noexcept - // { - // return std::complex(static_cast(real_), - // static_cast(imag_)); - // } - template complex& operator=(const V& val) { @@ -578,18 +565,11 @@ class complex { result_f *= val_f; real_ = result_f.real(); imag_ = result_f.imag(); - // auto tmp = real_; - // real_ = real_ * val.real() - imag_ * val.imag(); - // imag_ = tmp * val.imag() + imag_ * val.real(); return *this; } template complex& operator/=(const complex& val) { - // auto real = val.real(); - // auto imag = val.imag(); - // (*this) *= complex{val.real(), -val.imag()}; - // (*this) /= (real * real + imag * imag); auto val_f = static_cast>(val); auto result_f = static_cast>(*this); result_f /= val_f; @@ -601,13 +581,13 @@ class complex { // It's for MacOS. 
// TODO: check whether mac compiler always use complex version even when real // half -#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend complex operator _op( \ - const complex lhf, const complex rhf) \ - { \ - auto a = lhf; \ - a _opeq rhf; \ - return a; \ +#define COMPLEX_HALF_OPERATOR(_op, _opeq) \ + friend complex operator _op(const complex lhf, \ + const complex rhf) \ + { \ + auto a = lhf; \ + a _opeq rhf; \ + return a; \ } COMPLEX_HALF_OPERATOR(+, +=) diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 5145167d78f..7d8edd0564e 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -32,31 +32,27 @@ class complex; } -// when using gko, abs will be ambiguous. delete that, get_relative_error can -// not find proper half namespace gko { + + using std::abs; using std::sqrt; -GKO_ATTRIBUTES GKO_INLINE gko::half abs(gko::half a) -{ - return gko::half((a > 0) ? a : -a); -} +GKO_INLINE gko::half abs(gko::half a) { return gko::half((a > 0) ? a : -a); } -GKO_ATTRIBUTES GKO_INLINE gko::half abs(std::complex a) +GKO_INLINE gko::half abs(std::complex a) { // Using float abs not sqrt on norm to avoid overflow return gko::half(abs(std::complex(a))); } -GKO_ATTRIBUTES GKO_INLINE gko::half sqrt(gko::half a) +GKO_INLINE gko::half sqrt(gko::half a) { return gko::half(std::sqrt(float(a))); } -GKO_ATTRIBUTES GKO_INLINE std::complex sqrt( - std::complex a) +GKO_INLINE std::complex sqrt(std::complex a) { return std::complex(sqrt(std::complex( static_cast(a.real()), static_cast(a.imag())))); From d0e2446e145eedde09f5c06c5dec7fa951594947 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Tue, 8 Oct 2024 23:47:28 +0200 Subject: [PATCH 54/62] make half constexpr --- .../jacobi_generate_kernels.instantiate.cpp | 8 +-- include/ginkgo/core/base/half.hpp | 62 +++++++++---------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp index ca0c480c08e..fdb0ad11e9e 100644 --- a/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_generate_kernels.instantiate.cpp @@ -160,7 +160,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate( accuracy, block_cond, [&subwarp, &block_size, &row, &block_data, &storage_scheme, &block_id] { - using target = reduce_precision; + using target = device_type>; return validate_precision_reduction_feasibility< max_block_size, target>( subwarp, block_size, row, @@ -170,8 +170,8 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate( }, [&subwarp, &block_size, &row, &block_data, &storage_scheme, &block_id] { - using target = - reduce_precision>; + using target = device_type< + reduce_precision>>; return validate_precision_reduction_feasibility< max_block_size, target>( subwarp, block_size, row, @@ -195,7 +195,7 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_generate( ValueType, prec, copy_matrix( subwarp, block_size, row, 1, perm, trans_perm, - reinterpret_cast( + reinterpret_cast*>( block_data + storage_scheme.get_group_offset(block_id)) + storage_scheme.get_block_offset(block_id), storage_scheme.get_stride())); diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index 27d991cfd7e..ff453a62b81 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -299,47 +299,47 @@ class half { // TODO: NVHPC (host side) may not use zero initialization for the data // member by default constructor 
in some cases. Not sure whether it is // caused by something else in jacobi or isai. - GKO_ATTRIBUTES half() noexcept : data_(0){}; + constexpr half() noexcept : data_(0){}; template ::value>> - GKO_ATTRIBUTES half(const T val) + constexpr half(const T val) { this->float2half(static_cast(val)); } - GKO_ATTRIBUTES half(const half& val) = default; + constexpr half(const half& val) = default; template - GKO_ATTRIBUTES half& operator=(const V val) + constexpr half& operator=(const V val) { this->float2half(static_cast(val)); return *this; } - GKO_ATTRIBUTES operator float() const noexcept + constexpr operator float() const noexcept { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - return __half2float(reinterpret_cast(data_)); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // return __half2float(reinterpret_cast(data_)); + // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) const auto bits = half2float(data_); return reinterpret_cast(bits); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } // can not use half operator _op(const half) for half + half // operation will cast it to float and then do float operation such that it // becomes float in the end. 
-#define HALF_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES friend half operator _op(const half lhf, const half rhf) \ - { \ - return static_cast(static_cast(lhf) \ - _op static_cast(rhf)); \ - } \ - GKO_ATTRIBUTES half& operator _opeq(const half& hf) \ - { \ - auto result = *this _op hf; \ - this->float2half(result); \ - return *this; \ +#define HALF_OPERATOR(_op, _opeq) \ + friend constexpr half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + constexpr half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ } HALF_OPERATOR(+, +=) HALF_OPERATOR(-, -=) @@ -351,7 +351,7 @@ class half { // If it is integer, using half as type #define HALF_FRIEND_OPERATOR(_op, _opeq) \ template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ + constexpr friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ std::conditional_t::value, T, half>> \ operator _op(const half hf, const T val) \ @@ -363,7 +363,7 @@ class half { return result; \ } \ template \ - GKO_ATTRIBUTES friend std::enable_if_t< \ + constexpr friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ std::conditional_t::value, T, half>> \ operator _op(const T val, const half hf) \ @@ -381,7 +381,7 @@ class half { HALF_FRIEND_OPERATOR(/, /=) // the negative - GKO_ATTRIBUTES half operator-() const + constexpr half operator-() const { auto val = 0.0f - *this; return half(val); @@ -393,17 +393,17 @@ class half { // TODO: do we really need this one? // Without it, everything can be constexpr, which might make stuff easier. 
- GKO_ATTRIBUTES void float2half(float val) noexcept + constexpr void float2half(float val) noexcept { -#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - const auto tmp = __float2half_rn(val); - data_ = reinterpret_cast(tmp); -#else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // const auto tmp = __float2half_rn(val); + // data_ = reinterpret_cast(tmp); + // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) data_ = float2half(reinterpret_cast(val)); -#endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } - static GKO_ATTRIBUTES uint16 float2half(uint32 data_) noexcept + static constexpr uint16 float2half(uint32 data_) noexcept { using conv = detail::precision_converter; if (f32_traits::is_inf(data_)) { @@ -434,7 +434,7 @@ class half { } } - static GKO_ATTRIBUTES uint32 half2float(uint16 data_) noexcept + static constexpr uint32 half2float(uint16 data_) noexcept { using conv = detail::precision_converter; if (f16_traits::is_inf(data_)) { From 0d777df72c036ea5b3922ae5f5e3d1a68c80caef Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Thu, 17 Oct 2024 01:45:09 +0200 Subject: [PATCH 55/62] isolate half out of device completely --- .../cuda_hip/distributed/matrix_kernels.cpp | 14 +-- ...obi_advanced_apply_kernels.instantiate.cpp | 2 +- .../preconditioner/jacobi_kernels.cpp | 4 +- ...acobi_simple_apply_kernels.instantiate.cpp | 2 +- include/ginkgo/core/base/half.hpp | 85 ++++++++++++++----- 5 files changed, 73 insertions(+), 34 deletions(-) diff --git a/common/cuda_hip/distributed/matrix_kernels.cpp b/common/cuda_hip/distributed/matrix_kernels.cpp index 88988febbb0..551eafe6c8c 100644 --- a/common/cuda_hip/distributed/matrix_kernels.cpp +++ b/common/cuda_hip/distributed/matrix_kernels.cpp @@ -137,11 +137,11 @@ void separate_local_nonlocal( col_range_starting_indices[range_id]; }; - using input_type = input_type; + using input_type = input_type, GlobalIndexType>; auto input_it = thrust::make_zip_iterator(thrust::make_tuple( input.get_const_row_idxs(), input.get_const_col_idxs(), - input.get_const_values(), row_range_ids.get_const_data(), - col_range_ids.get_const_data())); + as_device_type(input.get_const_values()), + row_range_ids.get_const_data(), col_range_ids.get_const_data())); // copy and transform local entries into arrays local_row_idxs.resize_and_reset(num_local_elements); @@ -157,9 +157,9 @@ void separate_local_nonlocal( thrust::copy_if( policy, local_it, local_it + input.get_num_stored_elements(), range_ids_it, - thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), - local_col_idxs.get_data(), - local_values.get_data())), + thrust::make_zip_iterator(thrust::make_tuple( + local_row_idxs.get_data(), local_col_idxs.get_data(), + as_device_type(local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; @@ -185,7 +185,7 @@ void separate_local_nonlocal( range_ids_it, thrust::make_zip_iterator(thrust::make_tuple( non_local_row_idxs.get_data(), 
non_local_col_idxs.get_data(), - non_local_values.get_data())), + as_device_type(non_local_values.get_data()))), [local_part, row_part_ids, col_part_ids] __host__ __device__( const thrust::tuple& tuple) { auto row_part = row_part_ids[thrust::get<0>(tuple)]; diff --git a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp index 0ecc3d0d44b..131c530d2ee 100644 --- a/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_advanced_apply_kernels.instantiate.cpp @@ -90,7 +90,7 @@ __launch_bounds__(warps_per_block* config::warp_size) advanced_adaptive_apply( ValueType, block_precisions[block_id], multiply_vec( subwarp, block_size, v, - reinterpret_cast( + reinterpret_cast*>( blocks + storage_scheme.get_group_offset(block_id)) + storage_scheme.get_block_offset(block_id) + subwarp.thread_rank(), diff --git a/common/cuda_hip/preconditioner/jacobi_kernels.cpp b/common/cuda_hip/preconditioner/jacobi_kernels.cpp index f3b099e7c18..6f2d4ae3974 100644 --- a/common/cuda_hip/preconditioner/jacobi_kernels.cpp +++ b/common/cuda_hip/preconditioner/jacobi_kernels.cpp @@ -206,11 +206,11 @@ __launch_bounds__(warps_per_block* config::warp_size) adaptive_transpose_jacobi( GKO_PRECONDITIONER_JACOBI_RESOLVE_PRECISION( ValueType, block_precisions[block_id], auto local_block = - reinterpret_cast( + reinterpret_cast*>( blocks + storage_scheme.get_group_offset(block_id)) + storage_scheme.get_block_offset(block_id); auto local_out_block = - reinterpret_cast( + reinterpret_cast*>( out_blocks + storage_scheme.get_group_offset(block_id)) + storage_scheme.get_block_offset(block_id); for (int i = rank; i < block_size * block_size; i += subwarp_size) { diff --git a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp index 
734385970e3..faf869718a6 100644 --- a/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp +++ b/common/cuda_hip/preconditioner/jacobi_simple_apply_kernels.instantiate.cpp @@ -84,7 +84,7 @@ __global__ void __launch_bounds__(warps_per_block* config::warp_size) ValueType, block_precisions[block_id], multiply_vec( subwarp, block_size, v, - reinterpret_cast( + reinterpret_cast*>( blocks + storage_scheme.get_group_offset(block_id)) + storage_scheme.get_block_offset(block_id) + subwarp.thread_rank(), diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index ff453a62b81..acb6db7141f 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -7,6 +7,7 @@ #include +#include #include #include @@ -285,6 +286,29 @@ struct precision_converter { } }; +template +constexpr void copy_by_char_impl(char* dst, const char* src) +{ + *dst = *src; + copy_by_char_impl(dst + 1, src + 1); +} + +template <> +constexpr void copy_by_char_impl<1>(char* dst, const char* src) +{ + *dst = *src; +} + +template +constexpr void copy_by_char(DstType& dst, const SrcType& src) +{ + static_assert(sizeof(DstType) == sizeof(SrcType), + "Type size must be the same."); + static_assert(sizeof(DstType) % sizeof(char) == 0, + "Type size must be divisible by char"); + copy_by_char_impl((char*)(&dst), (const char*)(&src)); +} + } // namespace detail @@ -302,44 +326,46 @@ class half { constexpr half() noexcept : data_(0){}; template ::value>> - constexpr half(const T val) + half(const T val) : data_(0) { this->float2half(static_cast(val)); } - constexpr half(const half& val) = default; + half(const half& val) { data_ = val.data_; }; template - constexpr half& operator=(const V val) + half& operator=(const V val) { this->float2half(static_cast(val)); return *this; } - constexpr operator float() const noexcept + operator float() const noexcept { // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // return 
__half2float(reinterpret_cast(data_)); // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) const auto bits = half2float(data_); - return reinterpret_cast(bits); + float ans(0); + detail::copy_by_char(ans, bits); + return ans; // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } // can not use half operator _op(const half) for half + half // operation will cast it to float and then do float operation such that it // becomes float in the end. -#define HALF_OPERATOR(_op, _opeq) \ - friend constexpr half operator _op(const half lhf, const half rhf) \ - { \ - return static_cast(static_cast(lhf) \ - _op static_cast(rhf)); \ - } \ - constexpr half& operator _opeq(const half& hf) \ - { \ - auto result = *this _op hf; \ - this->float2half(result); \ - return *this; \ +#define HALF_OPERATOR(_op, _opeq) \ + friend half operator _op(const half lhf, const half rhf) \ + { \ + return static_cast(static_cast(lhf) \ + _op static_cast(rhf)); \ + } \ + half& operator _opeq(const half& hf) \ + { \ + auto result = *this _op hf; \ + this->float2half(result); \ + return *this; \ } HALF_OPERATOR(+, +=) HALF_OPERATOR(-, -=) @@ -351,7 +377,7 @@ class half { // If it is integer, using half as type #define HALF_FRIEND_OPERATOR(_op, _opeq) \ template \ - constexpr friend std::enable_if_t< \ + friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ std::conditional_t::value, T, half>> \ operator _op(const half hf, const T val) \ @@ -363,7 +389,7 @@ class half { return result; \ } \ template \ - constexpr friend std::enable_if_t< \ + friend std::enable_if_t< \ !std::is_same::value && std::is_scalar::value, \ std::conditional_t::value, T, half>> \ operator _op(const T val, const half hf) \ @@ -381,7 +407,7 @@ class half { HALF_FRIEND_OPERATOR(/, /=) // the negative - constexpr half operator-() const + half operator-() const { auto val = 0.0f - *this; return half(val); @@ -392,14 +418,18 @@ class half { using f32_traits = 
detail::float_traits; // TODO: do we really need this one? - // Without it, everything can be constexpr, which might make stuff easier. - constexpr void float2half(float val) noexcept + // Without it, everything can be GKO_INLINE GKO_ATTRIBUTES, which might make + // stuff easier. + void float2half(float val) noexcept { // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // const auto tmp = __float2half_rn(val); // data_ = reinterpret_cast(tmp); // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - data_ = float2half(reinterpret_cast(val)); + uint32 bit_val(0); + detail::copy_by_char(bit_val, val); + // std::memcpy(&bit_val, &val, sizeof(float)); + data_ = float2half(bit_val); // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } @@ -617,20 +647,28 @@ struct numeric_limits { // a constexpr constructor. static constexpr float epsilon() { + // 0x1400 + // 0b0 00101 0000 000000 return gko::detail::float_traits::eps; } static constexpr float infinity() { + // 0b0 11111 0000000000 return numeric_limits::infinity(); } - static constexpr float min() { return 1.0f / (1ll << 14); } + static constexpr float min() + { + // 0b0 00001 0000000000 + return 1.0f / (1ll << 14); + } // The maximal exponent is 15, and the maximal significant is // 1 + (2^-10 - 1) / 2^-10 static constexpr float max() { + // 0b0 11110 1111111111 return (1ll << 15) * (1.0f + static_cast((1ll << 10) - 1) / (1ll << 10)); } @@ -639,6 +677,7 @@ struct numeric_limits { static constexpr float quiet_NaN() { + // 0x7FFF return numeric_limits::quiet_NaN(); } }; From 56e2af83b44d884771cc84a4a93c5fcc6274c302 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Fri, 18 Oct 2024 09:52:30 +0200 Subject: [PATCH 56/62] bits constexpr construct half and make numeric_limit in half --- CMakeLists.txt | 5 + common/cuda_hip/base/math.hpp | 46 +++++++- common/cuda_hip/components/merging.hpp | 4 +- .../factorization/par_ict_kernels.cpp | 2 +- .../factorization/par_ilut_select_kernels.hpp | 2 +- .../factorization/par_ilut_spgeam_kernels.cpp | 2 +- .../cuda_hip/matrix/csr_kernels.template.cpp | 2 +- common/cuda_hip/reorder/rcm_kernels.cpp | 3 +- common/unified/base/kernel_launch.hpp | 2 + .../accessor/reduced_row_major_ginkgo.cpp | 14 ++- cuda/base/config.hpp | 3 +- cuda/test/base/math.cu | 4 +- hip/base/config.hip.hpp | 2 +- hip/test/base/math.hip.cpp | 4 +- include/ginkgo/core/base/half.hpp | 110 ++++++------------ reference/test/reorder/mc64_kernels.cpp | 5 +- 16 files changed, 106 insertions(+), 104 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6a9ef0f817..339699bed38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,11 @@ option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be tim option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF) option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF) option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON) +# We do not support MSVC. SYCL will come later +if(MSVC OR GINKGO_BUILD_SYCL) + message(STATUS "HALF is not supported in MSVC, and later support in SYCL") + set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE) +endif() option(GINKGO_SKIP_DEPENDENCY_UPDATE "Do not update dependencies each time the project is rebuilt" ON) option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." 
OFF) diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index cf746206f46..75fd3976c4f 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -11,6 +11,21 @@ #include +#ifdef GKO_COMPILING_CUDA + + +#include + + +#elif defined(GKO_COMPILING_HIP) + + +#include + + +#endif + + namespace gko { @@ -18,16 +33,35 @@ namespace gko { // __device__ function (even though it is constexpr) template struct device_numeric_limits { - static constexpr auto inf = std::numeric_limits::infinity(); - static constexpr auto max = std::numeric_limits::max(); - static constexpr auto min = std::numeric_limits::min(); + static constexpr auto inf() { return std::numeric_limits::infinity(); } + static constexpr auto max() { return std::numeric_limits::max(); } + static constexpr auto min() { return std::numeric_limits::min(); } }; template <> struct device_numeric_limits<__half> { - static constexpr auto inf = std::numeric_limits::infinity(); - static constexpr auto max = std::numeric_limits::max(); - static constexpr auto min = std::numeric_limits::min(); + // from __half documentation, it accepts unsigned short + // __half does not have constexpr + static GKO_ATTRIBUTES GKO_INLINE auto inf() + { + __half_raw bits; + bits.x = static_cast(0b0111110000000000u); + return __half{bits}; + } + + static GKO_ATTRIBUTES GKO_INLINE auto max() + { + __half_raw bits; + bits.x = static_cast(0b0111101111111111u); + return __half{bits}; + } + + static GKO_ATTRIBUTES GKO_INLINE auto min() + { + __half_raw bits; + bits.x = static_cast(0b0000010000000000u); + return __half{bits}; + } }; diff --git a/common/cuda_hip/components/merging.hpp b/common/cuda_hip/components/merging.hpp index ab070741fbd..b832a97176e 100644 --- a/common/cuda_hip/components/merging.hpp +++ b/common/cuda_hip/components/merging.hpp @@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a, IndexType a_begin{}; IndexType b_begin{}; auto lane = 
static_cast(group.thread_rank()); - auto sentinel = device_numeric_limits::max; + auto sentinel = device_numeric_limits::max(); auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel); auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) { @@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge( auto c_size = a_size + b_size; IndexType a_begin{}; IndexType b_begin{}; - auto sentinel = device_numeric_limits::max; + auto sentinel = device_numeric_limits::max(); auto a_cur = checked_load(a, a_begin, a_size, sentinel); auto b_cur = checked_load(b, b_begin, b_size, sentinel); for (IndexType c_begin{}; c_begin < c_size; c_begin++) { diff --git a/common/cuda_hip/factorization/par_ict_kernels.cpp b/common/cuda_hip/factorization/par_ict_kernels.cpp index be1866b256e..420a1775e58 100644 --- a/common/cuda_hip/factorization/par_ict_kernels.cpp +++ b/common/cuda_hip/factorization/par_ict_kernels.cpp @@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init( IndexType l_new_begin = l_new_row_ptrs[row]; - constexpr auto sentinel = device_numeric_limits::max; + constexpr auto sentinel = device_numeric_limits::max(); // load column indices and values for the first merge step auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel); auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero()); diff --git a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp index 86f58717963..72af3685c39 100644 --- a/common/cuda_hip/factorization/par_ilut_select_kernels.hpp +++ b/common/cuda_hip/factorization/par_ilut_select_kernels.hpp @@ -254,7 +254,7 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select( const ValueType* __restrict__ input, IndexType size, IndexType rank, ValueType* __restrict__ out) { - constexpr auto sentinel = device_numeric_limits::inf; + 
const auto sentinel = device_numeric_limits::inf(); ValueType local[basecase_local_size]; __shared__ ValueType sh_local[basecase_size]; for (int i = 0; i < basecase_local_size; ++i) { diff --git a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp index 6cc77660394..e5ab2b80a75 100644 --- a/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp +++ b/common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp @@ -150,7 +150,7 @@ __global__ __launch_bounds__(default_block_size) void tri_spgeam_init( IndexType l_new_begin = l_new_row_ptrs[row]; IndexType u_new_begin = u_new_row_ptrs[row]; - constexpr auto sentinel = device_numeric_limits::max; + constexpr auto sentinel = device_numeric_limits::max(); // load column indices and values for the first merge step auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel); auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero()); diff --git a/common/cuda_hip/matrix/csr_kernels.template.cpp b/common/cuda_hip/matrix/csr_kernels.template.cpp index cd0cd84ea29..477d68ffae2 100644 --- a/common/cuda_hip/matrix/csr_kernels.template.cpp +++ b/common/cuda_hip/matrix/csr_kernels.template.cpp @@ -1193,7 +1193,7 @@ __global__ __launch_bounds__(default_block_size) void build_csr_lookup( const auto i = base_i + lane; const auto col = i < row_len ? 
local_cols[i] - : device_numeric_limits::max; + : device_numeric_limits::max(); const auto rel_col = static_cast(col - min_col); const auto block = rel_col / bitmap_block_size; const auto col_in_block = rel_col % bitmap_block_size; diff --git a/common/cuda_hip/reorder/rcm_kernels.cpp b/common/cuda_hip/reorder/rcm_kernels.cpp index 72729db30f1..ab982c219f9 100644 --- a/common/cuda_hip/reorder/rcm_kernels.cpp +++ b/common/cuda_hip/reorder/rcm_kernels.cpp @@ -21,6 +21,7 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/components/memory.hpp" #include "common/cuda_hip/components/thread_ids.hpp" @@ -524,7 +525,7 @@ __global__ __launch_bounds__(default_block_size) void ubfs_min_neighbor_kernel( const auto begin = row_ptrs[row]; const auto end = row_ptrs[row + 1]; const auto cur_level = node_levels[row]; - auto min_neighbor = device_numeric_limits::max; + auto min_neighbor = device_numeric_limits::max(); for (auto nz = begin; nz < end; nz++) { const auto col = col_idxs[nz]; const auto neighbor_level = node_levels[col]; diff --git a/common/unified/base/kernel_launch.hpp b/common/unified/base/kernel_launch.hpp index 455d3d67a6d..d4810e1aa95 100644 --- a/common/unified/base/kernel_launch.hpp +++ b/common/unified/base/kernel_launch.hpp @@ -17,6 +17,7 @@ #if defined(GKO_COMPILING_CUDA) #define GKO_KERNEL __device__ +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" @@ -43,6 +44,7 @@ GKO_INLINE GKO_ATTRIBUTES constexpr unpack_member_type unpack_member(T value) #elif defined(GKO_COMPILING_HIP) #define GKO_KERNEL __device__ +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/core/test/accessor/reduced_row_major_ginkgo.cpp b/core/test/accessor/reduced_row_major_ginkgo.cpp index 13ab40cf933..5431c45590a 100644 --- a/core/test/accessor/reduced_row_major_ginkgo.cpp +++ b/core/test/accessor/reduced_row_major_ginkgo.cpp @@ 
-35,12 +35,7 @@ class ReducedStorage3d : public ::testing::Test { using st_type = typename std::tuple_element<1, decltype(ArithmeticStorageType{})>::type; using rcar_type = gko::acc::remove_complex_t; - static constexpr rcar_type delta{ - std::is_same::value - ? 0 - : std::numeric_limits< - gko::acc::remove_complex_t>::epsilon() * - 1e1}; + static const rcar_type delta; // Type for `check_accessor_correctness` to forward the indices using t = std::tuple; @@ -121,6 +116,13 @@ class ReducedStorage3d : public ::testing::Test { } }; +template +const typename ReducedStorage3d::rcar_type ReducedStorage3d::delta = + std::is_same::value + ? 0 + : std::numeric_limits>::epsilon() * + 1e1; + using ReducedStorage3dTypes = ::testing::Types, std::tuple, std::tuple, std::tuple, diff --git a/cuda/base/config.hpp b/cuda/base/config.hpp index fe280c76dec..f89cb0702f6 100644 --- a/cuda/base/config.hpp +++ b/cuda/base/config.hpp @@ -6,10 +6,9 @@ #define GKO_CUDA_BASE_CONFIG_HPP_ +#include #include -#include "common/cuda_hip/base/math.hpp" - namespace gko { namespace kernels { diff --git a/cuda/test/base/math.cu b/cuda/test/base/math.cu index 71532b45e80..0a2529df11f 100644 --- a/cuda/test/base/math.cu +++ b/cuda/test/base/math.cu @@ -25,7 +25,7 @@ namespace kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; @@ -45,7 +45,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::cuda::cuda_type; - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; diff --git a/hip/base/config.hip.hpp b/hip/base/config.hip.hpp index 114eb2f0f0a..832b750f0fd 100644 --- 
a/hip/base/config.hip.hpp +++ b/hip/base/config.hip.hpp @@ -6,9 +6,9 @@ #define GKO_HIP_BASE_CONFIG_HIP_HPP_ +#include #include -#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" diff --git a/hip/test/base/math.hip.cpp b/hip/test/base/math.hip.cpp index 01fb96afa7c..4f4283ebd9c 100644 --- a/hip/test/base/math.hip.cpp +++ b/hip/test/base/math.hip.cpp @@ -31,7 +31,7 @@ namespace kernel { template __device__ bool test_real_is_finite_function(FuncType isfin) { - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; @@ -51,7 +51,7 @@ __device__ bool test_complex_is_finite_function(FuncType isfin) "Template type must be a complex type."); using T = gko::remove_complex; using c_type = gko::kernels::hip::hip_type; - constexpr T inf = gko::device_numeric_limits::inf; + constexpr T inf = gko::device_numeric_limits::inf(); constexpr T quiet_nan = NAN; bool test_true{}; bool test_false{}; diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index acb6db7141f..afc98fd476c 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -14,27 +14,9 @@ #include -#ifdef __CUDA_ARCH__ - - -#include - - -#elif defined(__HIP_DEVICE_COMPILE__) - - -#include - - -#else - - class __half; -#endif // __CUDA_ARCH__ - - namespace gko { @@ -286,29 +268,6 @@ struct precision_converter { } }; -template -constexpr void copy_by_char_impl(char* dst, const char* src) -{ - *dst = *src; - copy_by_char_impl(dst + 1, src + 1); -} - -template <> -constexpr void copy_by_char_impl<1>(char* dst, const char* src) -{ - *dst = *src; -} - -template -constexpr void copy_by_char(DstType& dst, const SrcType& src) -{ - static_assert(sizeof(DstType) == sizeof(SrcType), - "Type size must be the same."); - static_assert(sizeof(DstType) % sizeof(char) == 0, - "Type size must be divisible by char"); - 
copy_by_char_impl((char*)(&dst), (const char*)(&src)); -} - } // namespace detail @@ -320,6 +279,14 @@ constexpr void copy_by_char(DstType& dst, const SrcType& src) */ class half { public: + // create half value from the bits directly. + static constexpr half create_from_bits(uint16 bits) noexcept + { + half result; + result.data_ = bits; + return result; + } + // TODO: NVHPC (host side) may not use zero initialization for the data // member by default constructor in some cases. Not sure whether it is // caused by something else in jacobi or isai. @@ -331,7 +298,7 @@ class half { this->float2half(static_cast(val)); } - half(const half& val) { data_ = val.data_; }; + constexpr half(const half& val) : data_(0) { data_ = val.data_; }; template half& operator=(const V val) @@ -342,14 +309,10 @@ class half { operator float() const noexcept { - // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - // return __half2float(reinterpret_cast(data_)); - // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) const auto bits = half2float(data_); float ans(0); - detail::copy_by_char(ans, bits); + std::memcpy(&ans, &bits, sizeof(float)); return ans; - // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } // can not use half operator _op(const half) for half + half @@ -417,20 +380,11 @@ class half { using f16_traits = detail::float_traits; using f32_traits = detail::float_traits; - // TODO: do we really need this one? - // Without it, everything can be GKO_INLINE GKO_ATTRIBUTES, which might make - // stuff easier. 
void float2half(float val) noexcept { - // #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) - // const auto tmp = __float2half_rn(val); - // data_ = reinterpret_cast(tmp); - // #else // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) uint32 bit_val(0); - detail::copy_by_char(bit_val, val); - // std::memcpy(&bit_val, &val, sizeof(float)); + std::memcpy(&bit_val, &val, sizeof(float)); data_ = float2half(bit_val); - // #endif // defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) } static constexpr uint16 float2half(uint32 data_) noexcept @@ -643,42 +597,46 @@ struct numeric_limits { // 3/10 is approx. log_10(2) static constexpr int digits10{digits * 3 / 10}; - // Note: gko::half can't return gko::half here because it does not have - // a constexpr constructor. - static constexpr float epsilon() + static constexpr gko::half epsilon() { - // 0x1400 // 0b0 00101 0000 000000 - return gko::detail::float_traits::eps; + constexpr auto bits = static_cast(0b0'00101'0000000000u); + return gko::half::create_from_bits(bits); } - static constexpr float infinity() + static constexpr gko::half infinity() { // 0b0 11111 0000000000 - return numeric_limits::infinity(); + constexpr auto bits = static_cast(0b0'11111'0000000000u); + return gko::half::create_from_bits(bits); } - static constexpr float min() + static constexpr gko::half min() { - // 0b0 00001 0000000000 - return 1.0f / (1ll << 14); + // 0b0 00001 0000000000 (normal value) + constexpr auto bits = static_cast(0b0'00001'0000000000u); + return gko::half::create_from_bits(bits); } - // The maximal exponent is 15, and the maximal significant is - // 1 + (2^-10 - 1) / 2^-10 - static constexpr float max() + static constexpr gko::half max() { // 0b0 11110 1111111111 - return (1ll << 15) * - (1.0f + static_cast((1ll << 10) - 1) / (1ll << 10)); + constexpr auto bits = static_cast(0b0'11110'1111111111u); + return gko::half::create_from_bits(bits); } - static constexpr float lowest() { return -max(); }; + 
static constexpr gko::half lowest() + { + // 0b1 11110 1111111111 + constexpr auto bits = static_cast(0b1'11110'1111111111u); + return gko::half::create_from_bits(bits); + }; - static constexpr float quiet_NaN() + static constexpr gko::half quiet_NaN() { - // 0x7FFF - return numeric_limits::quiet_NaN(); + // 0b0 11111 1111111111 + constexpr auto bits = static_cast(0b0'11111'1111111111u); + return gko::half::create_from_bits(bits); } }; diff --git a/reference/test/reorder/mc64_kernels.cpp b/reference/test/reorder/mc64_kernels.cpp index 56126ae1fb8..d328bb3ef18 100644 --- a/reference/test/reorder/mc64_kernels.cpp +++ b/reference/test/reorder/mc64_kernels.cpp @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -134,8 +135,8 @@ class Mc64 : public ::testing::Test { { ASSERT_EQ(a.get_size(), b.get_size()); for (gko::size_type i = 0; i < a.get_size(); i++) { - if (std::isfinite(a.get_const_data()[i]) || - std::isfinite(b.get_const_data()[i])) { + if (gko::is_finite(a.get_const_data()[i]) || + gko::is_finite(b.get_const_data()[i])) { ASSERT_NEAR(a.get_const_data()[i], b.get_const_data()[i], r::value) << name << '[' << i << ']'; From 6cc26d7e55fb887d254a39f5d43a614dbda0bd33 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Mon, 21 Oct 2024 16:54:15 +0200 Subject: [PATCH 57/62] refine the code and fix error without half --- benchmark/CMakeLists.txt | 17 ++---- .../base/device_matrix_data_kernels.cpp | 22 ++++--- core/matrix/permutation.cpp | 2 +- core/multigrid/pgm.cpp | 3 +- core/reorder/mc64.cpp | 14 ++--- core/test/base/extended_float.cpp | 12 ++-- include/ginkgo/core/base/exception.hpp | 59 +++++++++---------- include/ginkgo/core/base/half.hpp | 3 + include/ginkgo/core/base/math.hpp | 7 +++ include/ginkgo/core/base/matrix_data.hpp | 6 +- include/ginkgo/core/base/types.hpp | 7 +-- test/components/fill_array_kernels.cpp | 5 +- test/mpi/matrix.cpp | 4 +- 13 files changed, 76 insertions(+), 85 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index c9c5e0e64f0..180bd66d738 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -77,17 +77,12 @@ function(ginkgo_add_single_benchmark_executable name use_lib_linops macro_def ty target_compile_definitions("${name}" PRIVATE "${macro_def}") ginkgo_benchmark_add_tuning_maybe("${name}") if("${use_lib_linops}") - if ("${type}" STREQUAL "h") - # only cuda supports half currently - if (GINKGO_BUILD_CUDA) - target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) - target_link_libraries("${name}" cusparse_linops_${type}) - endif() - else() - if (GINKGO_BUILD_CUDA) - target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) - target_link_libraries("${name}" cusparse_linops_${type}) - endif() + if(GINKGO_BUILD_CUDA) + target_compile_definitions("${name}" PRIVATE HAS_CUDA=1) + target_link_libraries("${name}" cusparse_linops_${type}) + endif() + # only cuda supports half currently + if(NOT ("${type}" STREQUAL "h")) if (GINKGO_BUILD_HIP) target_compile_definitions("${name}" PRIVATE HAS_HIP=1) target_link_libraries("${name}" hipsparse_linops_${type}) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp index c8dabf63660..c05ed991dbd 
100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.cpp +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -22,6 +22,15 @@ namespace GKO_DEVICE_NAMESPACE { namespace components { +// __half != only in __device__ +// Although gko::is_nonzero is constexpr, it still shows calling __device__ in +// __host__ +template +GKO_INLINE __device__ constexpr bool is_nonzero(T value) +{ + return value != zero(); +} + template void remove_zeros(std::shared_ptr exec, array& values, array& row_idxs, @@ -31,13 +40,9 @@ void remove_zeros(std::shared_ptr exec, auto value_ptr = as_device_type(values.get_const_data()); auto size = values.get_size(); // count nonzeros - // __half != is only device, can not call __device__ from a __host__ - // __device__ (is_nonzero) - auto nnz = - thrust::count_if(thrust_policy(exec), value_ptr, value_ptr + size, - [] __device__(device_value_type value) { - return value != zero(value); - }); + auto nnz = thrust::count_if( + thrust_policy(exec), value_ptr, value_ptr + size, + [] __device__(device_value_type value) { return is_nonzero(value); }); if (nnz < size) { using tuple_type = thrust::tuple; @@ -53,8 +58,7 @@ void remove_zeros(std::shared_ptr exec, as_device_type(new_values.get_data()))); thrust::copy_if(thrust_policy(exec), it, it + size, out_it, [] __device__(tuple_type entry) { - return thrust::get<2>(entry) != - zero(thrust::get<2>(entry)); + return is_nonzero(thrust::get<2>(entry)); }); // swap out storage values = std::move(new_values); diff --git a/core/matrix/permutation.cpp b/core/matrix/permutation.cpp index 127a1edfd63..b6b9ff2d7e4 100644 --- a/core/matrix/permutation.cpp +++ b/core/matrix/permutation.cpp @@ -268,7 +268,7 @@ void dispatch_dense(const LinOp* op, Functor fn) using matrix::Dense; using std::complex; run, #endif double, float, std::complex, std::complex>(op, fn); diff --git a/core/multigrid/pgm.cpp b/core/multigrid/pgm.cpp index 7e7ccf24037..0fa4b42f9bc 100644 --- a/core/multigrid/pgm.cpp +++ 
b/core/multigrid/pgm.cpp @@ -190,7 +190,8 @@ Pgm::generate_local( auto abs_mtx = local_matrix->compute_absolute(); // abs_mtx is already real valuetype, so transpose is enough auto weight_mtx = gko::as(abs_mtx->transpose()); - auto half_scalar = initialize>({half(0.5)}, exec); + auto half_scalar = + initialize>({real_type{0.5}}, exec); auto identity = matrix::Identity::create(exec, num_rows); // W = (abs_mtx + transpose(abs_mtx))/2 abs_mtx->apply(half_scalar, identity, half_scalar, weight_mtx); diff --git a/core/reorder/mc64.cpp b/core/reorder/mc64.cpp index 4aa53fcde86..26a1b5bb0ad 100644 --- a/core/reorder/mc64.cpp +++ b/core/reorder/mc64.cpp @@ -37,8 +37,7 @@ void initialize_weights(const matrix::Csr* host_mtx, array>& row_maxima_array, gko::experimental::reorder::mc64_strategy strategy) { - auto inf = static_cast>( - std::numeric_limits>::infinity()); + const auto inf = std::numeric_limits>::infinity(); const auto num_rows = host_mtx->get_size()[0]; const auto row_ptrs = host_mtx->get_const_row_ptrs(); const auto col_idxs = host_mtx->get_const_col_idxs(); @@ -50,7 +49,7 @@ void initialize_weights(const matrix::Csr* host_mtx, for (IndexType row = 0; row < num_rows; row++) { const auto row_begin = row_ptrs[row]; const auto row_end = row_ptrs[row + 1]; - auto row_max = static_cast>(-inf); + auto row_max = -inf; for (IndexType idx = row_begin; idx < row_end; idx++) { const auto weight = calculate_weight(values[idx]); weights[idx] = weight; @@ -181,8 +180,7 @@ void shortest_augmenting_path( addressable_priority_queue& queue, std::vector& q_j, ValueType tolerance) { - auto inf = - static_cast(std::numeric_limits::infinity()); + const auto inf = std::numeric_limits::infinity(); auto weights = weights_array.get_data(); auto dual_u = dual_u_array.get_data(); auto distance = distance_array.get_data(); @@ -436,8 +434,7 @@ void compute_scaling(const matrix::Csr* host_mtx, mc64_strategy strategy, ValueType* row_scaling, ValueType* col_scaling) { - auto inf = static_cast>( - 
std::numeric_limits>::infinity()); + const auto inf = std::numeric_limits>::infinity(); const auto num_rows = host_mtx->get_size()[0]; const auto weights = weights_array.get_const_data(); const auto dual_u = dual_u_array.get_const_data(); @@ -541,8 +538,7 @@ std::unique_ptr Mc64::generate_impl( marked_cols.fill(0); matched_idxs.fill(0); unmatched_rows.fill(0); - auto inf = static_cast>( - std::numeric_limits>::infinity()); + const auto inf = std::numeric_limits>::infinity(); dual_u.fill(inf); distance.fill(inf); diff --git a/core/test/base/extended_float.cpp b/core/test/base/extended_float.cpp index 7bc1d312ac2..5c9367dacae 100644 --- a/core/test/base/extended_float.cpp +++ b/core/test/base/extended_float.cpp @@ -112,8 +112,7 @@ TEST_F(FloatToHalf, ConvertsNan) { half x = create_from_bits("0" "11111111" "00000000000000000000001"); - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + #if defined(SYCL_LANGUAGE_VERSION) // Sycl put the 1000000000, but ours put mask ASSERT_EQ(get_bits(x), get_bits("0" "11111" "1000000000")); #else @@ -126,8 +125,7 @@ TEST_F(FloatToHalf, ConvertsNegNan) { half x = create_from_bits("1" "11111111" "00010000000000000000000"); - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + #if defined(SYCL_LANGUAGE_VERSION) // Sycl put the 1000000000, but ours put mask ASSERT_EQ(get_bits(x), get_bits("1" "11111" "1000000000")); #else @@ -254,8 +252,7 @@ TEST_F(HalfToFloat, ConvertsNan) { float x = create_from_bits("0" "11111" "0001001000"); - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + #if defined(SYCL_LANGUAGE_VERSION) // sycl keeps significand ASSERT_EQ(get_bits(x), get_bits("0" "11111111" "00010010000000000000000")); #else @@ -268,8 +265,7 @@ TEST_F(HalfToFloat, ConvertsNegNan) { 
float x = create_from_bits("1" "11111" "0000000001"); - #if defined(SYCL_LANGUAGE_VERSION) && \ - (__LIBSYCL_MAJOR_VERSION > 5 || (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) + #if defined(SYCL_LANGUAGE_VERSION) // sycl keeps significand ASSERT_EQ(get_bits(x), get_bits("1" "11111111" "00000000010000000000000")); #else diff --git a/include/ginkgo/core/base/exception.hpp b/include/ginkgo/core/base/exception.hpp index 17e09e00d18..febc5e17034 100644 --- a/include/ginkgo/core/base/exception.hpp +++ b/include/ginkgo/core/base/exception.hpp @@ -13,19 +13,21 @@ namespace gko { + + /** * The Error class is used to report exceptional behaviour in library * functions. Ginkgo uses C++ exception mechanism to this end, and the - * Error class represents a base class for all types of errors. The exact - * list of errors which could occur during the execution of a certain - * library routine is provided in the documentation of that routine, along - * with a short description of the situation when that error can occur. - * During runtime, these errors can be detected by using standard C++ - * try-catch blocks, and a human-readable error description can be obtained - * by calling the Error::what() method. + * Error class represents a base class for all types of errors. The exact list + * of errors which could occur during the execution of a certain library + * routine is provided in the documentation of that routine, along with a short + * description of the situation when that error can occur. + * During runtime, these errors can be detected by using standard C++ try-catch + * blocks, and a human-readable error description can be obtained by calling + * the Error::what() method. 
* - * As an example, trying to compute a matrix-vector product with arguments - * of incompatible size will result in a DimensionMismatch error, which is + * As an example, trying to compute a matrix-vector product with arguments of + * incompatible size will result in a DimensionMismatch error, which is * demonstrated in the following program. * * ```cpp @@ -66,8 +68,8 @@ class Error : public std::exception { {} /** - * Returns a human-readable string with a more detailed description of - * the error. + * Returns a human-readable string with a more detailed description of the + * error. */ virtual const char* what() const noexcept override { return what_.c_str(); } @@ -96,8 +98,8 @@ class NotImplemented : public Error { /** - * NotCompiled is thrown when attempting to call an operation which is a - * part of a module that was not compiled on the system. + * NotCompiled is thrown when attempting to call an operation which is a part of + * a module that was not compiled on the system. */ class NotCompiled : public Error { public: @@ -234,8 +236,7 @@ class CurandError : public Error { /** - * CusparseError is thrown when a cuSPARSE routine throws a non-zero error - * code. + * CusparseError is thrown when a cuSPARSE routine throws a non-zero error code. */ class CusparseError : public Error { public: @@ -304,8 +305,7 @@ class HipError : public Error { /** - * HipblasError is thrown when a hipBLAS routine throws a non-zero error - * code. + * HipblasError is thrown when a hipBLAS routine throws a non-zero error code. */ class HipblasError : public Error { public: @@ -328,8 +328,7 @@ class HipblasError : public Error { /** - * HiprandError is thrown when a hipRAND routine throws a non-zero error - * code. + * HiprandError is thrown when a hipRAND routine throws a non-zero error code. 
*/ class HiprandError : public Error { public: @@ -436,8 +435,7 @@ class DimensionMismatch : public Error { * @param second_name The name of the second operator * @param second_rows The output dimension of the second operator * @param second_cols The input dimension of the second operator - * @param clarification An additional message describing the error - * further + * @param clarification An additional message describing the error further */ DimensionMismatch(const std::string& file, int line, const std::string& func, const std::string& first_name, @@ -469,8 +467,7 @@ class BadDimension : public Error { * @param op_name The name of the operator * @param op_num_rows The row dimension of the operator * @param op_num_cols The column dimension of the operator - * @param clarification An additional message further describing the - * error + * @param clarification An additional message further describing the error */ BadDimension(const std::string& file, int line, const std::string& func, const std::string& op_name, size_type op_num_rows, @@ -486,8 +483,8 @@ class BadDimension : public Error { /** * Error that denotes issues between block sizes and matrix dimensions * - * \tparam IndexType Type of index used by the linear algebra object that - * is incompatible with the required block size. + * \tparam IndexType Type of index used by the linear algebra object that is + * incompatible with the required block size. */ template class BlockSizeError : public Error { @@ -520,8 +517,7 @@ class ValueMismatch : public Error { * @param func The function name where the error occurred * @param val1 The first value to be compared. * @param val2 The second value to be compared. 
- * @param clarification An additional message further describing the - * error + * @param clarification An additional message further describing the error */ ValueMismatch(const std::string& file, int line, const std::string& func, size_type val1, size_type val2, @@ -580,9 +576,8 @@ class OutOfBoundsError : public Error { /** - * OverflowError is thrown when an index calculation for storage - * requirements overflows. This most likely means that the index type is too - * small. + * OverflowError is thrown when an index calculation for storage requirements + * overflows. This most likely means that the index type is too small. */ class OverflowError : public Error { public: @@ -619,8 +614,8 @@ class StreamError : public Error { /** - * KernelNotFound is thrown if Ginkgo cannot find a kernel which satisfies - * the criteria imposed by the input arguments. + * KernelNotFound is thrown if Ginkgo cannot find a kernel which satisfies the + * criteria imposed by the input arguments. */ class KernelNotFound : public Error { public: diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp index afc98fd476c..425532a2709 100644 --- a/include/ginkgo/core/base/half.hpp +++ b/include/ginkgo/core/base/half.hpp @@ -534,6 +534,7 @@ class complex { imag_ += val.imag(); return *this; } + template complex& operator-=(const complex& val) { @@ -541,6 +542,7 @@ class complex { imag_ -= val.imag(); return *this; } + template complex& operator*=(const complex& val) { @@ -551,6 +553,7 @@ class complex { imag_ = result_f.imag(); return *this; } + template complex& operator/=(const complex& val) { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 7d8edd0564e..fb73c9c3cd6 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -746,6 +746,13 @@ GKO_INLINE constexpr T one() return T(1); } +template <> +GKO_INLINE constexpr half one() +{ + constexpr auto bits = static_cast(0b0'01111'0000000000u); 
+ return half::create_from_bits(bits); +} + /** * Returns the multiplicative identity for T. diff --git a/include/ginkgo/core/base/matrix_data.hpp b/include/ginkgo/core/base/matrix_data.hpp index 218c79a6fea..983e4a524ef 100644 --- a/include/ginkgo/core/base/matrix_data.hpp +++ b/include/ginkgo/core/base/matrix_data.hpp @@ -38,7 +38,7 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return ValueType(dist(gen)); + return static_cast(dist(gen)); } @@ -46,7 +46,9 @@ template typename std::enable_if::value, ValueType>::type get_rand_value(Distribution&& dist, Generator&& gen) { - return ValueType(dist(gen), dist(gen)); + using real_value_type = remove_complex; + return ValueType{static_cast(dist(gen)), + static_cast(dist(gen))}; } diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 68cde9c6548..32524a15918 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -139,13 +139,8 @@ using uint64 = std::uint64_t; */ using uintptr = std::uintptr_t; -// #if defined(SYCL_LANGUAGE_VERSION) && \ -// (__LIBSYCL_MAJOR_VERSION > 5 || \ -// (__LIBSYCL_MAJOR_VERSION == 5 && __LIBSYCL_MINOR_VERSION >= 7)) -// using half = sycl::half; -// #else + class half; -// #endif /** diff --git a/test/components/fill_array_kernels.cpp b/test/components/fill_array_kernels.cpp index ed190b80fbc..7786cb52d4b 100644 --- a/test/components/fill_array_kernels.cpp +++ b/test/components/fill_array_kernels.cpp @@ -57,9 +57,6 @@ TYPED_TEST(FillArray, FillSeqEqualsReference) this->exec, this->dvals.get_data(), this->total_size); this->dvals.set_executor(this->ref); - for (gko::size_type i = 2000; i < this->total_size; i++) { - std::cout << i << " " << this->seqs.get_data()[i] << " device " - << this->dvals.get_data()[i] << std::endl; - } + GKO_ASSERT_ARRAY_EQ(this->seqs, this->dvals); } diff --git a/test/mpi/matrix.cpp b/test/mpi/matrix.cpp index 
759de58905d..3b594940208 100644 --- a/test/mpi/matrix.cpp +++ b/test/mpi/matrix.cpp @@ -690,7 +690,7 @@ TYPED_TEST(Matrix, CanConvertToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->dist_mat->convert_to(tmp); tmp->convert_to(res); @@ -717,7 +717,7 @@ TYPED_TEST(Matrix, CanMoveToNextPrecision) // If OtherT is more precise: 0, otherwise r auto residual = r::value < r::value ? gko::remove_complex{0} - : gko::remove_complex{r::value}; + : static_cast>(r::value); this->dist_mat->move_to(tmp); tmp->convert_to(res); From 3d15350751721c102b4b3934fbfd81c0157bdd9b Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 22 Oct 2024 01:49:47 +0200 Subject: [PATCH 58/62] reduce abs/sqrt location --- core/test/utils/assertions.hpp | 1 - cuda/base/types.hpp | 7 +- hip/base/types.hip.hpp | 29 ------- include/ginkgo/core/base/math.hpp | 122 ++++++------------------------ 4 files changed, 25 insertions(+), 134 deletions(-) diff --git a/core/test/utils/assertions.hpp b/core/test/utils/assertions.hpp index 174d4536657..87a4e878fc7 100644 --- a/core/test/utils/assertions.hpp +++ b/core/test/utils/assertions.hpp @@ -140,7 +140,6 @@ template void print_componentwise_error(Ostream& os, const MatrixData1& first, const MatrixData2& second) { - using std::abs; using vt = typename detail::biggest_valuetype< typename MatrixData1::value_type, typename MatrixData2::value_type>::type; diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index c7fe79b5a6f..367674ac163 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -100,10 +100,6 @@ __device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) #endif -namespace kernels { -namespace cuda { - - #ifdef __CUDACC__ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 @@ -138,6 +134,9 @@ __device__ __forceinline__ __half sqrt(const __half& val) #endif #endif + +namespace kernels { 
+namespace cuda { namespace detail { /** diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index a52dfe0b239..febead4f370 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -88,35 +88,8 @@ THRUST_HALF_FRIEND_OPERATOR(/, /=) namespace gko { -#if GINKGO_HIP_PLATFORM_NVCC -// from the cuda_fp16.hpp -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 -__device__ __forceinline__ bool is_nan(const __half& val) -{ - return __hisnan(val); -} -#if CUDA_VERSION >= 10020 -__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } -#else -__device__ __forceinline__ __half abs(const __half& val) -{ - return abs(static_cast(val)); -} -#endif -#else -__device__ __forceinline__ bool is_nan(const __half& val) -{ - return is_nan(static_cast(val)); -} -__device__ __forceinline__ __half abs(const __half& val) -{ - return abs(static_cast(val)); -} -#endif - -#else // Not nvidia device __device__ __forceinline__ bool is_nan(const __half& val) { return __hisnan(val); @@ -125,8 +98,6 @@ __device__ __forceinline__ bool is_nan(const __half& val) // rocm40 __habs is not constexpr __device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } -#endif - namespace kernels { namespace hip { diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index fb73c9c3cd6..79802a08350 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -35,106 +35,6 @@ class complex; namespace gko { -using std::abs; -using std::sqrt; - -GKO_INLINE gko::half abs(gko::half a) { return gko::half((a > 0) ? 
a : -a); } - -GKO_INLINE gko::half abs(std::complex a) -{ - // Using float abs not sqrt on norm to avoid overflow - return gko::half(abs(std::complex(a))); -} - - -GKO_INLINE gko::half sqrt(gko::half a) -{ - return gko::half(std::sqrt(float(a))); -} - -GKO_INLINE std::complex sqrt(std::complex a) -{ - return std::complex(sqrt(std::complex( - static_cast(a.real()), static_cast(a.imag())))); -} - - -} // namespace gko - - -namespace gko { - - -// HIP should not see std::abs or std::sqrt, we want the custom implementation. -// Hence, provide the using declaration only for some cases -namespace kernels { -namespace reference { - - -using std::abs; - - -using std::sqrt; - - -} // namespace reference -} // namespace kernels - - -namespace kernels { -namespace omp { - - -using std::abs; - - -using std::sqrt; - - -} // namespace omp -} // namespace kernels - - -namespace kernels { -namespace cuda { - - -using std::abs; - - -using std::sqrt; - - -} // namespace cuda -} // namespace kernels - - -namespace kernels { -namespace dpcpp { - - -using std::abs; - - -using std::sqrt; - - -} // namespace dpcpp -} // namespace kernels - - -namespace test { - - -using std::abs; - - -using std::sqrt; - - -} // namespace test - - // type manipulations @@ -1030,6 +930,7 @@ GKO_INLINE constexpr auto squared_norm(const T& x) return real(conj(x) * x); } +using std::abs; /** * Returns the absolute value of the object. @@ -1055,6 +956,27 @@ abs(const T& x) return sqrt(squared_norm(x)); } +// increase the priority in function lookup +GKO_INLINE gko::half abs(const std::complex& x) +{ + // Using float abs not sqrt on norm to avoid overflow + return static_cast(abs(std::complex(x))); +} + + +using std::sqrt; + +GKO_INLINE gko::half sqrt(gko::half a) +{ + return gko::half(std::sqrt(float(a))); +} + +GKO_INLINE std::complex sqrt(std::complex a) +{ + return std::complex(sqrt(std::complex( + static_cast(a.real()), static_cast(a.imag())))); +} + /** * Returns the value of pi. 
From c4697a52bec797f8f9e3caf7ddc6c05957780c54 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 22 Oct 2024 17:01:04 +0200 Subject: [PATCH 59/62] move the math function to math --- .../base/device_matrix_data_kernels.cpp | 1 + common/cuda_hip/base/math.hpp | 67 ++++++++++ common/cuda_hip/base/types.hpp | 12 ++ .../factorization/factorization_kernels.cpp | 1 + cuda/base/types.hpp | 115 ------------------ hip/base/types.hip.hpp | 72 ----------- 6 files changed, 81 insertions(+), 187 deletions(-) diff --git a/common/cuda_hip/base/device_matrix_data_kernels.cpp b/common/cuda_hip/base/device_matrix_data_kernels.cpp index c05ed991dbd..6a426cc53b8 100644 --- a/common/cuda_hip/base/device_matrix_data_kernels.cpp +++ b/common/cuda_hip/base/device_matrix_data_kernels.cpp @@ -12,6 +12,7 @@ #include #include +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/thrust.hpp" #include "common/cuda_hip/base/types.hpp" diff --git a/common/cuda_hip/base/math.hpp b/common/cuda_hip/base/math.hpp index 75fd3976c4f..255568d4e8c 100644 --- a/common/cuda_hip/base/math.hpp +++ b/common/cuda_hip/base/math.hpp @@ -84,4 +84,71 @@ struct truncate_type_impl> { } // namespace gko +namespace thrust { + + +GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt(const complex<__half>& a) +{ + return sqrt(static_cast>(a)); +} + + +// Dircetly call float versrion from here? +template <> +GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) +{ + return abs(static_cast>(z)); +} + + +} // namespace thrust + + +namespace gko { +// It is required by NVHPC 23.3, isnan is undefined when NVHPC are only as host +// compiler. 
+#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP) + +__device__ __forceinline__ bool is_nan(const __half& val) +{ + // from the cuda_fp16.hpp +#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return __hisnan(val); +#else + return isnan(static_cast(val)); +#endif +} + +__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) +{ + return is_nan(val.real()) || is_nan(val.imag()); +} + + +__device__ __forceinline__ __half abs(const __half& val) +{ +#if GINKGO_HIP_PLATFORM_HCC || \ + (CUDA_VERSION >= 10020 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return __habs(val); +#else + return abs(static_cast(val)); +#endif +} + +__device__ __forceinline__ __half sqrt(const __half& val) +{ +#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) + return hsqrt(val); +#else + return sqrt(static_cast(val)); +#endif +} + + +#endif + + +} // namespace gko + + #endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_ diff --git a/common/cuda_hip/base/types.hpp b/common/cuda_hip/base/types.hpp index 08f0516d691..646331d0528 100644 --- a/common/cuda_hip/base/types.hpp +++ b/common/cuda_hip/base/types.hpp @@ -14,5 +14,17 @@ #error "Executor definition missing" #endif +#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ + GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ + const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ + { \ + return thrust::complex{lhs} _op thrust::complex(rhs); \ + } + +THRUST_HALF_FRIEND_OPERATOR(+, +=) +THRUST_HALF_FRIEND_OPERATOR(-, -=) +THRUST_HALF_FRIEND_OPERATOR(*, *=) +THRUST_HALF_FRIEND_OPERATOR(/, /=) + #endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_ diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp index 3a38175ab70..e3ec943b18c 100644 --- a/common/cuda_hip/factorization/factorization_kernels.cpp +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -7,6 +7,7 @@ 
#include #include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/math.hpp" #include "common/cuda_hip/base/runtime.hpp" #include "common/cuda_hip/base/types.hpp" #include "common/cuda_hip/components/cooperative_groups.hpp" diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 367674ac163..3905bb5b3c5 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -19,122 +19,7 @@ #include -// thrust calls the c function not the function from std -// Maybe override the function from thrust directlry -GKO_ATTRIBUTES GKO_INLINE __half hypot(__half a, __half b) -{ - return hypot(static_cast(a), static_cast(b)); -} - -GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> sqrt( - thrust::complex<__half> a) -{ - return sqrt(static_cast>(a)); -} - - -namespace thrust { - - -// Dircetly call float versrion from here? -template <> -GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) -{ - return abs(static_cast>(z)); -} - - -} // namespace thrust - - -#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ - const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ - { \ - return thrust::complex{lhs} _op thrust::complex(rhs); \ - } - -THRUST_HALF_FRIEND_OPERATOR(+, +=) -THRUST_HALF_FRIEND_OPERATOR(-, -=) -THRUST_HALF_FRIEND_OPERATOR(*, *=) -THRUST_HALF_FRIEND_OPERATOR(/, /=) - - namespace gko { - -// It is required by NVHPC 23.3, isnan is undefined when NVHPC are only as host -// compiler. 
-#ifdef __CUDACC__ - -// from the cuda_fp16.hpp -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - - -template <> -__device__ __forceinline__ bool is_nan(const __half& val) -{ - return __hisnan(val); -} - - -#else - - -template <> -__device__ __forceinline__ bool is_nan(const __half& val) -{ - return isnan(static_cast(val)); -} - - -#endif - - -template <> -__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val) -{ - return is_nan(val.real()) || is_nan(val.imag()); -} - - -#endif - - -#ifdef __CUDACC__ -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - -#if CUDA_VERSION >= 10020 -__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } -#else -__device__ __forceinline__ __half abs(const __half& val) -{ - return abs(static_cast(val)); -} -#endif - - -__device__ __forceinline__ __half sqrt(const __half& val) { return hsqrt(val); } - - -#else - - -__device__ __forceinline__ __half abs(const __half& val) -{ - return abs(static_cast(val)); -} - - -__device__ __forceinline__ __half sqrt(const __half& val) -{ - return sqrt(static_cast(val)); -} - - -#endif -#endif - - namespace kernels { namespace cuda { namespace detail { diff --git a/hip/base/types.hip.hpp b/hip/base/types.hip.hpp index febead4f370..6b78cceea99 100644 --- a/hip/base/types.hip.hpp +++ b/hip/base/types.hip.hpp @@ -26,79 +26,7 @@ #include "common/cuda_hip/base/runtime.hpp" - -// thrust calls the c function not the function from std -// Maybe override the function from thrust directlry -__device__ __forceinline__ __half hypot(__half a, __half b) -{ - return hypot(static_cast(a), static_cast(b)); -} - -__device__ __forceinline__ thrust::complex<__half> sqrt( - thrust::complex<__half> a) -{ - return sqrt(static_cast>(a)); -} - -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} -__device__ __forceinline__ thrust::complex sqrt( - thrust::complex val) -{ - return thrust::sqrt(val); -} - -#if 
GINKGO_HIP_PLATFORM_NVCC && defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 -__device__ __forceinline__ __half sqrt(__half val) -{ - return sqrt(static_cast(val)); -} -#else -__device__ __forceinline__ __half sqrt(__half val) { return hsqrt(val); } -#endif - - -namespace thrust { - - -// Dircetly call float versrion from here? -template <> -GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z) -{ - return hypot(static_cast(z.real()), static_cast(z.imag())); -} - - -} // namespace thrust - -#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq) \ - GKO_ATTRIBUTES GKO_INLINE thrust::complex<__half> operator _op( \ - const thrust::complex<__half> lhs, const thrust::complex<__half> rhs) \ - { \ - return thrust::complex{lhs} _op thrust::complex(rhs); \ - } - -THRUST_HALF_FRIEND_OPERATOR(+, +=) -THRUST_HALF_FRIEND_OPERATOR(-, -=) -THRUST_HALF_FRIEND_OPERATOR(*, *=) -THRUST_HALF_FRIEND_OPERATOR(/, /=) - - namespace gko { - - -__device__ __forceinline__ bool is_nan(const __half& val) -{ - return __hisnan(val); -} - -// rocm40 __habs is not constexpr -__device__ __forceinline__ __half abs(const __half& val) { return __habs(val); } - - namespace kernels { namespace hip { namespace detail { From 377432a362ac46410816569f95cc08d5270a3635 Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. Tsai" Date: Tue, 22 Oct 2024 23:02:18 +0200 Subject: [PATCH 60/62] nohalf --- core/test/utils.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index 1be20f44ce8..bac7e942416 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -344,6 +344,9 @@ using ComplexValueTypesNoHalf = using ValueTypes = merge_type_list_t; +using ValueTypesNoHalf = + merge_type_list_t; + using IndexTypes = ::testing::Types; using IntegerTypes = merge_type_list_t>; From e806a0a154f416fa14ae269d3b76e93277d2cebd Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 23 Oct 2024 13:36:18 +0200 Subject: [PATCH 61/62] cbgmres without half --- core/test/solver/cb_gmres.cpp | 5 +++-- reference/test/solver/cb_gmres_kernels.cpp | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/test/solver/cb_gmres.cpp b/core/test/solver/cb_gmres.cpp index e014e5f8acc..5a5c2fdb36c 100644 --- a/core/test/solver/cb_gmres.cpp +++ b/core/test/solver/cb_gmres.cpp @@ -87,9 +87,10 @@ using st_ir2 = st_helper_type; using TestTypes = gko::test::merge_type_list_t< gko::test::cartesian_type_product_t< - gko::test::ValueTypes, ::testing::Types>, + gko::test::ValueTypesNoHalf, ::testing::Types>, gko::test::cartesian_type_product_t< - gko::test::RealValueTypes, ::testing::Types>>; + gko::test::RealValueTypesNoHalf, + ::testing::Types>>; TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator); diff --git a/reference/test/solver/cb_gmres_kernels.cpp b/reference/test/solver/cb_gmres_kernels.cpp index 26c19bb8787..a2acb8534ea 100644 --- a/reference/test/solver/cb_gmres_kernels.cpp +++ b/reference/test/solver/cb_gmres_kernels.cpp @@ -136,9 +136,10 @@ using st_ir2 = st_helper_type; using TestTypes = gko::test::merge_type_list_t< gko::test::cartesian_type_product_t< - gko::test::ValueTypes, ::testing::Types>, + gko::test::ValueTypesNoHalf, ::testing::Types>, gko::test::cartesian_type_product_t< - gko::test::RealValueTypes, ::testing::Types>>; + gko::test::RealValueTypesNoHalf, + ::testing::Types>>; TYPED_TEST_SUITE(CbGmres, TestTypes, PairTypenameNameGenerator); From 3e49252a9fe391e754da0ead8a5211572a99ba3e Mon Sep 17 00:00:00 2001 From: "Yu-Hsiang M. 
Tsai" Date: Wed, 23 Oct 2024 13:36:27 +0200 Subject: [PATCH 62/62] direct without half --- test/solver/direct.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/solver/direct.cpp b/test/solver/direct.cpp index 431a7c40d59..e71046d0312 100644 --- a/test/solver/direct.cpp +++ b/test/solver/direct.cpp @@ -104,12 +104,13 @@ using Types = gko::test::ValueIndexTypes; #elif defined(GKO_COMPILING_CUDA) // CUDA don't support long indices for sorting, and the triangular solvers // seem broken -using Types = gko::test::cartesian_type_product_t>; #else // HIP only supports real types and int32 -using Types = gko::test::cartesian_type_product_t>; +using Types = + gko::test::cartesian_type_product_t>; #endif TYPED_TEST_SUITE(Direct, Types, PairTypenameNameGenerator);