Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Half matrix and components #1708

Merged
merged 16 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
yhmtsai marked this conversation as resolved.
Show resolved Hide resolved
# We do not support MSVC. SYCL will come later
if(MSVC OR GINKGO_BUILD_SYCL)
message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be rephrased since I really don't know what you mean by "and later support in SYCL".
Do you mean that SYCL does support half-precision in a later version?

Copy link
Member Author

@yhmtsai yhmtsai Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we will enable the support from #1710.
Since half is trivially copyable again now, we might not need the device_type mapping, though.

set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
endif()
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
Expand Down
8 changes: 3 additions & 5 deletions accessor/reference_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,16 @@
#include "utils.hpp"


// CUDA TOOLKIT < 11 does not support constexpr in combination with
// thrust::complex, which is why constexpr is only present in later versions
#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
(__CUDACC_VER_MAJOR__ < 11)
// NVC++ disallows a constexpr function with a non-literal return type such as half
#if defined(__NVCOMPILER) && GINKGO_ENABLE_HALF

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR

#else

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR constexpr

#endif // __CUDA_ARCH__ && __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__ < 11
#endif


namespace gko {
Expand Down
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(cusparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
Expand All @@ -28,6 +31,9 @@ endfunction()
function(ginkgo_benchmark_hipsparse_linops type def)
add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(hipsparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
Expand Down
2 changes: 1 addition & 1 deletion cmake/get_info.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ foreach(log_type ${log_types})
"GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_ENABLE_HALF")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
Expand Down
16 changes: 13 additions & 3 deletions common/cuda_hip/base/device_matrix_data_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"

Expand All @@ -22,6 +23,15 @@ namespace GKO_DEVICE_NAMESPACE {
namespace components {


// __half's `!=` operator is only available in __device__ code.
// Although gko::is_nonzero is constexpr, the compiler still reports a
// __device__ function being called from __host__ when it is instantiated with
// __half, so this __device__-only overload is provided here.
template <typename T>
GKO_INLINE __device__ constexpr bool is_nonzero(T value)
{
    // non-zero check against the additive identity of T
    return value != zero<T>();
}

template <typename ValueType, typename IndexType>
void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
array<ValueType>& values, array<IndexType>& row_idxs,
Expand Down Expand Up @@ -58,7 +68,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);


Expand Down Expand Up @@ -102,7 +112,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);


Expand All @@ -117,7 +127,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
it + data.get_num_stored_elements(), vals);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);


Expand Down
134 changes: 125 additions & 9 deletions common/cuda_hip/base/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,60 @@
#include <ginkgo/core/base/math.hpp>


#ifdef GKO_COMPILING_CUDA


#include <cuda_fp16.h>


#elif defined(GKO_COMPILING_HIP)


#include <hip/hip_fp16.h>


#endif


#include "common/cuda_hip/base/thrust_macro.hpp"


namespace gko {


// We need this struct, because otherwise we would call a __host__ function in
// a __device__ function (even though it is constexpr).
// The limits are exposed as functions rather than data members: types such as
// __half have no constexpr constructor, so a `static constexpr` data member
// could not be initialized for them.
template <typename T>
struct device_numeric_limits {
    static constexpr auto inf() { return std::numeric_limits<T>::infinity(); }
    static constexpr auto max() { return std::numeric_limits<T>::max(); }
    static constexpr auto min() { return std::numeric_limits<T>::min(); }
};

// Specialization for __half: the bit patterns are written directly through
// __half_raw, because __half and __half_raw do not have constexpr
// constructors.
template <>
struct device_numeric_limits<__half> {
    // from the __half documentation, __half_raw::x accepts unsigned short

    // positive infinity: sign 0, exponent all ones, mantissa zero
    static GKO_ATTRIBUTES GKO_INLINE auto inf()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111110000000000u);
        return __half{bits};
    }

    // largest finite value: maximal exponent (below inf), mantissa all ones
    static GKO_ATTRIBUTES GKO_INLINE auto max()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111101111111111u);
        return __half{bits};
    }

    // smallest positive normal value: minimal nonzero exponent, mantissa zero
    static GKO_ATTRIBUTES GKO_INLINE auto min()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0000010000000000u);
        return __half{bits};
    }
};


Expand All @@ -34,22 +78,94 @@ struct remove_complex_impl<thrust::complex<T>> {


template <typename T>
struct is_complex_impl<thrust::complex<T>>
: public std::integral_constant<bool, true> {};
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};


template <typename T>
struct is_complex_or_scalar_impl<thrust::complex<T>> : std::is_scalar<T> {};
struct is_complex_impl<thrust::complex<T>> : public std::true_type {};

template <>
struct is_complex_or_scalar_impl<__half> : public std::true_type {};

template <typename T>
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};
struct is_complex_or_scalar_impl<thrust::complex<T>>
: public is_complex_or_scalar_impl<T> {};


} // namespace detail
} // namespace gko


GKO_THRUST_NAEMSPACE_PREFIX
namespace thrust {


// Specialization of thrust::sqrt for complex<__half>: compute in
// complex<float> precision and narrow the result back to __half on return.
template <>
GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt<__half>(const complex<__half>& a)
{
    return sqrt(static_cast<complex<float>>(a));
}


// Specialization of thrust::abs for complex<__half>: the magnitude is
// computed in float precision and converted back to __half.
template <>
GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z)
{
    return abs(static_cast<complex<float>>(z));
}


}  // namespace thrust
GKO_THRUST_NAEMSPACE_POSTFIX


namespace gko {


// It is required by NVHPC 23.3: `isnan` is undefined when NVHPC is used as a
// host compiler.
#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP)

// NaN check for __half. Uses the half-precision intrinsic where available
// (HIP/AMD, or CUDA with __CUDA_ARCH__ >= 530); otherwise falls back to a
// float conversion.
__device__ __forceinline__ bool is_nan(const __half& val)
{
    // from the cuda_fp16.hpp
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __hisnan(val);
#else
    return isnan(static_cast<float>(val));
#endif
}

// A complex value is NaN if either its real or imaginary part is NaN.
__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val)
{
    return is_nan(val.real()) || is_nan(val.imag());
}


// Absolute value of __half via the __habs intrinsic where available,
// float fallback otherwise.
__device__ __forceinline__ __half abs(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __habs(val);
#else
    return abs(static_cast<float>(val));
#endif
}

// Square root of __half via the hsqrt intrinsic where available,
// float fallback otherwise.
__device__ __forceinline__ __half sqrt(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return hsqrt(val);
#else
    return sqrt(static_cast<float>(val));
#endif
}


#endif


} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
22 changes: 22 additions & 0 deletions common/cuda_hip/base/thrust_macro.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_

// Although thrust provides similar macros, they are only defined in versions
// that support the wrapped-namespace feature. We therefore provide our own
// macros so that the code also works with older thrust versions.
// NOTE(review): "NAEMSPACE" is a typo for "NAMESPACE"; it is kept as-is here
// because other files already reference these exact spellings.
#ifdef THRUST_CUB_WRAPPED_NAMESPACE
#define GKO_THRUST_NAEMSPACE_PREFIX namespace THRUST_CUB_WRAPPED_NAMESPACE {
#define GKO_THRUST_NAEMSPACE_POSTFIX }
#define GKO_THRUST_QUALIFIER ::THRUST_CUB_WRAPPED_NAMESPACE::thrust
#else
#define GKO_THRUST_NAEMSPACE_PREFIX
#define GKO_THRUST_NAEMSPACE_POSTFIX
#define GKO_THRUST_QUALIFIER ::thrust
#endif  // THRUST_CUB_WRAPPED_NAMESPACE


#endif  // GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
19 changes: 18 additions & 1 deletion common/cuda_hip/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_


#include "common/cuda_hip/base/math.hpp"
#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/types.hpp"
#elif defined(GKO_COMPILING_HIP)
Expand All @@ -15,4 +15,21 @@
#endif


// thrust::complex<__half> arithmetic: promote both operands to
// complex<float>, apply the operator there, and narrow the result back to
// complex<__half> on return. The _opeq parameter mirrors the corresponding
// compound-assignment operator for documentation purposes; only _op is used.
#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                      \
    GKO_ATTRIBUTES GKO_INLINE GKO_THRUST_QUALIFIER::complex<__half>  \
    operator _op(const GKO_THRUST_QUALIFIER::complex<__half> lhs,    \
                 const GKO_THRUST_QUALIFIER::complex<__half> rhs)    \
    {                                                                \
        return GKO_THRUST_QUALIFIER::complex<float>{                 \
            lhs} _op GKO_THRUST_QUALIFIER::complex<float>(rhs);      \
    }

THRUST_HALF_FRIEND_OPERATOR(+, +=)
THRUST_HALF_FRIEND_OPERATOR(-, -=)
THRUST_HALF_FRIEND_OPERATOR(*, *=)
THRUST_HALF_FRIEND_OPERATOR(/, /=)

#undef THRUST_HALF_FRIEND_OPERATOR


#endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
6 changes: 4 additions & 2 deletions common/cuda_hip/components/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct atomic_helper {
};


// TODO: consider it implemented by memcpy.
template <typename ResultType, typename ValueType>
__forceinline__ __device__ ResultType reinterpret(ValueType val)
{
Expand Down Expand Up @@ -101,8 +102,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if defined(CUDA_VERSION)
// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA
#if defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)
// Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC
// >= 7.0
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif

Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/components/merging.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a,
IndexType a_begin{};
IndexType b_begin{};
auto lane = static_cast<IndexType>(group.thread_rank());
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
Expand Down Expand Up @@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge(
auto c_size = a_size + b_size;
IndexType a_begin{};
IndexType b_begin{};
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin, a_size, sentinel);
auto b_cur = checked_load(b, b_begin, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/factorization/par_ict_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(

IndexType l_new_begin = l_new_row_ptrs[row];

constexpr auto sentinel = device_numeric_limits<IndexType>::max;
constexpr auto sentinel = device_numeric_limits<IndexType>::max();
// load column indices and values for the first merge step
auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
Expand Down Expand Up @@ -456,4 +456,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
} // namespace gko
Loading
Loading