Skip to content

Commit

Permalink
Merge #1708 Add Half matrix and components implementation
Browse files Browse the repository at this point in the history
This PR implements the half precision for matrices and components

Related PR: #1706
  • Loading branch information
yhmtsai authored Dec 3, 2024
2 parents a144043 + 8190bf6 commit 76ef161
Show file tree
Hide file tree
Showing 174 changed files with 2,822 additions and 1,356 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
# We do not support MSVC; SYCL support will come later.
if(MSVC OR GINKGO_BUILD_SYCL)
message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
endif()
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
Expand Down
8 changes: 3 additions & 5 deletions accessor/reference_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,16 @@
#include "utils.hpp"


// CUDA TOOLKIT < 11 does not support constexpr in combination with
// thrust::complex, which is why constexpr is only present in later versions
#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
(__CUDACC_VER_MAJOR__ < 11)
// NVC++ disallows a constexpr function with a non-literal return type such as half
#if defined(__NVCOMPILER) && GINKGO_ENABLE_HALF

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR

#else

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR constexpr

#endif // __CUDA_ARCH__ && __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__ < 11
#endif


namespace gko {
Expand Down
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(cusparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
Expand All @@ -28,6 +31,9 @@ endfunction()
function(ginkgo_benchmark_hipsparse_linops type def)
add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(hipsparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
Expand Down
2 changes: 1 addition & 1 deletion cmake/get_info.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ foreach(log_type ${log_types})
"GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_ENABLE_HALF")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
Expand Down
16 changes: 13 additions & 3 deletions common/cuda_hip/base/device_matrix_data_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"

Expand All @@ -22,6 +23,15 @@ namespace GKO_DEVICE_NAMESPACE {
namespace components {


// Device-only counterpart of gko::is_nonzero: __half only provides its
// comparison operators in __device__ code, so even though gko::is_nonzero is
// constexpr, using it would be reported as calling a __device__ function
// from __host__.
template <typename T>
GKO_INLINE __device__ constexpr bool is_nonzero(T entry)
{
    return !(entry == zero<T>());
}

template <typename ValueType, typename IndexType>
void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
array<ValueType>& values, array<IndexType>& row_idxs,
Expand Down Expand Up @@ -58,7 +68,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);


Expand Down Expand Up @@ -102,7 +112,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);


Expand All @@ -117,7 +127,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
it + data.get_num_stored_elements(), vals);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);


Expand Down
134 changes: 125 additions & 9 deletions common/cuda_hip/base/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,60 @@
#include <ginkgo/core/base/math.hpp>


#ifdef GKO_COMPILING_CUDA


#include <cuda_fp16.h>


#elif defined(GKO_COMPILING_HIP)


#include <hip/hip_fp16.h>


#endif


#include "common/cuda_hip/base/thrust_macro.hpp"


namespace gko {


// We need this struct, because otherwise we would call a __host__ function in a
// __device__ function (even though it is constexpr)
template <typename T>
struct device_numeric_limits {
static constexpr auto inf = std::numeric_limits<T>::infinity();
static constexpr auto max = std::numeric_limits<T>::max();
static constexpr auto min = std::numeric_limits<T>::min();
static constexpr auto inf() { return std::numeric_limits<T>::infinity(); }
static constexpr auto max() { return std::numeric_limits<T>::max(); }
static constexpr auto min() { return std::numeric_limits<T>::min(); }
};

// Specialization for __half built from raw IEEE 754 binary16 bit patterns:
// __half and __half_raw do not have a constexpr constructor, so the generic
// constexpr-based limits cannot be used here.
template <>
struct device_numeric_limits<__half> {
    // from __half documentation, __half_raw::x accepts an unsigned short

    // Positive infinity: 0x7C00 (sign 0, exponent all ones, mantissa 0).
    static GKO_ATTRIBUTES GKO_INLINE auto inf()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111110000000000u);
        return __half{bits};
    }

    // Largest finite value: 0x7BFF (i.e. 65504).
    static GKO_ATTRIBUTES GKO_INLINE auto max()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111101111111111u);
        return __half{bits};
    }

    // 0x0400 — presumably the smallest positive normal value, mirroring the
    // std::numeric_limits<T>::min semantics for floating-point types.
    static GKO_ATTRIBUTES GKO_INLINE auto min()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0000010000000000u);
        return __half{bits};
    }
};


Expand All @@ -34,22 +78,94 @@ struct remove_complex_impl<thrust::complex<T>> {


template <typename T>
struct is_complex_impl<thrust::complex<T>>
: public std::integral_constant<bool, true> {};
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};


template <typename T>
struct is_complex_or_scalar_impl<thrust::complex<T>> : std::is_scalar<T> {};
struct is_complex_impl<thrust::complex<T>> : public std::true_type {};

template <>
struct is_complex_or_scalar_impl<__half> : public std::true_type {};

template <typename T>
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};
struct is_complex_or_scalar_impl<thrust::complex<T>>
: public is_complex_or_scalar_impl<T> {};


} // namespace detail
} // namespace gko


// Specializations of thrust math functions for complex<__half>: the operand
// is widened to complex<float>, the operation is performed in single
// precision, and the result narrows back to half on return.
GKO_THRUST_NAEMSPACE_PREFIX
namespace thrust {


// sqrt of a half-precision complex number, computed in single precision; the
// complex<float> result converts back to complex<__half> on return.
template <>
GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt<__half>(const complex<__half>& a)
{
    return sqrt(static_cast<complex<float>>(a));
}


// |z| of a half-precision complex number, computed in single precision; the
// float result converts back to __half on return.
template <>
GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z)
{
    return abs(static_cast<complex<float>>(z));
}


}  // namespace thrust
GKO_THRUST_NAEMSPACE_POSTFIX


namespace gko {


// It is required by NVHPC 23.3: `isnan` is undefined when NVHPC is used as a
// host compiler, so these device-only overloads are provided explicitly.
#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP)

// NaN check for __half: uses the __hisnan intrinsic where available (AMD HCC
// platform, or CUDA devices with compute capability >= 5.3), otherwise falls
// back to a single-precision comparison.
__device__ __forceinline__ bool is_nan(const __half& val)
{
    // from the cuda_fp16.hpp
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __hisnan(val);
#else
    return isnan(static_cast<float>(val));
#endif
}

// A complex number is NaN if either its real or its imaginary part is NaN.
__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val)
{
    return is_nan(val.real()) || is_nan(val.imag());
}


// Absolute value of __half: uses the __habs intrinsic where available,
// otherwise computes in float and narrows back on return.
__device__ __forceinline__ __half abs(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __habs(val);
#else
    return abs(static_cast<float>(val));
#endif
}

// Square root of __half: uses the hsqrt intrinsic where available, otherwise
// computes in float and narrows back on return.
__device__ __forceinline__ __half sqrt(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return hsqrt(val);
#else
    return sqrt(static_cast<float>(val));
#endif
}


#endif


} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
22 changes: 22 additions & 0 deletions common/cuda_hip/base/thrust_macro.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_

// Thrust provides similar macros, but only in versions that support the
// wrapped-namespace feature. We define our own so the code also works with
// older Thrust versions.
// NOTE(review): "NAEMSPACE" is misspelled, but the name is kept as-is because
// existing users of these macros spell it the same way.
#ifdef THRUST_CUB_WRAPPED_NAMESPACE
// Reopen the wrapper namespace around thrust, so that declarations placed
// between PREFIX and POSTFIX land in ::THRUST_CUB_WRAPPED_NAMESPACE::thrust
// instead of the global ::thrust.
#define GKO_THRUST_NAEMSPACE_PREFIX namespace THRUST_CUB_WRAPPED_NAMESPACE {
#define GKO_THRUST_NAEMSPACE_POSTFIX }
#define GKO_THRUST_QUALIFIER ::THRUST_CUB_WRAPPED_NAMESPACE::thrust
#else
#define GKO_THRUST_NAEMSPACE_PREFIX
#define GKO_THRUST_NAEMSPACE_POSTFIX
#define GKO_THRUST_QUALIFIER ::thrust
#endif // THRUST_CUB_WRAPPED_NAMESPACE


#endif // GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
19 changes: 18 additions & 1 deletion common/cuda_hip/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_


#include "common/cuda_hip/base/math.hpp"
#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/types.hpp"
#elif defined(GKO_COMPILING_HIP)
Expand All @@ -15,4 +15,21 @@
#endif


// Defines the binary operator _op for thrust::complex<__half> by promoting
// both operands to thrust::complex<float>, performing the operation in single
// precision, and narrowing the result back to complex<__half> on return.
// NOTE(review): the _opeq parameter is not used in this macro body — confirm
// whether the compound-assignment operator was intended to be generated too.
#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                       \
    GKO_ATTRIBUTES GKO_INLINE GKO_THRUST_QUALIFIER::complex<__half>   \
    operator _op(const GKO_THRUST_QUALIFIER::complex<__half> lhs,     \
                 const GKO_THRUST_QUALIFIER::complex<__half> rhs)     \
    {                                                                 \
        return GKO_THRUST_QUALIFIER::complex<float>{                  \
            lhs} _op GKO_THRUST_QUALIFIER::complex<float>(rhs);       \
    }

THRUST_HALF_FRIEND_OPERATOR(+, +=)
THRUST_HALF_FRIEND_OPERATOR(-, -=)
THRUST_HALF_FRIEND_OPERATOR(*, *=)
THRUST_HALF_FRIEND_OPERATOR(/, /=)

#undef THRUST_HALF_FRIEND_OPERATOR


#endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
6 changes: 4 additions & 2 deletions common/cuda_hip/components/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct atomic_helper {
};


// TODO: consider it implemented by memcpy.
template <typename ResultType, typename ValueType>
__forceinline__ __device__ ResultType reinterpret(ValueType val)
{
Expand Down Expand Up @@ -101,8 +102,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if defined(CUDA_VERSION)
// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA
#if defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)
// Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC
// >= 7.0
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif

Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/components/merging.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a,
IndexType a_begin{};
IndexType b_begin{};
auto lane = static_cast<IndexType>(group.thread_rank());
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
Expand Down Expand Up @@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge(
auto c_size = a_size + b_size;
IndexType a_begin{};
IndexType b_begin{};
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin, a_size, sentinel);
auto b_cur = checked_load(b, b_begin, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/factorization/par_ict_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(

IndexType l_new_begin = l_new_row_ptrs[row];

constexpr auto sentinel = device_numeric_limits<IndexType>::max;
constexpr auto sentinel = device_numeric_limits<IndexType>::max();
// load column indices and values for the first merge step
auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
Expand Down Expand Up @@ -456,4 +456,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
} // namespace gko
Loading

0 comments on commit 76ef161

Please sign in to comment.