Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Half matrix and components #1708

Merged
merged 16 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ option(GINKGO_BUILD_DOC "Generate documentation" OFF)
option(GINKGO_FAST_TESTS "Reduces the input size for a few tests known to be time-intensive" OFF)
option(GINKGO_TEST_NONDEFAULT_STREAM "Uses non-default streams in CUDA and HIP tests" OFF)
option(GINKGO_MIXED_PRECISION "Instantiate true mixed-precision kernels (otherwise they will be conversion-based using implicit temporary storage)" OFF)
option(GINKGO_ENABLE_HALF "Enable the use of half precision" ON)
yhmtsai marked this conversation as resolved.
Show resolved Hide resolved
# We do not support MSVC. SYCL will come later
if(MSVC OR GINKGO_BUILD_SYCL)
message(STATUS "HALF is not supported in MSVC, and later support in SYCL")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs to be rephrased since I really don't know what you mean by "and later support in SYCL".
Do you mean that SYCL does support half-precision in a later version?

Copy link
Member Author

@yhmtsai yhmtsai Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we will enable the support from #1710.
Since half is trivially copyable again now, we might not need the device_type mapping, though.

set(GINKGO_ENABLE_HALF OFF CACHE BOOL "Enable the use of half precision" FORCE)
endif()
option(GINKGO_SKIP_DEPENDENCY_UPDATE
"Do not update dependencies each time the project is rebuilt" ON)
option(GINKGO_WITH_CLANG_TIDY "Make Ginkgo call `clang-tidy` to find programming issues." OFF)
Expand Down
8 changes: 3 additions & 5 deletions accessor/reference_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,16 @@
#include "utils.hpp"


// CUDA TOOLKIT < 11 does not support constexpr in combination with
// thrust::complex, which is why constexpr is only present in later versions
#if defined(__CUDA_ARCH__) && defined(__CUDACC_VER_MAJOR__) && \
(__CUDACC_VER_MAJOR__ < 11)
// NVC++ disallows a constexpr function with a non-literal return type such as half
#if defined(__NVCOMPILER) && GINKGO_ENABLE_HALF

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR

#else

#define GKO_ACC_ENABLE_REFERENCE_CONSTEXPR constexpr

#endif // __CUDA_ARCH__ && __CUDACC_VER_MAJOR__ && __CUDACC_VER_MAJOR__ < 11
#endif


namespace gko {
Expand Down
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ function(ginkgo_benchmark_cusparse_linops type def)
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(cusparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_compile_definitions(cusparse_linops_${type} PRIVATE GKO_COMPILING_CUDA)
Expand All @@ -28,6 +31,9 @@ endfunction()
function(ginkgo_benchmark_hipsparse_linops type def)
add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp)
set_source_files_properties(utils/hip_linops.hip.cpp PROPERTIES LANGUAGE HIP)
if(GINKGO_CUDA_CUSTOM_THRUST_NAMESPACE)
target_compile_definitions(hipsparse_linops_${type} PRIVATE THRUST_CUB_WRAPPED_NAMESPACE=gko)
endif()
target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def})
target_compile_definitions(hipsparse_linops_${type} PRIVATE GKO_COMPILING_HIP)
target_include_directories(hipsparse_linops_${type} SYSTEM PRIVATE ${HIPBLAS_INCLUDE_DIRS} ${HIPSPARSE_INCLUDE_DIRS})
Expand Down
2 changes: 1 addition & 1 deletion cmake/get_info.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ foreach(log_type ${log_types})
"GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_SYCL")
ginkgo_print_module_footer(${${log_type}} " Enabled features:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI")
"GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI;GINKGO_ENABLE_HALF")
ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:")
ginkgo_print_foreach_variable(${${log_type}}
"GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING")
Expand Down
16 changes: 13 additions & 3 deletions common/cuda_hip/base/device_matrix_data_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <thrust/sort.h>
#include <thrust/tuple.h>

#include "common/cuda_hip/base/math.hpp"
#include "common/cuda_hip/base/thrust.hpp"
#include "common/cuda_hip/base/types.hpp"

Expand All @@ -22,6 +23,15 @@ namespace GKO_DEVICE_NAMESPACE {
namespace components {


// __half's `!=` operator is only available in __device__ code.
// Although gko::is_nonzero is constexpr, the compiler still reports a
// __device__ function being called from __host__ when it is instantiated with
// __half, so this __device__-only overload is provided here.
template <typename T>
GKO_INLINE __device__ constexpr bool is_nonzero(T value)
{
    // non-zero check against the additive identity of T
    return value != zero<T>();
}

template <typename ValueType, typename IndexType>
void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
array<ValueType>& values, array<IndexType>& row_idxs,
Expand Down Expand Up @@ -58,7 +68,7 @@ void remove_zeros(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_REMOVE_ZEROS_KERNEL);


Expand Down Expand Up @@ -102,7 +112,7 @@ void sum_duplicates(std::shared_ptr<const DefaultExecutor> exec, size_type,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SUM_DUPLICATES_KERNEL);


Expand All @@ -117,7 +127,7 @@ void sort_row_major(std::shared_ptr<const DefaultExecutor> exec,
it + data.get_num_stored_elements(), vals);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_DEVICE_MATRIX_DATA_SORT_ROW_MAJOR_KERNEL);


Expand Down
134 changes: 125 additions & 9 deletions common/cuda_hip/base/math.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,60 @@
#include <ginkgo/core/base/math.hpp>


#ifdef GKO_COMPILING_CUDA


#include <cuda_fp16.h>


#elif defined(GKO_COMPILING_HIP)


#include <hip/hip_fp16.h>


#endif


#include "common/cuda_hip/base/thrust_macro.hpp"


namespace gko {


// We need this struct, because otherwise we would call a __host__ function in
// a __device__ function (even though it is constexpr).
// The limits are exposed as functions rather than data members: types such as
// __half have no constexpr constructor, so a `static constexpr` data member
// could not be initialized for them.
template <typename T>
struct device_numeric_limits {
    static constexpr auto inf() { return std::numeric_limits<T>::infinity(); }
    static constexpr auto max() { return std::numeric_limits<T>::max(); }
    static constexpr auto min() { return std::numeric_limits<T>::min(); }
};

// Specialization for __half: the bit patterns are written directly through
// __half_raw, because __half and __half_raw do not have constexpr
// constructors.
template <>
struct device_numeric_limits<__half> {
    // from the __half documentation, __half_raw::x accepts unsigned short

    // positive infinity: sign 0, exponent all ones, mantissa zero
    static GKO_ATTRIBUTES GKO_INLINE auto inf()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111110000000000u);
        return __half{bits};
    }

    // largest finite value: maximal exponent (below inf), mantissa all ones
    static GKO_ATTRIBUTES GKO_INLINE auto max()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0111101111111111u);
        return __half{bits};
    }

    // smallest positive normal value: minimal nonzero exponent, mantissa zero
    static GKO_ATTRIBUTES GKO_INLINE auto min()
    {
        __half_raw bits;
        bits.x = static_cast<unsigned short>(0b0000010000000000u);
        return __half{bits};
    }
};


Expand All @@ -34,22 +78,94 @@ struct remove_complex_impl<thrust::complex<T>> {


template <typename T>
struct is_complex_impl<thrust::complex<T>>
: public std::integral_constant<bool, true> {};
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};


template <typename T>
struct is_complex_or_scalar_impl<thrust::complex<T>> : std::is_scalar<T> {};
struct is_complex_impl<thrust::complex<T>> : public std::true_type {};

template <>
struct is_complex_or_scalar_impl<__half> : public std::true_type {};

template <typename T>
struct truncate_type_impl<thrust::complex<T>> {
using type = thrust::complex<typename truncate_type_impl<T>::type>;
};
struct is_complex_or_scalar_impl<thrust::complex<T>>
: public is_complex_or_scalar_impl<T> {};


} // namespace detail
} // namespace gko


GKO_THRUST_NAEMSPACE_PREFIX
namespace thrust {


// Specialization of thrust::sqrt for complex<__half>: compute in
// complex<float> precision and narrow the result back to __half on return.
template <>
GKO_ATTRIBUTES GKO_INLINE complex<__half> sqrt<__half>(const complex<__half>& a)
{
    return sqrt(static_cast<complex<float>>(a));
}


// Specialization of thrust::abs for complex<__half>: the magnitude is
// computed in float precision and converted back to __half.
template <>
GKO_ATTRIBUTES GKO_INLINE __half abs<__half>(const complex<__half>& z)
{
    return abs(static_cast<complex<float>>(z));
}


}  // namespace thrust
GKO_THRUST_NAEMSPACE_POSTFIX


namespace gko {


// It is required by NVHPC 23.3: `isnan` is undefined when NVHPC is used as a
// host compiler.
#if defined(__CUDACC__) || defined(GKO_COMPILING_HIP)

// NaN check for __half. Uses the half-precision intrinsic where available
// (HIP/AMD, or CUDA with __CUDA_ARCH__ >= 530); otherwise falls back to a
// float conversion.
__device__ __forceinline__ bool is_nan(const __half& val)
{
    // from the cuda_fp16.hpp
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __hisnan(val);
#else
    return isnan(static_cast<float>(val));
#endif
}

// A complex value is NaN if either its real or imaginary part is NaN.
__device__ __forceinline__ bool is_nan(const thrust::complex<__half>& val)
{
    return is_nan(val.real()) || is_nan(val.imag());
}


// Absolute value of __half via the __habs intrinsic where available,
// float fallback otherwise.
__device__ __forceinline__ __half abs(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return __habs(val);
#else
    return abs(static_cast<float>(val));
#endif
}

// Square root of __half via the hsqrt intrinsic where available,
// float fallback otherwise.
__device__ __forceinline__ __half sqrt(const __half& val)
{
#if GINKGO_HIP_PLATFORM_HCC || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)
    return hsqrt(val);
#else
    return sqrt(static_cast<float>(val));
#endif
}


#endif


} // namespace gko


#endif // GKO_COMMON_CUDA_HIP_BASE_MATH_HPP_
22 changes: 22 additions & 0 deletions common/cuda_hip/base/thrust_macro.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#ifndef GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_

// Although thrust provides similar macros, they are only defined in versions
// that support the wrapped-namespace feature. We therefore provide our own
// macros so that the code also works with older thrust versions.
// NOTE(review): "NAEMSPACE" is a typo for "NAMESPACE"; it is kept as-is here
// because other files already reference these exact spellings.
#ifdef THRUST_CUB_WRAPPED_NAMESPACE
#define GKO_THRUST_NAEMSPACE_PREFIX namespace THRUST_CUB_WRAPPED_NAMESPACE {
#define GKO_THRUST_NAEMSPACE_POSTFIX }
#define GKO_THRUST_QUALIFIER ::THRUST_CUB_WRAPPED_NAMESPACE::thrust
#else
#define GKO_THRUST_NAEMSPACE_PREFIX
#define GKO_THRUST_NAEMSPACE_POSTFIX
#define GKO_THRUST_QUALIFIER ::thrust
#endif  // THRUST_CUB_WRAPPED_NAMESPACE


#endif  // GKO_COMMON_CUDA_HIP_BASE_THRUST_MACRO_HPP_
19 changes: 18 additions & 1 deletion common/cuda_hip/base/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
#define GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_


#include "common/cuda_hip/base/math.hpp"
#if defined(GKO_COMPILING_CUDA)
#include "cuda/base/types.hpp"
#elif defined(GKO_COMPILING_HIP)
Expand All @@ -15,4 +15,21 @@
#endif


// thrust::complex<__half> arithmetic: promote both operands to
// complex<float>, apply the operator there, and narrow the result back to
// complex<__half> on return. The _opeq parameter mirrors the corresponding
// compound-assignment operator for documentation purposes; only _op is used.
#define THRUST_HALF_FRIEND_OPERATOR(_op, _opeq)                      \
    GKO_ATTRIBUTES GKO_INLINE GKO_THRUST_QUALIFIER::complex<__half>  \
    operator _op(const GKO_THRUST_QUALIFIER::complex<__half> lhs,    \
                 const GKO_THRUST_QUALIFIER::complex<__half> rhs)    \
    {                                                                \
        return GKO_THRUST_QUALIFIER::complex<float>{                 \
            lhs} _op GKO_THRUST_QUALIFIER::complex<float>(rhs);      \
    }

THRUST_HALF_FRIEND_OPERATOR(+, +=)
THRUST_HALF_FRIEND_OPERATOR(-, -=)
THRUST_HALF_FRIEND_OPERATOR(*, *=)
THRUST_HALF_FRIEND_OPERATOR(/, /=)

#undef THRUST_HALF_FRIEND_OPERATOR


#endif // GKO_COMMON_CUDA_HIP_BASE_TYPES_HPP_
6 changes: 4 additions & 2 deletions common/cuda_hip/components/atomic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct atomic_helper {
};


// TODO: consider it implemented by memcpy.
template <typename ResultType, typename ValueType>
__forceinline__ __device__ ResultType reinterpret(ValueType val)
{
Expand Down Expand Up @@ -101,8 +102,9 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned long long int);
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int);


#if defined(CUDA_VERSION)
// Support 16-bit ATOMIC_ADD and ATOMIC_MAX only on CUDA
#if defined(CUDA_VERSION) && (__CUDA_ARCH__ >= 700)
// Support 16-bit atomicCAS, atomicADD, and atomicMAX only on CUDA with CC
// >= 7.0
GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int);
#endif

Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/components/merging.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ __forceinline__ __device__ void group_merge(const ValueType* __restrict__ a,
IndexType a_begin{};
IndexType b_begin{};
auto lane = static_cast<IndexType>(group.thread_rank());
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin + lane, a_size, sentinel);
auto b_cur = checked_load(b, b_begin + lane, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin += group_size) {
Expand Down Expand Up @@ -240,7 +240,7 @@ __forceinline__ __device__ void sequential_merge(
auto c_size = a_size + b_size;
IndexType a_begin{};
IndexType b_begin{};
auto sentinel = device_numeric_limits<IndexType>::max;
auto sentinel = device_numeric_limits<IndexType>::max();
auto a_cur = checked_load(a, a_begin, a_size, sentinel);
auto b_cur = checked_load(b, b_begin, b_size, sentinel);
for (IndexType c_begin{}; c_begin < c_size; c_begin++) {
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/factorization/par_ict_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ __global__ __launch_bounds__(default_block_size) void ict_tri_spgeam_init(

IndexType l_new_begin = l_new_row_ptrs[row];

constexpr auto sentinel = device_numeric_limits<IndexType>::max;
constexpr auto sentinel = device_numeric_limits<IndexType>::max();
// load column indices and values for the first merge step
auto a_col = checked_load(a_col_idxs, a_begin + lane, a_end, sentinel);
auto a_val = checked_load(a_vals, a_begin + lane, a_end, zero<ValueType>());
Expand Down Expand Up @@ -456,4 +456,4 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
} // namespace par_ict_factorization
} // namespace GKO_DEVICE_NAMESPACE
} // namespace kernels
} // namespace gko
} // namespace gko
Loading
Loading