update for CUDA 9 / 10 (#5)
* Fp16 fixes for CUDA 9 (torch#783)

* Warp intrinsic fixes (torch#785)

* Updates for CUDA 9

* cuda 9 hgemm fix

* update with CMake 3.13, and add Turing support

* add nvcc option for half

* patch for CUDA 10

* fix cuda 10.0 patch to be able to build with 9.x

* add WARP_ANY

* fix alignment warning

* disable CudaHalfTensor as a workaround on CUDA 10.

The half structure is replaced by a half class in CUDA 10, which
conflicts with functions declared extern "C".

* Allowing larger grids for THCApply shows improved performance.

* Fix grid size for batch cat tensor now that getApplyGrid has been changed.

* fix __launch_bounds__ parameter for Turing (7.5)

* same fix for ReduceNoncontig

* introduce mask parameter to WARP_ANY

* use cudaPointerAttributes.type for checking managed memory.
cudaPointerAttributes.isManaged is deprecated in CUDA 10.

* fix cutorch_isManagedPtr

* fix bool <-> int conversion

* turn off MSVC CRT warnings.

* add Compute Capability 7.2 to SELECT_COMPUTE_ARCH

Co-authored-by: Christian Sarofeen <[email protected]>
Co-authored-by: Soumith Chintala <[email protected]>
3 people authored and elikosan committed Jan 15, 2020
1 parent b14e265 commit 5128d20
Showing 21 changed files with 347 additions and 98 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
@@ -10,11 +10,16 @@ FIND_PACKAGE(MAGMA)
IF (NOT WIN32)
SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
ENDIF (NOT WIN32)
-IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+IF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
ADD_DEFINITIONS(-DTH_GENERIC_USE_HALF=1)
ADD_DEFINITIONS(-DCUDA_HAS_FP16=1)
ENDIF()

IF(MSVC)
# turn off CRT func warnings
ADD_DEFINITIONS(-D_CRT_SECURE_NO_WARNINGS)
ENDIF(MSVC)

INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})

ADD_SUBDIRECTORY(lib)
4 changes: 4 additions & 0 deletions init.c
@@ -935,7 +935,11 @@ static int cutorch_isManagedPtr(lua_State *L)
lua_pushboolean(L, 0);
} else {
THCudaCheck(res);
#if CUDART_VERSION >= 10000
lua_pushboolean(L, (attributes.type == cudaMemoryTypeManaged) ? 1 : 0);
#else
lua_pushboolean(L, attributes.isManaged);
#endif
}
return 1;
}
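Note: the hunk above is the portability fix for cutorch_isManagedPtr.
cudaPointerAttributes.isManaged is deprecated in CUDA 10, which reports the
memory kind through the type field instead. A minimal self-contained sketch of
the same check (the helper name is illustrative, not part of this commit):

#include <cuda_runtime.h>

/* Returns 1 if ptr refers to CUDA managed (unified) memory, 0 otherwise.
   Mirrors the version guard used in the hunk above. */
static int is_managed_ptr(const void *ptr)
{
  struct cudaPointerAttributes attributes;
  cudaError_t res = cudaPointerGetAttributes(&attributes, ptr);
  if (res != cudaSuccess) {
    cudaGetLastError(); /* clear the error left by an unregistered pointer */
    return 0;
  }
#if CUDART_VERSION >= 10000
  return attributes.type == cudaMemoryTypeManaged;
#else
  return attributes.isManaged;
#endif
}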
9 changes: 5 additions & 4 deletions lib/THC/CMakeLists.txt
@@ -41,6 +41,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")


IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.7" OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "4.7" )
SET(CXX_VERSION "c++11")
@@ -203,14 +204,14 @@ endforeach()

MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})

-IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+IF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
 MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
 LIST(APPEND src-cuda THCHalf.cu)
-LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
+LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__")
 SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
-ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+ELSE(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
 MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
-ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
+ENDIF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))

MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
IF ("$ENV{STATIC_TH}" STREQUAL "YES")
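Note: the extra nvcc defines added above (__CUDA_NO_HALF_OPERATORS__,
__CUDA_NO_HALF2_OPERATORS__, __CUDA_NO_HALF_CONVERSIONS__) compile out the
operator overloads and implicit conversions that CUDA 9's cuda_fp16.h attaches
to the half type, so they cannot collide with THC's own half arithmetic
(e.g. THCNumerics<half>::add). An illustrative sketch of the effect, assuming
those flags are set:

#include <cuda_fp16.h>

__global__ void add_halves(half *out, half a, half b)
{
  // With -D__CUDA_NO_HALF_OPERATORS__ the built-in operator+ for half is
  // not declared, so `*out = a + b;` would fail to compile here.
  // Arithmetic must go through explicit intrinsics instead:
  *out = __hadd(a, b); // device intrinsic, requires __CUDA_ARCH__ >= 530
}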
31 changes: 19 additions & 12 deletions lib/THC/THCApply.cuh
@@ -14,14 +14,20 @@

// Threads per block for our apply kernel
// FIXME: use occupancy calculator instead
-#define THC_APPLY_THREADS_PER_BLOCK 32 * 16
+#if __CUDA_ARCH__ >= 750
+#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
+#define THC_APPLY_BLOCKS_PER_SM 2
+#else
+#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
+#define THC_APPLY_BLOCKS_PER_SM 4
+#endif

template <typename Op,
typename Ta,
typename IndexType,
int ADims>
#if __CUDA_ARCH__ >= 350
-__launch_bounds__(32 * 16, 4)
+__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply1(TensorInfo<Ta, IndexType> a,
@@ -43,7 +49,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
-__launch_bounds__(32 * 16, 4)
+__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply2(TensorInfo<Ta, IndexType> a,
@@ -70,7 +76,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims, int CDims>
#if __CUDA_ARCH__ >= 350
-__launch_bounds__(32 * 16, 4)
+__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply3(TensorInfo<Ta, IndexType> a,
@@ -109,16 +115,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
return false;
}

-  // Assume a reasonable number of SMs if no state is available
-  int numSM =
-    state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
+  if(THCState_getCurrentDeviceProperties(state)->major < 3){
+    grid = dim3(min((long long) THCCeilDiv(totalElements,
+                    (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
+    return true;
+  }

-  // 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
-  // which seems to be a good sweetspot for latency hiding
-  grid = dim3(min((long long) THCCeilDiv(totalElements,
-                  (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
-              4LL * numSM));
+  grid = dim3((long long) THCCeilDiv(totalElements,
+              (ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
   return true;

}

template <typename TensorTypeA,
@@ -640,5 +646,6 @@ bool THC_pointwiseApply3(THCState* state,
}

#undef THC_APPLY_THREADS_PER_BLOCK
#undef THC_APPLY_BLOCKS_PER_SM

#endif // THC_APPLY_INC
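Note: the new THC_APPLY_BLOCKS_PER_SM value encodes a hardware limit.
__launch_bounds__(maxThreads, minBlocks) asks the compiler to keep register
usage low enough for minBlocks resident blocks per SM; Turing (compute
capability 7.5) caps an SM at 1024 resident threads, so a 512-thread block
fits at most 2 blocks per SM, while most earlier architectures allow 2048
threads and hence 4 such blocks. A minimal sketch of the pattern (kernel and
macro names illustrative):

#define BLOCK_THREADS (32 * 16) // 512 threads per block
#if __CUDA_ARCH__ >= 750
#define BLOCKS_PER_SM 2         // 2 * 512 = 1024 resident threads, the Turing cap
#else
#define BLOCKS_PER_SM 4         // 4 * 512 = 2048 resident threads
#endif

__launch_bounds__(BLOCK_THREADS, BLOCKS_PER_SM)
__global__ void scale_kernel(float *out, const float *in, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = 2.0f * in[i];
}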
9 changes: 9 additions & 0 deletions lib/THC/THCAtomics.cuh
@@ -94,6 +94,7 @@ static inline __device__ void atomicAdd(long *address, long val) {
}

#ifdef CUDA_HALF_TENSOR
#if ! ( CUDA_VERSION >= 10000 && (__CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)) )
static inline __device__ void atomicAdd(half *address, half val) {
unsigned int * address_as_ui =
(unsigned int *) ((char *)address - ((size_t)address & 2));
@@ -102,14 +103,22 @@ static inline __device__ void atomicAdd(half *address, half val) {

do {
assumed = old;
#if CUDA_VERSION < 9000
half hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
hsum = THCNumerics<half>::add(hsum, val);
#else
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
half tmpres = THCNumerics<half>::add(hsum, val);
hsum = __half_raw(tmpres);
#endif
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
#endif
#endif

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// from CUDA C Programming Guide
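Note on the new outer guard: CUDA 10 ships a native __device__
atomicAdd(half *, half) when compiling for compute capability 7.0+, and its
declaration is also visible during the host pass (where __CUDA_ARCH__ is
undefined), so the hand-rolled fallback must be compiled out in both cases to
avoid a redefinition. A condensed sketch of the fallback technique, a 16-bit
add emulated with a 32-bit atomicCAS on the aligned word containing the half
(uses __hadd rather than THCNumerics, so it assumes __CUDA_ARCH__ >= 530 and
the CUDA 9/10 __half_raw type; the function name is illustrative):

#include <cuda_fp16.h>

static inline __device__ void atomic_add_half_sketch(half *address, half val)
{
  // Align down to the 32-bit word containing the 16-bit operand.
  unsigned int *word = (unsigned int *)((char *)address - ((size_t)address & 2));
  unsigned int old = *word, assumed;
  do {
    assumed = old;
    __half_raw h; // POD view of the fp16 bits
    h.x = ((size_t)address & 2) ? (old >> 16) : (old & 0xffff);
    __half_raw sum = __half_raw(__hadd(half(h), val));
    old = ((size_t)address & 2) ? (old & 0xffff) | ((unsigned int)sum.x << 16)
                                : (old & 0xffff0000) | sum.x;
    // Retry until no other thread modified the word in between.
    old = atomicCAS(word, assumed, old);
  } while (assumed != old);
}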
50 changes: 30 additions & 20 deletions lib/THC/THCBlas.cu
@@ -263,35 +263,45 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n,
cublasOperation_t opb = convertTransToCublasOperation(transb);

if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
{
int i_m = (int)m;
int i_n = (int)n;
int i_k = (int)k;
int i_lda = (int)lda;
int i_ldb = (int)ldb;
int i_ldc = (int)ldc;

cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
cublasSetStream(handle, THCState_getCurrentStream(state));

// Check for native Hgemm support
if (THC_fastHalfInstructions(state)) {
THCublasCheck(cublasHgemm(handle, opa, opb,
i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb,
&beta, c, i_ldc));
} else {
// Simulated Hgemm
float fAlpha = THC_half2float(alpha);
float fBeta = THC_half2float(beta);

#if CUDA_VERSION < 9000
THCublasCheck(cublasSgemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
#else
cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state);
if (prop->major >= 5){
THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
THCublasCheck(cublasGemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc,
CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP));
THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}else{
THCublasCheck(cublasSgemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
}
#endif
return;
}

return;
}
THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc"
"with th bound [val] <= %d", INT_MAX);
}
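Note: on CUDA 9+ the simulated-Hgemm path above moves from cublasSgemmEx to
cublasGemmEx and, on compute capability 5.0+, opts into CUBLAS_TENSOR_OP_MATH
so cuBLAS may dispatch to Tensor Cores (FP16 inputs and outputs, FP32
accumulation), restoring CUBLAS_DEFAULT_MATH afterwards. A trimmed sketch of
that call pattern (function name and the fixed alpha/beta are illustrative):

#include <cublas_v2.h>
#include <cuda_fp16.h>

// C = A * B, half in/out, float accumulate, Tensor Cores allowed (CUDA 9+).
cublasStatus_t gemm_fp16_sketch(cublasHandle_t handle, int m, int n, int k,
                                const half *A, const half *B, half *C)
{
  const float alpha = 1.0f, beta = 0.0f;
  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);  // opt in to Tensor Cores
  cublasStatus_t st = cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                   m, n, k, &alpha,
                                   A, CUDA_R_16F, m,
                                   B, CUDA_R_16F, k,
                                   &beta, C, CUDA_R_16F, m,
                                   CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP);
  cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);    // restore the default mode
  return st;
}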
68 changes: 68 additions & 0 deletions lib/THC/THCDeviceUtils.cuh
@@ -33,4 +33,72 @@ __device__ __forceinline__ T doLdg(const T* p) {
#endif
}

__device__ __forceinline__ unsigned int ACTIVE_MASK()
{
#if CUDA_VERSION >= 9000
return __activemask();
#else
// will be ignored anyway
return 0xffffffff;
#endif
}

__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __ballot_sync(mask, predicate);
#else
return __ballot(predicate);
#endif
}

template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}

template <typename T>
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_sync(mask, value, srcLane, width);
#else
return __shfl(value, srcLane, width);
#endif
}

template <typename T>
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_up_sync(mask, value, delta, width);
#else
return __shfl_up(value, delta, width);
#endif
}

template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_down_sync(mask, value, delta, width);
#else
return __shfl_down(value, delta, width);
#endif
}

__device__ __forceinline__ bool WARP_ANY(bool cond, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return (bool)__any_sync(mask, (int)cond);
#else
return __any(cond);
#endif
}

#endif // THC_DEVICE_UTILS_INC
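Note: these wrappers exist because CUDA 9 deprecated the implicit-mask warp
intrinsics (__shfl*, __ballot, __any); with Volta's independent thread
scheduling a warp is no longer guaranteed to be converged, so the _sync
variants take an explicit participation mask. A usage sketch built on the
wrappers above (warpReduceSum is illustrative, not part of this commit;
assumes THCDeviceUtils.cuh is included):

// Warp-wide sum via WARP_SHFL_DOWN; each step halves the stride, and
// lane 0 ends up holding the total for the whole warp.
template <typename T>
__device__ __forceinline__ T warpReduceSum(T val, unsigned int mask = 0xffffffff)
{
  for (int offset = warpSize / 2; offset > 0; offset /= 2)
    val += WARP_SHFL_DOWN(val, offset, warpSize, mask);
  return val;
}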
4 changes: 4 additions & 0 deletions lib/THC/THCGeneral.c
@@ -939,6 +939,7 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) {
#include "THCAllocator.c"

/* from THCHalf.h */
#ifdef CUDA_HALF_TENSOR

half THC_float2half(float f)
{
@@ -953,3 +954,6 @@ float THC_half2float(half h)
TH_halfbits2float(&h.x, &f);
return f;
}

#endif

8 changes: 7 additions & 1 deletion lib/THC/THCHalf.h
@@ -4,7 +4,7 @@
#include "THCGeneral.h"

/* We compile with CudaHalfTensor support if we have this: */
-#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
+#if (CUDA_VERSION >= 7050 && CUDA_VERSION < 10000) || CUDA_HAS_FP16
#define CUDA_HALF_TENSOR 1
#endif

@@ -13,6 +13,12 @@
#include <cuda_fp16.h>
#include <stdint.h>

#if CUDA_VERSION >= 9000
#ifndef __cplusplus
typedef __half_raw half;
#endif
#endif

THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
THC_API half THC_float2half(float a);
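Note: the #ifndef __cplusplus typedef above is the crux of the CUDA 9/10 half
problem called out in the commit message. Before CUDA 9, cuda_fp16.h defined
half as a plain C struct holding the 16 bits; CUDA 9+ turns it into a C++
class and keeps __half_raw as the POD view, so C translation units and
extern "C" interfaces must traffic in the raw type. A small sketch of crossing
that boundary from C++ (helper names illustrative, CUDA 9+):

#include <cuda_fp16.h>

unsigned short half_to_bits(half h)
{
  __half_raw r = __half_raw(h); // class -> POD via the conversion operator
  return r.x;                   // the raw fp16 bit pattern
}

half bits_to_half(unsigned short bits)
{
  __half_raw r;
  r.x = bits;                   // install the fp16 bit pattern
  return half(r);               // POD -> class via the __half_raw constructor
}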
5 changes: 5 additions & 0 deletions lib/THC/THCNumerics.cuh
@@ -111,8 +111,13 @@ struct THCNumerics<long> {
#ifdef CUDA_HALF_TENSOR
template <>
struct THCNumerics<half> {
#if CUDA_VERSION < 9000
static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
#else
static inline __host__ __device__ half min() { __half_raw h; h.x = 0xfbff; return h; }
static inline __host__ __device__ half max() { __half_raw h; h.x = 0x7bff; return h; }
#endif

static inline __host__ __device__ bool lt(half a, half b) {
#ifdef __CUDA_ARCH__
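Note: in the hunk above, 0x7bff is the bit pattern of the largest finite half
(sign 0, exponent 11110, mantissa all ones, i.e. 65504) and 0xfbff is its
negation; on CUDA 9+ the constants must be assembled through __half_raw
because half no longer exposes a public .x member. A quick host-side check,
assuming a CUDA 9+ toolkit where __half2float is host-callable:

#include <cuda_fp16.h>
#include <cstdio>

int main()
{
  __half_raw hmax; hmax.x = 0x7bff; // +65504, largest finite half
  __half_raw hmin; hmin.x = 0xfbff; // -65504, most negative finite half
  printf("%f %f\n", __half2float(half(hmax)), __half2float(half(hmin)));
  return 0; // expected output: 65504.000000 -65504.000000
}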
11 changes: 9 additions & 2 deletions lib/THC/THCReduce.cuh
@@ -12,7 +12,13 @@
#include "THCReduceApplyUtils.cuh"

// Threads per thread block
-#define THC_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
+#if __CUDA_ARCH__ >= 750
+#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
+#define THC_NONCONTIG_REDUCE_BLOCKS_PER_SM 2
+#else
+#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
+#define THC_NONCONTIG_REDUCE_BLOCKS_PER_SM 4
+#endif

template <typename IndexType>
__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
@@ -27,7 +33,7 @@ template <typename ModifyOp,
typename IndexType,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
-__launch_bounds__(32 * 16, 4)
+__launch_bounds__(THC_NONCONTIG_REDUCE_BLOCK_SIZE, THC_NONCONTIG_REDUCE_BLOCKS_PER_SM)
#endif
__global__ void
kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
@@ -324,5 +330,6 @@ bool THC_reduceDim(THCState* state,
}

#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE
#undef THC_NONCONTIG_REDUCE_BLOCKS_PER_SM

#endif // THC_REDUCE_INC
3 changes: 2 additions & 1 deletion lib/THC/THCScanUtils.cuh
@@ -2,6 +2,7 @@
#define THC_SCAN_UTILS_INC

#include "THCAsmUtils.cuh"
#include "THCDeviceUtils.cuh"

// Collection of in-kernel scan / prefix sum utilities

@@ -152,7 +153,7 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunction
template <typename T, bool KillWARDependency, class BinaryFunction>
__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
// Within-warp, we use warp voting.
-T vote = __ballot(in);
+T vote = WARP_BALLOT(in);
T index = __popc(getLaneMaskLe() & vote);
T carry = __popc(vote);

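Note: switching to WARP_BALLOT keeps this warp-vote scan correct under
CUDA 9's synchronized intrinsics. The underlying trick: each lane publishes
its boolean as one bit of the ballot word, and a lane's inclusive prefix count
is the popcount of that word restricted to lanes at or below its own. A
standalone sketch with the lane mask computed inline (helper name
illustrative, CUDA 9+):

// Inclusive prefix count of 'in' across a full warp.
__device__ __forceinline__ int warp_inclusive_prefix_count(bool in)
{
  unsigned int vote = __ballot_sync(0xffffffffu, in); // one vote bit per lane
  unsigned int lane = threadIdx.x & 31;               // lane id within the warp
  unsigned int le_mask = 0xffffffffu >> (31 - lane);  // bits for lanes <= mine
  return __popc(vote & le_mask);                      // my inclusive count
}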