Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update for CUDA 9 / 10 #5

Merged
merged 25 commits into from
Jan 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
2cdea63
Fp16 fixes for CUDA 9 (#783)
csarofeen Jun 26, 2017
68170ce
Warp intrinsic fixes (#785)
Jun 29, 2017
e905454
Updates for CUDA 9
csarofeen Jul 19, 2017
b629e33
cuda 9 hgemm fix
soumith Aug 25, 2017
a06460a
update with CMake 3.13, and add Turing support
gkanno Nov 30, 2018
51efac5
add nvcc option for half
gkanno Nov 30, 2018
c09c92f
patch for CUDA 10
gkanno Dec 3, 2018
e7fed5b
fix cuda 10.0 patch to be able to build with 9.x
gkanno Feb 25, 2019
4b168ba
add WARP_ANY
gkanno Feb 25, 2019
f944107
fix alignment warning
Sep 11, 2017
49dc78f
disable CudaHalfTensor for workaround on CUDA 10.
gkanno Jun 4, 2019
71c4469
Allowing larger grids for THCApply shows improved performance.
csarofeen Jul 21, 2017
352b446
Fix grid size for batch cat tensor now that getApplyGrid has been cha…
csarofeen Aug 28, 2017
aee45ce
fix __launch_bounds__ parameter for Turing(7.5)
gkanno Jun 10, 2019
7165723
same to ReduceNoncontig
gkanno Jun 10, 2019
27ba716
introduce mask parameter to WARP_ANY
gkanno Jun 11, 2019
4d6dc70
use cudaPointerAttributes.type for checking managed memory.
gkanno Jun 11, 2019
3424b7c
fix cutorch_isManagedPtr
gkanno Jun 11, 2019
4992a6f
fix __launch_bounds__ parameter for Turing(7.5)
gkanno Jun 10, 2019
8fbc9cb
introduce mask parameter to WARP_ANY
gkanno Jun 11, 2019
a7d47fc
use cudaPointerAttributes.type for checking managed memory.
gkanno Jun 11, 2019
cd8b9ef
fix bool <-> int conversion
gkanno Jun 11, 2019
6f33031
turn off CRT warnings of MSVC.
gkanno Jun 11, 2019
e04fdce
Merge branch '2017-06-01' of https://github.com/gkanno/cutorch into 2…
gkanno Jun 11, 2019
55100c0
add Compute Capability 7.2 to SELECT_COMPUTE_ARCH
gkanno Jun 12, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ FIND_PACKAGE(MAGMA)
IF (NOT WIN32)
SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration ${CMAKE_C_FLAGS}")
ENDIF (NOT WIN32)
IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
IF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
ADD_DEFINITIONS(-DTH_GENERIC_USE_HALF=1)
ADD_DEFINITIONS(-DCUDA_HAS_FP16=1)
ENDIF()

IF(MSVC)
# turn off CRT func warnings
ADD_DEFINITIONS(-D_CRT_SECURE_NO_WARNINGS)
ENDIF(MSVC)

INCLUDE_DIRECTORIES(${CUDA_INCLUDE_DIRS})

ADD_SUBDIRECTORY(lib)
Expand Down
4 changes: 4 additions & 0 deletions init.c
Original file line number Diff line number Diff line change
Expand Up @@ -935,7 +935,11 @@ static int cutorch_isManagedPtr(lua_State *L)
lua_pushboolean(L, 0);
} else {
THCudaCheck(res);
#if CUDART_VERSION >= 10000
lua_pushboolean(L, (attributes.type == cudaMemoryTypeManaged) ? 1 : 0);
#else
lua_pushboolean(L, attributes.isManaged);
#endif
}
return 1;
}
Expand Down
9 changes: 5 additions & 4 deletions lib/THC/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
endif(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.9.3")
endif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")


IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
IF(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "4.7" OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL "4.7" )
SET(CXX_VERSION "c++11")
Expand Down Expand Up @@ -203,14 +204,14 @@ endforeach()

MESSAGE(STATUS "got cuda version " ${CUDA_VERSION})

IF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
IF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
MESSAGE(STATUS "Found CUDA with FP16 support, compiling with torch.CudaHalfTensor")
LIST(APPEND src-cuda THCHalf.cu)
LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1")
LIST(APPEND CUDA_NVCC_FLAGS "-DCUDA_HAS_FP16=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF2_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__")
SET(CMAKE_C_FLAGS "-DCUDA_HAS_FP16=1 ${CMAKE_C_FLAGS}")
ELSE(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
ELSE(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))
MESSAGE(STATUS "Could not find CUDA with FP16 support, compiling without torch.CudaHalfTensor")
ENDIF(CUDA_HAS_FP16 OR NOT ${CUDA_VERSION} LESS 7.5)
ENDIF(CUDA_HAS_FP16 OR (NOT ${CUDA_VERSION} LESS 7.5 AND ${CUDA_VERSION} LESS 10.0))

MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
IF ("$ENV{STATIC_TH}" STREQUAL "YES")
Expand Down
31 changes: 19 additions & 12 deletions lib/THC/THCApply.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,20 @@

// Threads per block for our apply kernel
// FIXME: use occupancy calculator instead
#define THC_APPLY_THREADS_PER_BLOCK 32 * 16
#if __CUDA_ARCH__ >= 750
#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
#define THC_APPLY_BLOCKS_PER_SM 2
#else
#define THC_APPLY_THREADS_PER_BLOCK (32 * 16)
#define THC_APPLY_BLOCKS_PER_SM 4
#endif

template <typename Op,
typename Ta,
typename IndexType,
int ADims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply1(TensorInfo<Ta, IndexType> a,
Expand All @@ -43,7 +49,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply2(TensorInfo<Ta, IndexType> a,
Expand All @@ -70,7 +76,7 @@ template <typename Op,
typename IndexType,
int ADims, int BDims, int CDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
__launch_bounds__(THC_APPLY_THREADS_PER_BLOCK, THC_APPLY_BLOCKS_PER_SM)
#endif
__global__ void
kernelPointwiseApply3(TensorInfo<Ta, IndexType> a,
Expand Down Expand Up @@ -109,16 +115,16 @@ inline bool getApplyGrid(THCState* state, ptrdiff_t totalElements, dim3& grid) {
return false;
}

// Assume a reasonable number of SMs if no state is available
int numSM =
state ? THCState_getCurrentDeviceProperties(state)->multiProcessorCount : 15;
if(THCState_getCurrentDeviceProperties(state)->major < 3){
grid = dim3(min((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK), (long long) 64*1024-1));
return true;
}

// 16 warps per block * 4 per SM gives 64 warps per SM at maximum,
// which seems to be a good sweetspot for latency hiding
grid = dim3(min((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK),
4LL * numSM));
grid = dim3((long long) THCCeilDiv(totalElements,
(ptrdiff_t) THC_APPLY_THREADS_PER_BLOCK) );
return true;

}

template <typename TensorTypeA,
Expand Down Expand Up @@ -640,5 +646,6 @@ bool THC_pointwiseApply3(THCState* state,
}

#undef THC_APPLY_THREADS_PER_BLOCK
#undef THC_APPLY_BLOCKS_PER_SM

#endif // THC_APPLY_INC
9 changes: 9 additions & 0 deletions lib/THC/THCAtomics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ static inline __device__ void atomicAdd(long *address, long val) {
}

#ifdef CUDA_HALF_TENSOR
#if ! ( CUDA_VERSION >= 10000 && (__CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)) )
static inline __device__ void atomicAdd(half *address, half val) {
unsigned int * address_as_ui =
(unsigned int *) ((char *)address - ((size_t)address & 2));
Expand All @@ -102,14 +103,22 @@ static inline __device__ void atomicAdd(half *address, half val) {

do {
assumed = old;
#if CUDA_VERSION < 9000
half hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
hsum = THCNumerics<half>::add(hsum, val);
#else
__half_raw hsum;
hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
half tmpres = THCNumerics<half>::add(hsum, val);
hsum = __half_raw(tmpres);
#endif
old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
old = atomicCAS(address_as_ui, assumed, old);
} while (assumed != old);
}
#endif
#endif

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// from CUDA C Programmic Guide
Expand Down
50 changes: 30 additions & 20 deletions lib/THC/THCBlas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -263,35 +263,45 @@ void THCudaBlas_Hgemm(THCState *state, char transa, char transb, long m, long n,
cublasOperation_t opb = convertTransToCublasOperation(transb);

if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
{
int i_m = (int)m;
int i_n = (int)n;
int i_k = (int)k;
int i_lda = (int)lda;
int i_ldb = (int)ldb;
int i_ldc = (int)ldc;
{
int i_m = (int)m;
int i_n = (int)n;
int i_k = (int)k;
int i_lda = (int)lda;
int i_ldb = (int)ldb;
int i_ldc = (int)ldc;

cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
cublasSetStream(handle, THCState_getCurrentStream(state));
cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
cublasSetStream(handle, THCState_getCurrentStream(state));

// Check for native Hgemm support
if (THC_fastHalfInstructions(state)) {
THCublasCheck(cublasHgemm(handle, opa, opb,
i_m, i_n, i_k, &alpha, a, i_lda, b, i_ldb,
&beta, c, i_ldc));
} else {
// Simulated Hgemm
float fAlpha = THC_half2float(alpha);
float fBeta = THC_half2float(beta);

#if CUDA_VERSION < 9000
THCublasCheck(cublasSgemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
#else
cudaDeviceProp* prop = THCState_getCurrentDeviceProperties(state);
if (prop->major >= 5){
THCublasCheck(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
THCublasCheck(cublasGemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc,
CUDA_R_32F, CUBLAS_GEMM_DFALT_TENSOR_OP));
THCublasCheck(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}else{
THCublasCheck(cublasSgemmEx(handle, opa, opb,
i_m, i_n, i_k, &fAlpha,
a, CUDA_R_16F, i_lda, b, CUDA_R_16F,
i_ldb, &fBeta, c, CUDA_R_16F, i_ldc));
}
#endif
return;
}

return;
}
THError("Cublas_Hgemm only supports m, n, k, lda, ldb, ldc"
"with th bound [val] <= %d", INT_MAX);
}
Expand Down
68 changes: 68 additions & 0 deletions lib/THC/THCDeviceUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,72 @@ __device__ __forceinline__ T doLdg(const T* p) {
#endif
}

// Mask of warp lanes currently converged with the caller.
// On CUDA 9+ this queries the hardware; earlier toolkits have no masked
// warp intrinsics, so the value is never consulted and a full warp is
// reported instead.
__device__ __forceinline__ unsigned int ACTIVE_MASK() {
#if CUDA_VERSION >= 9000
  return __activemask();
#else
  // Pre-CUDA-9 intrinsics take no mask argument; callers ignore this value.
  return 0xffffffffu;
#endif
}

// Warp vote: returns a bitmask with one bit per lane whose predicate is
// non-zero. `mask` names the participating lanes (CUDA 9+ only; the legacy
// intrinsic has no mask parameter).
__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __ballot_sync(mask, predicate);
#else
  // Implicit warp synchrony pre-Volta; the mask is accepted but unused.
  (void)mask;
  return __ballot(predicate);
#endif
}

// Butterfly shuffle: each lane exchanges `value` with the lane at
// (laneId XOR laneMask) within a group of `width` lanes. `mask` selects the
// participating lanes on CUDA 9+.
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  // Legacy intrinsic (removed for Volta+); no mask parameter exists.
  (void)mask;
  return __shfl_xor(value, laneMask, width);
#endif
}

// Broadcast shuffle: every participating lane reads `value` from `srcLane`
// within a group of `width` lanes. `mask` selects the participating lanes on
// CUDA 9+.
template <typename T>
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, value, srcLane, width);
#else
  // Legacy intrinsic (removed for Volta+); no mask parameter exists.
  (void)mask;
  return __shfl(value, srcLane, width);
#endif
}

// Up-shuffle: each lane reads `value` from the lane `delta` positions below
// it within a group of `width` lanes (lower lanes keep their own value).
// `mask` selects the participating lanes on CUDA 9+.
template <typename T>
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_up_sync(mask, value, delta, width);
#else
  // Legacy intrinsic (removed for Volta+); no mask parameter exists.
  (void)mask;
  return __shfl_up(value, delta, width);
#endif
}

// Down-shuffle: each lane reads `value` from the lane `delta` positions above
// it within a group of `width` lanes (upper lanes keep their own value).
// `mask` selects the participating lanes on CUDA 9+.
template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __shfl_down_sync(mask, value, delta, width);
#else
  // Legacy intrinsic (removed for Volta+); no mask parameter exists.
  (void)mask;
  return __shfl_down(value, delta, width);
#endif
}

// Warp vote: true iff `cond` is true on any participating lane. `mask`
// selects the participating lanes on CUDA 9+ (the legacy intrinsic votes
// over the whole warp implicitly).
__device__ __forceinline__ bool WARP_ANY(bool cond, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
  return __any_sync(mask, cond ? 1 : 0) != 0;
#else
  // Implicit warp synchrony pre-Volta; the mask is accepted but unused.
  (void)mask;
  return __any(cond);
#endif
}

#endif // THC_DEVICE_UTILS_INC
4 changes: 4 additions & 0 deletions lib/THC/THCGeneral.c
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,7 @@ void THCHeapUpdate(THCState *state, ptrdiff_t size) {
#include "THCAllocator.c"

/* from THCHalf.h */
#ifdef CUDA_HALF_TENSOR

half THC_float2half(float f)
{
Expand All @@ -953,3 +954,6 @@ float THC_half2float(half h)
TH_halfbits2float(&h.x, &f);
return f;
}

#endif

8 changes: 7 additions & 1 deletion lib/THC/THCHalf.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "THCGeneral.h"

/* We compile with CudaHalfTensor support if we have this: */
#if CUDA_VERSION >= 7050 || CUDA_HAS_FP16
#if (CUDA_VERSION >= 7050 && CUDA_VERSION < 10000) || CUDA_HAS_FP16
#define CUDA_HALF_TENSOR 1
#endif

Expand All @@ -13,6 +13,12 @@
#include <cuda_fp16.h>
#include <stdint.h>

#if CUDA_VERSION >= 9000
#ifndef __cplusplus
typedef __half_raw half;
#endif
#endif

THC_EXTERNC void THCFloat2Half(THCState *state, half *out, float *in, ptrdiff_t len);
THC_EXTERNC void THCHalf2Float(THCState *state, float *out, half *in, ptrdiff_t len);
THC_API half THC_float2half(float a);
Expand Down
5 changes: 5 additions & 0 deletions lib/THC/THCNumerics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,13 @@ struct THCNumerics<long> {
#ifdef CUDA_HALF_TENSOR
template <>
struct THCNumerics<half> {
#if CUDA_VERSION < 9000
static inline __host__ __device__ half min() { half h; h.x = 0xfbff; return h; }
static inline __host__ __device__ half max() { half h; h.x = 0x7bff; return h; }
#else
static inline __host__ __device__ half min() { __half_raw h; h.x = 0xfbff; return h; }
static inline __host__ __device__ half max() { __half_raw h; h.x = 0x7bff; return h; }
#endif

static inline __host__ __device__ bool lt(half a, half b) {
#ifdef __CUDA_ARCH__
Expand Down
11 changes: 9 additions & 2 deletions lib/THC/THCReduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@
#include "THCReduceApplyUtils.cuh"

// Threads per thread block
#define THC_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16
#if __CUDA_ARCH__ >= 750
#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
#define THC_NONCONTIG_REDUCE_BLOCKS_PER_SM 2
#else
#define THC_NONCONTIG_REDUCE_BLOCK_SIZE (32 * 16)
#define THC_NONCONTIG_REDUCE_BLOCKS_PER_SM 4
#endif

template <typename IndexType>
__device__ __forceinline__ IndexType getReduceNoncontigDimSliceIndex() {
Expand All @@ -27,7 +33,7 @@ template <typename ModifyOp,
typename IndexType,
int ADims, int BDims>
#if __CUDA_ARCH__ >= 350
__launch_bounds__(32 * 16, 4)
__launch_bounds__(THC_NONCONTIG_REDUCE_BLOCK_SIZE, THC_NONCONTIG_REDUCE_BLOCKS_PER_SM)
#endif
__global__ void
kernelReduceNoncontigDim(TensorInfo<T, IndexType> out,
Expand Down Expand Up @@ -324,5 +330,6 @@ bool THC_reduceDim(THCState* state,
}

#undef THC_NONCONTIG_REDUCE_BLOCK_SIZE
#undef THC_NONCONTIG_REDUCE_BLOCKS_PER_SM

#endif // THC_REDUCE_INC
3 changes: 2 additions & 1 deletion lib/THC/THCScanUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define THC_SCAN_UTILS_INC

#include "THCAsmUtils.cuh"
#include "THCDeviceUtils.cuh"

// Collection of in-kernel scan / prefix sum utilities

Expand Down Expand Up @@ -152,7 +153,7 @@ __device__ void exclusivePrefixScan(T* smem, T in, T* out, T* carry, BinaryFunct
template <typename T, bool KillWARDependency, class BinaryFunction>
__device__ void inclusiveBinaryPrefixScan(T* smem, bool in, T* out, BinaryFunction binop) {
// Within-warp, we use warp voting.
T vote = __ballot(in);
T vote = WARP_BALLOT(in);
T index = __popc(getLaneMaskLe() & vote);
T carry = __popc(vote);

Expand Down
Loading