Debugging GPU implementation
p-sto committed Nov 30, 2017
1 parent e94ff3f commit 5b56eec
Showing 12 changed files with 171 additions and 66 deletions.
8 changes: 6 additions & 2 deletions Makefile
@@ -40,11 +40,12 @@ LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS
LDFLAGS += -liomp5 -lpthread -lm -ldl

# add cuda flags
LDFLAGS += -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse
# -DMKL_ILP64 sets integers to 64-bit and has to be passed to both gcc and nvcc
CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64

$(TARGET): $(OBJECTS)
@echo " Linking..."
$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS)
$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)

$(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
$(NVCC) $(CUDAFLAGS) -c -o $@ $<
@@ -60,4 +61,7 @@ clean:
@echo " Cleaning...";
@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)

cuda_info:
nvidia-smi -a

.PHONY: default all clean
66 changes: 49 additions & 17 deletions README.rst
@@ -128,39 +128,71 @@ Examples can be found in ``scripts/ConjugateGradients/demo.py``
Requires Python 3.5+


C implementation - TBA
CPU/GPU implementation
----------------------

MKL implementation
~~~~~~~~~~~~~~~~~~
Libraries and compilation
~~~~~~~~~~~~~~~~~~~~~~~~~

Required - Intel MKL library for BLAS operations. The implementation was tested on version 2017, though older versions should work as well.
By default MKL is installed in ``/opt/intel/mkl/``.

To compile:
Before compiling the code, make sure you have installed:

::

$ source mkl_setup.sh
$ make
1. Intel MKL library
2. NVIDIA CUDA toolkit with the NVCC compiler

``mkl_setup.sh`` sources MKL env configuration.
The Intel MKL library is used for BLAS operations. The implementation was tested on version 2017, though older versions should work as well.
By default MKL is installed in ``/opt/intel/mkl/``. Before compiling, make sure ``prepare_env.sh`` contains the proper
paths to the MKL and CUDA libraries.

In the Makefile, set the following variables accordingly (example values are shown below the list):

::

If you have installed MKL in a different directory, you will have to adjust the mkl_setup.sh
script to point to the proper paths.
Also note that the Makefile has an MKLROOT variable which points to the MKL installation directory.
1. MKLROOT
2. NVCC
3. CUDALIBPATH
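
For illustration only, these variables might be set like the sketch below; the exact values depend on where MKL and
CUDA are installed on your system (the paths shown match the defaults assumed by ``prepare_env.sh``):

::

MKLROOT = /opt/intel/mkl
NVCC = nvcc
CUDALIBPATH = /usr/local/cuda/lib64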

By default MKL is linked as a static library.
Since there are many dependencies it is good to set ``CFLAGS`` and ``LDFLAGS`` according to the MKL link line advisor:
By default MKL is linked as a static library; CUDA is linked dynamically.
``LDFLAGS`` sets the dependencies for MKL; please refer to the MKL link line advisor to make sure it is set properly:

https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
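
For example, a static, ILP64, Intel-OpenMP configuration suggested by the advisor looks roughly like the sketch
below (the exact library list is an assumption here; always check the advisor output for your MKL version):

::

LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group
LDFLAGS += -liomp5 -lpthread -lm -ldl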

``CUDAFLAGS`` is responsible for setting the CUDA libraries.

``GCC`` is used to compile .c files and ``NVCC`` to compile .cu files. The whole project is linked with ``GCC``.

To compile:

::

$ source prepare_env.sh
$ make

Use the ``make clean`` command to delete the compiled build.

CUDA implementation - TBA
-------------------------
Running ConjugateGradient
~~~~~~~~~~~~~~~~~~~~~~~~~

Running the single-core CPU MKL implementation:

``./ConjugateGradient -i input_matrix.txt``

Running the multi-core CPU MKL implementation:

``./ConjugateGradient -i input_matrix.txt -mt 4``

Running the GPU implementation (only a single device is supported):

``./ConjugateGradient -i input_matrix.txt --gpu``

::

If there are no CUDA devices available, the CPU implementation will be launched.

``input_matrix.txt`` is expected to be a CSR-formatted matrix; example matrices can be generated with the Python scripts.
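
For reference, the sketch below (assuming ``scipy`` is available) shows the three arrays that make up a CSR matrix.
It only illustrates the storage scheme; actual input files should be produced with the project's Python scripts:

::

import numpy as np
from scipy.sparse import csr_matrix

# Small symmetric positive-definite example matrix, stored densely first.
dense = np.array([[4.0, 1.0, 0.0],
                  [1.0, 3.0, 0.0],
                  [0.0, 0.0, 2.0]])

csr = csr_matrix(dense)
print(csr.data)     # non-zero values
print(csr.indices)  # column index of each non-zero value
print(csr.indptr)   # row pointers, length = number of rows + 1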



Conjugate Gradients description
-------------------------------
2 changes: 0 additions & 2 deletions mkl_setup.sh

This file was deleted.

4 changes: 4 additions & 0 deletions prepare_env.sh
@@ -0,0 +1,4 @@
source /opt/intel/mkl/bin/mklvars.sh intel64
source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
export PATH=$PATH:/usr/local/cuda/bin
106 changes: 69 additions & 37 deletions src/cg_colver_gpu.cu
@@ -1,19 +1,21 @@
/*Contains implementation for gpu_cg_solver functions.*/


#include "cg_colver_gpu.h"
#include <stdio.h>
#include "ckernels.h"


extern "C"
{
#include "gpu_utils.h"
#include "utils.h"

#include "gpu_utils.h"
#include "cg_colver_gpu.h"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
}

#define threadsPerBlock 256
#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status))

#define FREE_DEVICE_STACK \
cudaFree(d_r);\
@@ -29,21 +31,20 @@ extern "C"
cudaFree(d_beta);\
cudaFree(d_alfa);\
cudaFree(d_alpha_zero);\
cudaFree(d_dot);\
cudaFree(d_dot_new);\
cudaFree(d_norm);\
cudaFree(d_dot_zero);\
cudaFree(d_dot_old);\
cudaFree(d_dTq);


int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data gpu_data){
int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
/*Single GPU CG solver using cublas*/

double *h_dot, *h_dot_zero;
int *d_I = NULL, *d_J = NULL;
const double tol = 1e-2f;
double *d_alfa, *d_beta, *d_alpha_zero;
double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
int k, max_iter;

k = 0;
@@ -52,41 +53,55 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
max_iter = 200;

size_t size = matrix->size * sizeof(double);
size_t d_size = sizeof(double);

cusparseHandle_t cusparseHandle = 0;
cublasHandle_t cublasHandle = 0;
cusparseCreate(&cusparseHandle);

cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);

cublasHandle_t cublasHandle = 0;
cublasCreate(&cublasHandle);

cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

cublasStatus_t cublasStatus;

printf("Mallocing CUDA divice memory\n");
cudaMalloc((void **)&d_r, size);
cudaMalloc((void **)&d_helper, size);
cudaMalloc((void **)&d_x, size);
cudaMalloc((void **)&d_rhs, size);
cudaMalloc((void **)&d_d, size);
cudaMalloc((void **)&d_Ax, size);
cudaMalloc((void **)&d_q, size);

cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double));
cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(double));
cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int));
cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int));

cudaMalloc((void **)&d_beta, sizeof(double));
cudaMalloc((void **)&d_alfa, sizeof(double));
cudaMalloc((void **)&d_alpha_zero, sizeof(double));
cudaMalloc((void **)&d_dot, sizeof(double));
cudaMalloc((void **)&d_dot_zero, sizeof(double));
cudaMalloc((void **)&d_norm, sizeof(double));

cudaMalloc((void **)&d_beta, d_size);
cudaMalloc((void **)&d_alfa, d_size);
cudaMalloc((void **)&d_alpha_zero, d_size);
cudaMalloc((void **)&d_dot_new, d_size);
cudaMalloc((void **)&d_dot_zero, d_size);
cudaMalloc((void **)&d_norm, d_size);

cudaMemset(d_beta, 0, d_size);
cudaMemset(d_alfa, 0, d_size);
cudaMemset(d_alpha_zero, 0, d_size);
cudaMemset(d_dot_new, 0, d_size);
cudaMemset(d_dot_zero, 0, d_size);
cudaMemset(d_norm, 0, d_size);

printf("Copying to device\n");
cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs, rhs, size, cudaMemcpyHostToDevice);

int blocksPerGrid = ((matrix->size + threadsPerBlock -1) / threadsPerBlock );
int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
while (blocksPerGrid % threadsPerBlock != 0){
blocksPerGrid++;
}
@@ -96,43 +111,60 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
const double one = 1.0;
const double minus_one = -1.0;
/*Calculate Ax matrix*/

cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
/*Calculate rhs=rhs-Ax matrix*/
cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG: Copy updated rhs (residuum) to d vector*/
cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1);
/*CG: calculate dot r'*r, assign it to dot_new */
cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_rhs, 1, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG: calculate dot r'*r, assign it to d_dot_new */
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
CHECK_FOR_STATUS(cublasStatus);

/*assign dot_new to dot_zero*/
d_dot_zero = d_dot;
cudaMemcpy(h_dot, d_dot, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(d_dot_zero, d_dot_new, sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(h_dot_zero, d_dot_zero, sizeof(double), cudaMemcpyDeviceToHost);
while ((*h_dot > tol * tol * *h_dot_zero) && (k < max_iter)) {
/*Calculate q=A*d vector*/
cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
/*Calculate alpha:*/
cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_alfa, d_dot, d_dTq);
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
CHECK_FOR_STATUS(cublasStatus);

sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
/*Calculate x=x+alpha*d*/
cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_d, 1, d_x, 1);
CHECK_FOR_STATUS(cublasStatus);

/*Calculate r=r-alpha*q*/
axpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, -1, d_q, d_rhs);
/*Assign dot_old = dot_new*/
cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot, 1);
cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_new, 1, d_dot_old, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG:Assign dot_new = r'*r*/
cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_beta, d_dot, d_dot_old);
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
CHECK_FOR_STATUS(cublasStatus);

sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
/*Scale beta*d*/
cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG:Calculate d=r+beta*d*/
cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);
k++;
}
cusparseDestroy(cusparseHandle);
cudaDeviceReset();
FREE_DEVICE_STACK
return k;
}
}
7 changes: 6 additions & 1 deletion src/cg_colver_gpu.h
@@ -1,5 +1,10 @@
/*Contains prototypes for gpu_cg_solver functions.*/

#ifndef CG_SOLVER_GPU_H
#define CG_SOLVER_GPU_H
#include "utils.h"
#include "gpu_utils.h"

int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec);
int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);

#endif
18 changes: 18 additions & 0 deletions src/ckernels.cu
@@ -1,6 +1,23 @@
/*Contains implementation of custom kernels for CUDA devices.*/

#include "ckernels.h"
#include <cublas_v2.h>

const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "unknown error";
}

__global__ void sDdiv(double *res, double *divided, double *divider) {
/*Division of scalar elements on a single CUDA thread*/
@@ -17,3 +34,4 @@ __global__ void axpy(int num_elements, double alpha, double *x, double *y) {
y[i] = y[i] + alpha * x[i];
}
}

3 changes: 3 additions & 0 deletions src/ckernels.h
@@ -1,4 +1,7 @@
/*Contains prototypes of custom kernels for CUDA devices.*/

#include <cublas_v2.h>

const char* cublasGetErrorString(cublasStatus_t status);
__global__ void sDdiv(double *res, double *divided, double *divider);
__global__ void axpy(int num_elements, double alpha, double *x, double *y);
8 changes: 7 additions & 1 deletion src/gpu_utils.cu
@@ -1,5 +1,7 @@
/*Contains implementation of gpu_utils functions and structs.*/

#include <stdio.h>

extern "C" {
#include <cuda_runtime.h>
#include "gpu_utils.h"
@@ -10,7 +12,11 @@ GPU_data *get_gpu_devices_data(){
GPU_data *gpu_data;
gpu_data = (GPU_data *)malloc(sizeof(GPU_data));
gpu_data->devices_number = 0;
cudaGetDeviceCount(&gpu_data->devices_number);
cudaError_t device_error;
device_error = cudaGetDeviceCount(&gpu_data->devices_number);
if (device_error != cudaSuccess)
printf("Error - could not read properly number of device, err=[%s] \n", cudaGetErrorString(device_error));

if (gpu_data->devices_number != 0){
gpu_data->devices = (GPU_device *)malloc(gpu_data->devices_number * sizeof(GPU_device));
for (int i = 0; i < gpu_data->devices_number; i ++){