Debugging GPU implementation
p-sto committed Nov 30, 2017
1 parent e94ff3f commit 5b56eec
Showing 12 changed files with 171 additions and 66 deletions.
8 changes: 6 additions & 2 deletions Makefile
@@ -40,11 +40,12 @@ LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS
LDFLAGS += -liomp5 -lpthread -lm -ldl

# add cuda flags
LDFLAGS += -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse
# -DMKL_ILP64 sets integers to 64-bit and has to be passed to both gcc and nvcc
CUDAFLAGS := -L$(CUDALIBPATH) -lcuda -lcudart -lcublas -lcusparse -m64 -DMKL_ILP64

$(TARGET): $(OBJECTS)
@echo " Linking..."
$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS)
$(CC) $^ -o $(TARGET) $(LFLAGS) $(LDFLAGS) $(CUDAFLAGS)

$(BUILDDIR)/%.o: $(SRCDIR)/%.$(NVSRCEXT)
$(NVCC) $(CUDAFLAGS) -c -o $@ $<
@@ -60,4 +61,7 @@ clean:
@echo " Cleaning...";
@echo " $(RM) -r $(BUILDDIR) $(TARGET)"; $(RM) -r $(BUILDDIR) $(TARGET)

cuda_info:
nvidia-smi -a

.PHONY: default all clean
66 changes: 49 additions & 17 deletions README.rst
@@ -128,39 +128,71 @@ Examples can be found in ``scripts/ConjugateGradients/demo.py``
Requires Python 3.5+


C implementation - TBA
CPU/GPU implementation
----------------------

MKL implementation
~~~~~~~~~~~~~~~~~~
Libraries and compilation
~~~~~~~~~~~~~~~~~~~~~~~~~

Required - Intel MKL library for BLAS operations. The implementation was tested on version 2017, though older versions should work as well.
By default MKL is installed in ``/opt/intel/mkl/``.

To compile:
Before compiling the code, make sure you have installed:

::

$ source mkl_setup.sh
$ make
1. Intel MKL library
2. NVIDIA CUDA toolkit with the NVCC compiler

``mkl_setup.sh`` sources MKL env configuration.
The Intel MKL library is used for BLAS operations. The implementation was tested on version 2017, though older versions should work as well.
By default MKL is installed in ``/opt/intel/mkl/``. Before compiling, make sure ``prepare_env.sh`` contains the proper
paths to the MKL and CUDA libraries.

In the Makefile, set the following variables accordingly (example values are shown below the list):

::

If you have installed MKL in a different directory, you will have to adjust the mkl_setup.sh
script to point to the proper paths.
Also note that the Makefile has an MKLROOT variable which points to the MKL installation directory.
1. MKLROOT
2. NVCC
3. CUDALIBPATH
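
For illustration only, these variables might be set like the sketch below; the exact values depend on where MKL and
CUDA are installed on your system (the paths shown match the defaults assumed by ``prepare_env.sh``):

::

MKLROOT = /opt/intel/mkl
NVCC = nvcc
CUDALIBPATH = /usr/local/cuda/lib64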

By default MKL is linked as a static library.
Since there are many dependencies it is good to set ``CFLAGS`` and ``LDFLAGS`` according to the MKL link line advisor:
By default MKL is linked as a static library; CUDA is linked dynamically.
``LDFLAGS`` sets the dependencies for MKL; please refer to the MKL link line advisor to make sure it is set properly:

https://software.intel.com/en-us/articles/intel-mkl-link-line-advisor
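
For example, a static, ILP64, Intel-OpenMP configuration suggested by the advisor looks roughly like the sketch
below (the exact library list is an assumption here; always check the advisor output for your MKL version):

::

LDFLAGS := -Wl,--start-group $(LIBRARY_DIRS)/libmkl_intel_ilp64.a $(LIBRARY_DIRS)/libmkl_intel_thread.a $(LIBRARY_DIRS)/libmkl_core.a -Wl,--end-group
LDFLAGS += -liomp5 -lpthread -lm -ldl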

``CUDAFLAGS`` is responsible for setting the CUDA libraries.

``GCC`` is used to compile .c files and ``NVCC`` to compile .cu files. The whole project is linked with ``GCC``.

To compile:

::

$ source prepare_env.sh
$ make

Use the ``make clean`` command to delete the compiled build.

CUDA implementation - TBA
-------------------------
Running ConjugateGradient
~~~~~~~~~~~~~~~~~~~~~~~~~

Running the single-core CPU MKL implementation:

``./ConjugateGradient -i input_matrix.txt``

Running the multi-core CPU MKL implementation:

``./ConjugateGradient -i input_matrix.txt -mt 4``

Running the GPU implementation (only a single device is supported):

``./ConjugateGradient -i input_matrix.txt --gpu``

::

If there are no CUDA devices available, the CPU implementation will be launched.

``input_matrix.txt`` is expected to be a CSR-formatted matrix; example matrices can be generated with the Python scripts.
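
For reference, the sketch below (assuming ``scipy`` is available) shows the three arrays that make up a CSR matrix.
It only illustrates the storage scheme; actual input files should be produced with the project's Python scripts:

::

import numpy as np
from scipy.sparse import csr_matrix

# Small symmetric positive-definite example matrix, stored densely first.
dense = np.array([[4.0, 1.0, 0.0],
                  [1.0, 3.0, 0.0],
                  [0.0, 0.0, 2.0]])

csr = csr_matrix(dense)
print(csr.data)     # non-zero values
print(csr.indices)  # column index of each non-zero value
print(csr.indptr)   # row pointers, length = number of rows + 1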



Conjugate Gradients description
-------------------------------
2 changes: 0 additions & 2 deletions mkl_setup.sh

This file was deleted.

4 changes: 4 additions & 0 deletions prepare_env.sh
@@ -0,0 +1,4 @@
source /opt/intel/mkl/bin/mklvars.sh intel64
source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
export PATH=$PATH:/usr/local/cuda/bin
106 changes: 69 additions & 37 deletions src/cg_colver_gpu.cu
@@ -1,19 +1,21 @@
/*Contains implementation for gpu_cg_solver functions.*/


#include "cg_colver_gpu.h"
#include <stdio.h>
#include "ckernels.h"


extern "C"
{
#include "gpu_utils.h"
#include "utils.h"

#include "gpu_utils.h"
#include "cg_colver_gpu.h"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
#include <cublas_v2.h>
}

#define threadsPerBlock 256
#define CHECK_FOR_STATUS(status) printf("cublas status = %s\n", cublasGetErrorString(status))

#define FREE_DEVICE_STACK \
cudaFree(d_r);\
@@ -29,21 +31,20 @@ extern "C"
cudaFree(d_beta);\
cudaFree(d_alfa);\
cudaFree(d_alpha_zero);\
cudaFree(d_dot);\
cudaFree(d_dot_new);\
cudaFree(d_norm);\
cudaFree(d_dot_zero);\
cudaFree(d_dot_old);\
cudaFree(d_dTq);


int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data gpu_data){
int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data){
/*Single GPU CG solver using cublas*/

double *h_dot, *h_dot_zero;
int *d_I = NULL, *d_J = NULL;
const double tol = 1e-2f;
double *d_alfa, *d_beta, *d_alpha_zero;
double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
double *d_Ax, *d_x, *d_d, *d_q, *d_rhs, *d_r, *d_helper, *d_norm, *d_dot_new, *d_dot_zero, *d_dot_old, *d_dTq, *d_val;
int k, max_iter;

k = 0;
@@ -52,41 +53,55 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
max_iter = 200;

size_t size = matrix->size * sizeof(double);
size_t d_size = sizeof(double);

cusparseHandle_t cusparseHandle = 0;
cublasHandle_t cublasHandle = 0;
cusparseCreate(&cusparseHandle);

cusparseMatDescr_t descr = 0;
cusparseCreateMatDescr(&descr);

cublasHandle_t cublasHandle = 0;
cublasCreate(&cublasHandle);

cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

cublasStatus_t cublasStatus;

printf("Mallocing CUDA divice memory\n");
cudaMalloc((void **)&d_r, size);
cudaMalloc((void **)&d_helper, size);
cudaMalloc((void **)&d_x, size);
cudaMalloc((void **)&d_rhs, size);
cudaMalloc((void **)&d_d, size);
cudaMalloc((void **)&d_Ax, size);
cudaMalloc((void **)&d_q, size);

cudaMalloc((void **)&d_val, matrix->non_zero * sizeof(double));
cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(double));
cudaMalloc((void **)&d_J, matrix->non_zero * sizeof(int));
cudaMalloc((void **)&d_I, (matrix->size + 1) * sizeof(int));

cudaMalloc((void **)&d_beta, sizeof(double));
cudaMalloc((void **)&d_alfa, sizeof(double));
cudaMalloc((void **)&d_alpha_zero, sizeof(double));
cudaMalloc((void **)&d_dot, sizeof(double));
cudaMalloc((void **)&d_dot_zero, sizeof(double));
cudaMalloc((void **)&d_norm, sizeof(double));

cudaMalloc((void **)&d_beta, d_size);
cudaMalloc((void **)&d_alfa, d_size);
cudaMalloc((void **)&d_alpha_zero, d_size);
cudaMalloc((void **)&d_dot_new, d_size);
cudaMalloc((void **)&d_dot_zero, d_size);
cudaMalloc((void **)&d_norm, d_size);

cudaMemset(d_beta, 0, d_size);
cudaMemset(d_alfa, 0, d_size);
cudaMemset(d_alpha_zero, 0, d_size);
cudaMemset(d_dot_new, 0, d_size);
cudaMemset(d_dot_zero, 0, d_size);
cudaMemset(d_norm, 0, d_size);

printf("Copying to device\n");
cudaMemcpy(d_val, matrix->val, matrix->non_zero * sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_J, matrix->J_row, matrix->non_zero * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_I, matrix->I_column, (matrix->size + 1) * sizeof(int), cudaMemcpyHostToDevice);

cudaMemcpy(d_x, x_vec, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_rhs, rhs, size, cudaMemcpyHostToDevice);

int blocksPerGrid = ((matrix->size + threadsPerBlock -1) / threadsPerBlock );
int blocksPerGrid = ((matrix->size + threadsPerBlock - 1) / threadsPerBlock );
while (blocksPerGrid % threadsPerBlock != 0){
blocksPerGrid++;
}
@@ -96,43 +111,60 @@ int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, do
const double one = 1.0;
const double minus_one = -1.0;
/*Calculate Ax matrix*/

cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
/*Calculate rhs=rhs-Ax matrix*/
cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &minus_one, d_Ax, 1, d_rhs, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG: Copy updated rhs (residuum) to d vector*/
cublasDcopy(cublasHandle, matrix->size, d_d, 1, d_rhs, 1);
/*CG: calculate dot r'*r, assign it to dot_new */
cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
cublasStatus = cublasDcopy(cublasHandle, matrix->size, d_rhs, 1, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG: calculate dot r'*r, assign it to d_dot_new */
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
CHECK_FOR_STATUS(cublasStatus);

/*assign dot_new to dot_zero*/
d_dot_zero = d_dot;
cudaMemcpy(h_dot, d_dot, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(d_dot_zero, d_dot_new, sizeof(double), cudaMemcpyDeviceToDevice);
cudaMemcpy(h_dot, d_dot_new, sizeof(double), cudaMemcpyDeviceToHost);
cudaMemcpy(h_dot_zero, d_dot_zero, sizeof(double), cudaMemcpyDeviceToHost);
while ((*h_dot > tol * tol * *h_dot_zero) && (k < max_iter)) {
/*Calculate q=A*d vector*/
cusparseDcsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, matrix->size, matrix->size, matrix->non_zero,
&alpha, descr, d_val, d_J, d_I, d_x, &beta, d_Ax);
/*Calculate alpha:*/
cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_alfa, d_dot, d_dTq);
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_d, 1, d_q, 1, d_dTq);
CHECK_FOR_STATUS(cublasStatus);

sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_alfa, d_dot_new, d_dTq);
/*Calculate x=x+alpha*d*/
cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_x, 1, d_d, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, d_alfa, d_d, 1, d_x, 1);
CHECK_FOR_STATUS(cublasStatus);

/*Calculate r=r-alpha*q*/
axpy<<<blocksPerGrid, threadsPerBlock>>>(matrix->size, -1, d_q, d_rhs);
/*Assign dot_old = dot_new*/
cublasDcopy(cublasHandle, 1, d_dot_old, 1, d_dot, 1);
cublasStatus = cublasDcopy(cublasHandle, 1, d_dot_new, 1, d_dot_old, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG:Assign dot_new = r'*r*/
cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot);
sDdiv<<<1, gpu_data.devices[0].warp_size>>>(d_beta, d_dot, d_dot_old);
cublasStatus = cublasDdot(cublasHandle, matrix->size, d_rhs, 1, d_rhs, 1, d_dot_new);
CHECK_FOR_STATUS(cublasStatus);

sDdiv<<<1, gpu_data->devices[0].warp_size>>>(d_beta, d_dot_new, d_dot_old);
/*Scale beta*d*/
cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
cublasStatus = cublasDscal(cublasHandle, matrix->size, d_beta, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);

/*CG:Calculate d=r+beta*d*/
cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
cublasStatus = cublasDaxpy(cublasHandle, matrix->size, &one, d_rhs, 1, d_d, 1);
CHECK_FOR_STATUS(cublasStatus);
k++;
}
cusparseDestroy(cusparseHandle);
cudaDeviceReset();
FREE_DEVICE_STACK
return k;
}
}
7 changes: 6 additions & 1 deletion src/cg_colver_gpu.h
@@ -1,5 +1,10 @@
/*Contains prototypes for gpu_cg_solver functions.*/

#ifndef CG_SOLVER_GPU_H
#define CG_SOLVER_GPU_H
#include "utils.h"
#include "gpu_utils.h"

int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec);
int gpu_conjugate_gradient_solver(Matrix *matrix, double *x_vec, double *rhs, double *res_vec, GPU_data *gpu_data);

#endif
18 changes: 18 additions & 0 deletions src/ckernels.cu
@@ -1,6 +1,23 @@
/*Contains implementation of custom kernels for CUDA devices.*/

#include "ckernels.h"
#include <cublas_v2.h>

const char* cublasGetErrorString(cublasStatus_t status)
{
switch(status)
{
case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
}
return "unknown error";
}

__global__ void sDdiv(double *res, double *divided, double *divider) {
/*Division of scalar elements on a single CUDA thread*/
@@ -17,3 +34,4 @@ __global__ void axpy(int num_elements, double alpha, double *x, double *y) {
y[i] = y[i] + alpha * x[i];
}
}

3 changes: 3 additions & 0 deletions src/ckernels.h
@@ -1,4 +1,7 @@
/*Contains prototypes of custom kernels for CUDA devices.*/

#include <cublas_v2.h>

const char* cublasGetErrorString(cublasStatus_t status);
__global__ void sDdiv(double *res, double *divided, double *divider);
__global__ void axpy(int num_elements, double alpha, double *x, double *y);
8 changes: 7 additions & 1 deletion src/gpu_utils.cu
@@ -1,5 +1,7 @@
/*Contains implementation of gpu_utils functions and structs.*/

#include <stdio.h>

extern "C" {
#include <cuda_runtime.h>
#include "gpu_utils.h"
@@ -10,7 +12,11 @@ GPU_data *get_gpu_devices_data(){
GPU_data *gpu_data;
gpu_data = (GPU_data *)malloc(sizeof(GPU_data));
gpu_data->devices_number = 0;
cudaGetDeviceCount(&gpu_data->devices_number);
cudaError_t device_error;
device_error = cudaGetDeviceCount(&gpu_data->devices_number);
if (device_error != cudaSuccess)
printf("Error - could not read properly number of device, err=[%s] \n", cudaGetErrorString(device_error));

if (gpu_data->devices_number != 0){
gpu_data->devices = (GPU_device *)malloc(gpu_data->devices_number * sizeof(GPU_device));
for (int i = 0; i < gpu_data->devices_number; i ++){