From 58b4c545d226bc11d10a96de9640fef8dcf7bdc6 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Wed, 11 Jan 2023 22:39:51 -0800 Subject: [PATCH 01/44] update --- CMakeLists.txt | 2 ++ apps/SparseMatrix/dense_matrix.cuh | 10 +++++----- apps/SparseMatrix/matrix_operation.cuh | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f90908b..a517dcd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") +set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/SparseMatrix/dense_matrix.cuh b/apps/SparseMatrix/dense_matrix.cuh index 8713eef6..228582d2 100644 --- a/apps/SparseMatrix/dense_matrix.cuh +++ b/apps/SparseMatrix/dense_matrix.cuh @@ -79,11 +79,11 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - void quick_tanspose_w_ld_trans() - { - std::swap(m_row_size, m_col_size); - m_is_row_major = !(m_is_row_major); - } + // void quick_tanspose_w_ld_trans() + // { + // std::swap(m_row_size, m_col_size); + // m_is_row_major = !(m_is_row_major); + // } bool m_is_row_major; IndexT m_row_size; diff --git a/apps/SparseMatrix/matrix_operation.cuh b/apps/SparseMatrix/matrix_operation.cuh index 2dd54cd3..4153935f 100644 --- a/apps/SparseMatrix/matrix_operation.cuh +++ b/apps/SparseMatrix/matrix_operation.cuh @@ -99,7 +99,9 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, T* B_arr, T* X_arr, rxmesh::Solver solver, - rxmesh::Reorder reorder) + rxmesh::Reorder reorder + //cudaStream_t stream = null + ) { cusolverSpHandle_t handle = NULL; cusparseHandle_t cusparseHandle = NULL; @@ -326,7 +328,7 @@ void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, cusparseCreateDnMat(&matB, B_mat.m_row_size, B_mat.m_col_size, - B_mat.lead_dim(), + B_mat.lead_dim(), // lead_dim < row_size B_mat.data(), CUDA_R_32F, CUSPARSE_ORDER_COL); From 0b16207d54e3a46637b1ebcb754108e8f90b859c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Thu, 12 Jan 2023 00:17:30 -0800 Subject: [PATCH 02/44] col major --- CMakeLists.txt | 1 + include/rxmesh/matrix/dense_matrix.cuh | 29 ++++------------------ include/rxmesh/matrix/matrix_operation.cuh | 6 ++--- 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a517dcd0..77844bf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + project(RXMesh VERSION 0.2.1 LANGUAGES C CXX CUDA) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index b48d1a7f..fba446fe 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -9,6 +9,7 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added +// only col major is supportedF template struct DenseMatrix { @@ -16,7 +17,6 @@ struct DenseMatrix : m_row_size(row_size), m_col_size(col_size) { cudaMalloc((void**)&m_d_val, bytes()); - m_is_row_major = false; } DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) @@ -38,29 +38,17 @@ struct DenseMatrix IndexT lead_dim() const { - if (m_is_row_major) { - return m_col_size; - } else { - return m_row_size; - } + return m_row_size; } __device__ T& operator()(const uint32_t row, const uint32_t col) { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + 
row]; - } + return m_d_val[col * m_row_size + row]; } __device__ T& operator()(const uint32_t row, const uint32_t col) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; - } + return m_d_val[col * m_row_size + row]; } T* data() const @@ -68,7 +56,7 @@ struct DenseMatrix return m_d_val; } - T* ld_data(const uint32_t ld_idx) const + T* col_data(const uint32_t ld_idx) const { return m_d_val + ld_idx * lead_dim(); } @@ -78,13 +66,6 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - // void quick_tanspose_w_ld_trans() - // { - // std::swap(m_row_size, m_col_size); - // m_is_row_major = !(m_is_row_major); - // } - - bool m_is_row_major; IndexT m_row_size; IndexT m_col_size; T* m_d_val; diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh index b8575992..7e70857d 100644 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ b/include/rxmesh/matrix/matrix_operation.cuh @@ -171,8 +171,8 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, A_mat.m_d_row_ptr, A_mat.m_d_col_idx, A_mat.m_d_val, - B_mat.ld_data(i), - X_mat.ld_data(i)); + B_mat.col_data(i), + X_mat.col_data(i)); } cusolverSpDestroy(handle); @@ -441,7 +441,7 @@ void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, rxmesh::DenseMatrix C_mat) { for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.ld_data(i), C_mat.ld_data(i)); + spmat_arr_mul(A_mat, B_mat.col_data(i), C_mat.col_data(i)); } } From e418fab6e193e2e09ecce27e87bcfde054d09b6f Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Thu, 12 Jan 2023 00:38:30 -0800 Subject: [PATCH 03/44] col-wise fix --- CMakeLists.txt | 3 +++ include/rxmesh/matrix/dense_matrix.cuh | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77844bf4..f126b332 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75) +endif() project(RXMesh VERSION 0.2.1 diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index fba446fe..6940b495 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -21,8 +21,7 @@ struct DenseMatrix DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) : m_row_size(row_size), - m_col_size(col_size), - m_is_row_major(is_row_major) + m_col_size(col_size) { cudaMalloc((void**)&m_d_val, bytes()); } From f3c84d6068cca929c67cb1d9685dce4b5186deec Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 3 Feb 2023 21:27:36 -0800 Subject: [PATCH 04/44] stable version --- include/rxmesh/matrix/dense_matrix.cuh | 26 +- include/rxmesh/matrix/matrix_operation.cuh | 334 ++++----------------- include/rxmesh/matrix/sparse_matrix.cuh | 238 ++++++++++++++- include/rxmesh/util/macros.h | 63 ++++ tests/RXMesh_test/test_sparse_matrix.cuh | 73 +++-- 5 files changed, 417 insertions(+), 317 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 6940b495..66b403a5 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -1,6 +1,6 @@ #pragma once #include - +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/types.h" @@ -14,16 +14,17 @@ template struct DenseMatrix { DenseMatrix(IndexT 
row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size) + : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL) { - cudaMalloc((void**)&m_d_val, bytes()); - } + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); - DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) - : m_row_size(row_size), - m_col_size(col_size) - { - cudaMalloc((void**)&m_d_val, bytes()); + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } void set_ones() @@ -65,9 +66,10 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - IndexT m_row_size; - IndexT m_col_size; - T* m_d_val; + cusparseDnMatDescr_t m_dendescr; + IndexT m_row_size; + IndexT m_col_size; + T* m_d_val; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh index 7e70857d..0ab008a7 100644 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ b/include/rxmesh/matrix/matrix_operation.cuh @@ -1,6 +1,4 @@ #pragma once -#include "cusolverSp.h" -#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" @@ -51,39 +49,6 @@ static int reorder_to_int(const Reorder& reorder) } } -/** - * @brief for transpose the dense matrix using cublas - */ -template -void denmat_transpose(rxmesh::DenseMatrix den_mat) -{ - T* d_rt_arr; - cudaMalloc(&d_rt_arr, den_mat.bytes()); - - float const alpha(1.0); - float const beta(0.0); - - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgeam(handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - den_mat.m_row_size, - den_mat.m_col_size, - &alpha, // 0 - den_mat.data(), // A_arr - den_mat.m_col_size, // ld_a - &beta, // 0 - den_mat.data(), // B_arr - den_mat.m_row_size, // ld_b - d_rt_arr, // rt_arr - den_mat.m_row_size); // ld_cm - cublasDestroy(handle); - - den_mat.m_d_val = d_rt_arr; - // TODO cont; -} - /** * @brief solve the Ax=b for x where x and b are all array */ @@ -92,31 +57,13 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, T* B_arr, T* X_arr, rxmesh::Solver solver, - rxmesh::Reorder reorder - //cudaStream_t stream = null - ) + rxmesh::Reorder reorder) { - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - + A_mat.create_cusolver_sphandle(); cusparse_linear_solver_wrapper(solver, reorder, - handle, - cusparseHandle, - descrA, + A_mat.m_cusolver_sphandles, + A_mat.m_descr, A_mat.m_row_size, A_mat.m_col_size, A_mat.m_nnz, @@ -125,11 +72,6 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, A_mat.m_d_val, B_arr, X_arr); - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); } /** @@ -143,28 +85,12 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, rxmesh::Solver solver, rxmesh::Reorder reorder) { - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, 
stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - + // A_mat.create_cusolver_sphandle(); for (int i = 0; i < B_mat.m_col_size; ++i) { cusparse_linear_solver_wrapper(solver, reorder, - handle, - cusparseHandle, - descrA, + A_mat.m_cusolver_sphandle, + A_mat.m_descr, A_mat.m_row_size, A_mat.m_col_size, A_mat.m_nnz, @@ -174,11 +100,6 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, B_mat.col_data(i), X_mat.col_data(i)); } - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); } /** @@ -189,7 +110,6 @@ template void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, const rxmesh::Reorder reorder, cusolverSpHandle_t handle, - cusparseHandle_t cusparseHandle, cusparseMatDescr_t descrA, int rowsA, int colsA, @@ -213,64 +133,64 @@ void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, /* solve B*z = Q*b */ if (solver == Solver::CHOL) { if constexpr (std::is_same_v) { - cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } if constexpr (std::is_same_v) { - cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } } else if (solver == Solver::QR) { if constexpr (std::is_same_v) { - cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } if constexpr (std::is_same_v) { - cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } } else { RXMESH_ERROR( @@ -286,150 +206,6 @@ void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, } } -/** - * @brief wrap up the cusparse api for sparse matrix dense matrix - * multiplication. 
- */ -template -void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - float alpha = 1.0f; - float beta = 0.0f; - - cusparseHandle_t handle = NULL; - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; - void* dBuffer = NULL; - size_t bufferSize = 0; - - cusparseCreate(&handle); - // Create sparse matrix A in CSR format - cusparseCreateCsr(&matA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - // Create dense matrix B - cusparseCreateDnMat(&matB, - B_mat.m_row_size, - B_mat.m_col_size, - B_mat.lead_dim(), // lead_dim < row_size - B_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // Create dense matrix C - cusparseCreateDnMat(&matC, - C_mat.m_row_size, - C_mat.m_col_size, - C_mat.lead_dim(), - C_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // allocate an external buffer if needed - cusparseSpMM_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&dBuffer, bufferSize); - - // execute SpMM - cusparseSpMM(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - dBuffer); - - // destroy matrix/vector descriptors - cusparseDestroySpMat(matA); - cusparseDestroyDnMat(matB); - cusparseDestroyDnMat(matC); - cusparseDestroy(handle); -} - - -/** - * @brief wrap up the cusparse api for sparse matrix array - * multiplication. - */ -// only works for float -template -void spmat_arr_mul(rxmesh::SparseMatrix sp_mat, T* in_arr, T* rt_arr) -{ - const float minus_one = -1.0f; - const float one = 1.0f; - const float zero = 0.0f; - - cusparseHandle_t handle = NULL; - void* buffer = NULL; - size_t bufferSize = 0; - cusparseSpMatDescr_t sp_mat_des = NULL; - cusparseDnVecDescr_t vecx = NULL; - cusparseDnVecDescr_t vecy = NULL; - - cusparseCreate(&handle); - cusparseCreateCsr(&sp_mat_des, - sp_mat.m_row_size, - sp_mat.m_col_size, - sp_mat.m_nnz, - sp_mat.m_d_row_ptr, - sp_mat.m_d_col_idx, - sp_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - cusparseCreateDnVec(&vecx, sp_mat.m_col_size, in_arr, CUDA_R_32F); - cusparseCreateDnVec(&vecy, sp_mat.m_row_size, rt_arr, CUDA_R_32F); - - cusparseSpMV_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&buffer, bufferSize); - - cusparseSpMV(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - buffer); - - cusparseDestroySpMat(sp_mat_des); - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecy); - cusparseDestroy(handle); -} /** * @brief do the sparse matrix dense matrix multiplication using sparse matrix diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 97133fbe..2f6239cb 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -1,4 +1,6 @@ #pragma once +#include "cusolverSp.h" +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/query.cuh" @@ -70,7 +72,11 @@ struct SparseMatrix m_row_size(0), 
m_col_size(0), m_nnz(0), - m_context(rx.get_context()) + m_context(rx.get_context()), + m_cusparse_handle(NULL), + m_descr(NULL), + m_spdescr(NULL), + m_spmm_buffer_size(0) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -138,6 +144,27 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + + CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); + CUSPARSE_ERROR( + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_ERROR( + cusparseSetMatIndexBase(m_descr, CUSPARSE_INDEX_BASE_ZERO)); + + CUSPARSE_ERROR(cusparseCreateCsr(&m_spdescr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); } void set_ones() @@ -186,20 +213,215 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } + __device__ T& direct_access(IndexT x, IndexT y) + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + void free() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); CUDA_ERROR(cudaFree(m_d_val)); + CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); + CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); + CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + } + + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication buffer size calculation. + */ + void denmat_mul_buffer_size(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + cusparseSetStream(m_cusparse_handle, stream); + + // allocate an external buffer if needed + CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + &m_spmm_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication. 
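* Computes C_mat = A * B_mat through cusparseSpMM with alpha = 1 and
+     * beta = 0; the scratch buffer is allocated and freed on every call.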
+ */ + void denmat_mul(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + // A_mat.create_cusparse_handle(); + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + cusparseSetStream(m_cusparse_handle, stream); + + // allocate an external buffer if needed + if (m_spmm_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Dense matrix multiplication buffer size not " + "initialized.", + "Calculate it now."); + denmat_mul_buffer_size(B_mat, C_mat, stream); + } + CUDA_ERROR(cudaMalloc(&dBuffer, m_spmm_buffer_size)); + + // execute SpMM + CUSPARSE_ERROR(cusparseSpMM(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + dBuffer)); + + CUDA_ERROR(cudaFree(dBuffer)); } - const Context m_context; - IndexT* m_d_row_ptr; - IndexT* m_d_col_idx; - T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; + // void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + // { + // const float alpha = 1.0f; + // const float beta = 0.0f; + + // cusparseDnVecDescr_t vecx = NULL; + // cusparseDnVecDescr_t vecy = NULL; + + // printf("check\n"); + + // CUSPARSE_ERROR( + // cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + // CUSPARSE_ERROR( + // cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + // printf("check\n"); + + // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + // CUSPARSE_OPERATION_NON_TRANSPOSE, + // &alpha, + // m_spdescr, + // vecx, + // &beta, + // vecy, + // CUDA_R_32F, + // CUSPARSE_SPMV_ALG_DEFAULT, + // &m_spmv_buffer_size)); + // printf("check\n"); + // } + + /** + * @brief wrap up the cusparse api for sparse matrix array + * multiplication. 
+ */ + void arr_mul(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + printf("check\n"); + + size_t m_spmv_buffer_size = 0; + + printf("check\n"); + + void* buffer = NULL; + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + printf("check\n"); + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + cusparseSetStream(m_cusparse_handle, stream); + + // if (m_spmv_buffer_size == 0) { + // RXMESH_WARN( + // "Sparse matrix - Array multiplication buffer size not " + // "initialized.", + // "Calculate it now."); + // arr_mul_buffer_size(in_arr, rt_arr, stream); + // } + + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); + + CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + CUSPARSE_ERROR(cusparseDestroyDnVec(vecx)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecy)); + CUDA_ERROR(cudaFree(buffer)); + } + + const Context m_context; + cusparseHandle_t m_cusparse_handle; + cusolverSpHandle_t m_cusolver_sphandle; + cusparseSpMatDescr_t m_spdescr; + cusparseMatDescr_t m_descr; + + size_t m_spmm_buffer_size; + + IndexT* m_d_row_ptr; + IndexT* m_d_col_idx; + T* m_d_val; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; }; } // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h index 964bd974..fd630bf9 100644 --- a/include/rxmesh/util/macros.h +++ b/include/rxmesh/util/macros.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include "rxmesh/util/log.h" @@ -49,6 +51,67 @@ inline void HandleError(cudaError_t err, const char* file, int line) } #define CUDA_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +inline void cusparseHandleError(cusparseStatus_t status, + const char* file, + const int line) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSPARSE ERROR: {}", + cusparseGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSPARSE_ERROR(err) (cusparseHandleError(err, __FILE__, __LINE__)) + + +static inline void cusolverHandleError(cusolverStatus_t status, + const char* file, + const int line) +{ + if (status != CUSOLVER_STATUS_SUCCESS) { + auto cusolverGetErrorString = [](cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "UNKNOWN_ERROR"; + } + }; + + 
Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSOLVER ERROR: {}", + cusolverGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSOLVER_ERROR(err) (cusolverHandleError(err, __FILE__, __LINE__)) + + // GPU_FREE #define GPU_FREE(ptr) \ if (ptr != nullptr) { \ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 1fc4014c..e71621bd 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -72,9 +72,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; //dist(v_coord, vi_coord); - arr_ref[row_index] += dist(v_coord, vi_coord); + arr_ref[row_index] += 1; //dist(v_coord, vi_coord); } }; @@ -137,12 +137,13 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + A_mat(v_id, iter[v]) = time_step * e_weight; sum_e_weight += e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + + iter.size() * iter.size() + 1000000; }; auto block = cooperative_groups::this_thread_block(); @@ -151,6 +152,20 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +template +__global__ static void print_diag(rxmesh::SparseMatrix A_mat, int num_vet) +{ + int t_idx = threadIdx.x + blockDim.x * blockIdx.x; + if (t_idx == 0) { + + for (int i = 0; i < num_vet; ++i) { + if (A_mat.direct_access(i, i) < 1) { + printf("A_mat diag %d: %f \n", i, A_mat.direct_access(i, i)); + } + } + } +} + TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -292,9 +307,13 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) float* d_arr_ref; float* d_result; + printf("check1\n"); + CUDA_ERROR(cudaMalloc((void**)&d_arr_ref, (num_vertices) * sizeof(float))); CUDA_ERROR(cudaMalloc((void**)&d_result, (num_vertices) * sizeof(float))); + printf("check2\n"); + LaunchBox launch_box; rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_edge_len_test); @@ -304,11 +323,12 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix multiply + printf("check3\n"); - // spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); - spmat_arr_mul(spmat, d_arr_ones, d_result); + // Spmat matrix array multiply + spmat_multi_hardwired_kernel + <<>>(d_arr_ones, spmat, d_result, num_vertices); + // spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -341,7 +361,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; + std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; // STRINGIFY(INPUT_DIR) "dragon.obj"; + // "/home/ericycc/data/120628.obj"; RXMeshStatic rxmesh(obj_path); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -366,21 +387,37 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + print_diag<<<1, 1>>>(A_mat, 
num_vertices); + spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); - spmat_denmat_mul(A_mat, X_mat, ret_mat); - // spmat_denmat_mul_cw(A_mat, X_mat, ret_mat); + // timing begins for spmm + GPUTimer timer; + timer.start(); + + A_mat.denmat_mul(X_mat, ret_mat); + + timer.stop(); + RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); - std::vector h_ret_mat(num_vertices * 3); + std::vector h_ret_mat(num_vertices); cudaMemcpy(h_ret_mat.data(), ret_mat.data(), - num_vertices * 3, + num_vertices * 3 * sizeof(float), cudaMemcpyDeviceToHost); - std::vector h_B_mat(num_vertices * 3); - cudaMemcpy( - h_B_mat.data(), B_mat.data(), num_vertices * 3, cudaMemcpyDeviceToHost); + std::vector h_B_mat(num_vertices); + cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost); + + // auto ps_mesh = rxmesh.get_polyscope_mesh(); + // ps_mesh->addVertexColorQuantity("ret_mat", h_ret_mat); + // ps_mesh->addVertexColorQuantity("B_mat", h_B_mat); - for (uint32_t i = 0; i < num_vertices * 3; ++i) { - EXPECT_NEAR(h_ret_mat[i], h_B_mat[i], 1e-3); + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } } } \ No newline at end of file From 2574ca8b5b8d608f9b355f6a0d8e79c90278d749 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 3 Feb 2023 23:49:18 -0800 Subject: [PATCH 05/44] merge spmv --- include/rxmesh/matrix/sparse_matrix.cuh | 94 +++++++++++------------- tests/RXMesh_test/test_sparse_matrix.cuh | 13 +--- 2 files changed, 46 insertions(+), 61 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 2f6239cb..f8a32b80 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -76,7 +76,8 @@ struct SparseMatrix m_cusparse_handle(NULL), m_descr(NULL), m_spdescr(NULL), - m_spmm_buffer_size(0) + m_spmm_buffer_size(0), + m_spmv_buffer_size(0) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -314,35 +315,30 @@ struct SparseMatrix CUDA_ERROR(cudaFree(dBuffer)); } - // void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) - // { - // const float alpha = 1.0f; - // const float beta = 0.0f; - - // cusparseDnVecDescr_t vecx = NULL; - // cusparseDnVecDescr_t vecy = NULL; - - // printf("check\n"); + void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; - // CUSPARSE_ERROR( - // cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); - // CUSPARSE_ERROR( - // cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; - // printf("check\n"); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); - // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - // CUSPARSE_OPERATION_NON_TRANSPOSE, - // &alpha, - // m_spdescr, - // vecx, - // &beta, - // vecy, - // CUDA_R_32F, - // CUSPARSE_SPMV_ALG_DEFAULT, - // &m_spmv_buffer_size)); - // printf("check\n"); - // } + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + } /** * @brief wrap up the cusparse api for sparse matrix array @@ 
-353,18 +349,10 @@ struct SparseMatrix const float alpha = 1.0f; const float beta = 0.0f; - printf("check\n"); - - size_t m_spmv_buffer_size = 0; - - printf("check\n"); - void* buffer = NULL; cusparseDnVecDescr_t vecx = NULL; cusparseDnVecDescr_t vecy = NULL; - printf("check\n"); - CUSPARSE_ERROR( cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); CUSPARSE_ERROR( @@ -372,24 +360,25 @@ struct SparseMatrix cusparseSetStream(m_cusparse_handle, stream); - // if (m_spmv_buffer_size == 0) { - // RXMESH_WARN( - // "Sparse matrix - Array multiplication buffer size not " - // "initialized.", - // "Calculate it now."); - // arr_mul_buffer_size(in_arr, rt_arr, stream); - // } + if (m_spmv_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Array multiplication buffer size not " + "initialized." + "Calculate it now."); + arr_mul_buffer_size(in_arr, rt_arr, stream); + } + + // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + // CUSPARSE_OPERATION_NON_TRANSPOSE, + // &alpha, + // m_spdescr, + // vecx, + // &beta, + // vecy, + // CUDA_R_32F, + // CUSPARSE_SPMV_ALG_DEFAULT, + // &m_spmv_buffer_size)); - CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - m_spdescr, - vecx, - &beta, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &m_spmv_buffer_size)); CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, @@ -415,6 +404,7 @@ struct SparseMatrix cusparseMatDescr_t m_descr; size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index e71621bd..3f4c4a09 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -307,13 +307,9 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) float* d_arr_ref; float* d_result; - printf("check1\n"); - CUDA_ERROR(cudaMalloc((void**)&d_arr_ref, (num_vertices) * sizeof(float))); CUDA_ERROR(cudaMalloc((void**)&d_result, (num_vertices) * sizeof(float))); - printf("check2\n"); - LaunchBox launch_box; rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_edge_len_test); @@ -323,12 +319,10 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - printf("check3\n"); - // Spmat matrix array multiply - spmat_multi_hardwired_kernel - <<>>(d_arr_ones, spmat, d_result, num_vertices); - // spmat.arr_mul(d_arr_ones, d_result); + // spmat_multi_hardwired_kernel + // <<>>(d_arr_ones, spmat, d_result, num_vertices); + spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -420,4 +414,5 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } + A_mat.free(); } \ No newline at end of file From 19c5cbc1486c56be81d46f42d2846c55ba9c8848 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 10 Feb 2023 14:32:36 -0800 Subject: [PATCH 06/44] debug version --- CMakeLists.txt | 1 - apps/MCF/mcf.cu | 1 + apps/MCF/mcf_sparse_matrix.cuh | 7 +- include/rxmesh/matrix/matrix_operation.cuh | 224 --------------------- include/rxmesh/matrix/sparse_matrix.cuh | 220 +++++++++++++++++++- tests/RXMesh_test/test_sparse_matrix.cuh | 21 +- 6 files changed, 224 insertions(+), 250 deletions(-) delete mode 100644 include/rxmesh/matrix/matrix_operation.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 
f126b332..e10a467d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 02c5355f..49ceb94d 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -28,6 +28,7 @@ struct arg #include "mcf_openmesh.h" #include "mcf_rxmesh.h" +#include "mcf_sparse_matrix.cuh" TEST(App, MCF) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index daf1ed4f..2193b965 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -43,5 +43,10 @@ __global__ static void mcf_A_B_setup( A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; }; - query_block_dispatcher(context, init_lambda); + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, compute_valence); } + +void mcf_rxmesh_solver() {} \ No newline at end of file diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh deleted file mode 100644 index 0ab008a7..00000000 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ /dev/null @@ -1,224 +0,0 @@ -#pragma once - -#include "rxmesh/attribute.h" -#include "rxmesh/matrix/dense_matrix.cuh" -#include "rxmesh/matrix/sparse_matrix.cuh" -#include "rxmesh/rxmesh_static.h" - -namespace rxmesh { - -/** - * @brief The enum class for choosing different solver types - */ -enum class Solver -{ - CHOL = 0, - LU = 1, - QR = 2 -}; - -/** - * @brief The enum class for choosing different reorder types - * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee - * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based - * on Quotient Graph, NSTDIS for Nested Dissection - */ -enum class Reorder -{ - NONE = 0, - SYMRCM = 1, - SYMAMD = 2, - NSTDIS = 3 -}; - -static int reorder_to_int(const Reorder& reorder) -{ - switch (reorder) { - case Reorder::NONE: - return 0; - case Reorder::SYMRCM: - return 1; - case Reorder::SYMAMD: - return 2; - case Reorder::NSTDIS: - return 3; - default: { - RXMESH_ERROR("reorder_to_int() unknown input reorder"); - return 0; - } - } -} - -/** - * @brief solve the Ax=b for x where x and b are all array - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - T* B_arr, - T* X_arr, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - A_mat.create_cusolver_sphandle(); - cusparse_linear_solver_wrapper(solver, - reorder, - A_mat.m_cusolver_sphandles, - A_mat.m_descr, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_arr, - X_arr); -} - -/** - * @brief solve the AX=B for X where X and B are all dense matrix and we would - * solve it in a column wise manner - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - // A_mat.create_cusolver_sphandle(); - for (int i = 0; i < B_mat.m_col_size; ++i) { - cusparse_linear_solver_wrapper(solver, - reorder, - A_mat.m_cusolver_sphandle, - A_mat.m_descr, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_mat.col_data(i), - X_mat.col_data(i)); - } -} - -/** - * @brief wrap up the cusolver api for solving linear systems. 
This is a lower - * level api - */ -template -void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, - const rxmesh::Reorder reorder, - cusolverSpHandle_t handle, - cusparseMatDescr_t descrA, - int rowsA, - int colsA, - int nnzA, - int* d_csrRowPtrA, - int* d_csrColIndA, - T* d_csrValA, - T* d_b, - T* d_x) -{ - if constexpr ((!std::is_same_v)&&(!std::is_same_v)) { - RXMESH_ERROR( - "Unsupported type for cusparse: {}" - "Only float and double are supported", - typeid(T).name()); - } - - double tol = 1.e-12; - int singularity = 0; /* -1 if A is invertible under tol. */ - - /* solve B*z = Q*b */ - if (solver == Solver::CHOL) { - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - } else if (solver == Solver::QR) { - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - } else { - RXMESH_ERROR( - "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x for " - "Solver::LU"); - } - cudaDeviceSynchronize(); - - if (0 <= singularity) { - RXMESH_WARN("WARNING: the matrix is singular at row {} under tol ({})", - singularity, - tol); - } -} - - -/** - * @brief do the sparse matrix dense matrix multiplication using sparse matrix - * array multiplication in a column wise way - */ -template -void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.col_data(i), C_mat.col_data(i)); - } -} - -} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index f8a32b80..3ab55334 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,8 +6,52 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "rxmesh/matrix/dense_matrix.cuh" + namespace rxmesh { +/** + * @brief The enum class for choosing different solver types + */ +enum class Solver +{ + CHOL = 0, + LU = 1, + QR = 2 +}; + +/** + * @brief The enum class for choosing different reorder types + * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee + * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based + * on Quotient Graph, NSTDIS for Nested Dissection + */ +enum class Reorder +{ + NONE = 0, + SYMRCM = 1, + SYMAMD = 2, + NSTDIS = 3 +}; + +static int reorder_to_int(const Reorder& reorder) +{ + switch (reorder) { + case Reorder::NONE: + return 0; + case Reorder::SYMRCM: + return 1; + case Reorder::SYMAMD: + return 2; + case Reorder::NSTDIS: + return 3; + default: { + RXMESH_ERROR("reorder_to_int() unknown input reorder"); + return 0; + } + } +} + namespace detail { // this is the function for the CSR calculation template @@ -368,17 +412,6 @@ struct 
SparseMatrix arr_mul_buffer_size(in_arr, rt_arr, stream); } - // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - // CUSPARSE_OPERATION_NON_TRANSPOSE, - // &alpha, - // m_spdescr, - // vecx, - // &beta, - // vecy, - // CUDA_R_32F, - // CUSPARSE_SPMV_ALG_DEFAULT, - // &m_spmv_buffer_size)); - CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, @@ -397,6 +430,171 @@ struct SparseMatrix CUDA_ERROR(cudaFree(buffer)); } + /** + * @brief do the sparse matrix dense matrix multiplication using sparse + * matrix array multiplication in a column wise way + */ + void spmat_denmat_mul_cw(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + arr_mul(B_mat.col_data(i), C_mat.col_data(i)); + } + } + + /** + * @brief solve the Ax=b for x where x and b are all array + */ + void spmat_linear_solve(T* B_arr, + T* X_arr, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_arr, + X_arr); + } + + /** + * @brief solve the AX=B for X where X and B are all dense matrix and we + * would solve it in a column wise manner + */ + void spmat_linear_solve(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix X_mat, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_mat.col_data(i), + X_mat.col_data(i)); + } + } + + /** + * @brief wrap up the cusolver api for solving linear systems. This is a + * lower level api + */ + void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, + const rxmesh::Reorder reorder, + cusolverSpHandle_t handle, + cusparseMatDescr_t descrA, + int rowsA, + int colsA, + int nnzA, + int* d_csrRowPtrA, + int* d_csrColIndA, + T* d_csrValA, + T* d_b, + T* d_x) + { + if constexpr ((!std::is_same_v)&&( + !std::is_same_v)) { + RXMESH_ERROR( + "Unsupported type for cusparse: {}" + "Only float and double are supported", + typeid(T).name()); + } + + double tol = 1.e-12; + int singularity = 0; /* -1 if A is invertible under tol. 
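Otherwise cusolverSp reports back the first row index at which A fails the tolerance check.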
*/ + + /* solve B*z = Q*b */ + if (solver == Solver::CHOL) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + } else if (solver == Solver::QR) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + } else { + RXMESH_ERROR( + "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x " + "for " + "Solver::LU"); + } + cudaDeviceSynchronize(); + + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + const Context m_context; cusparseHandle_t m_cusparse_handle; cusolverSpHandle_t m_cusolver_sphandle; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 3f4c4a09..f29ff86a 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -1,7 +1,8 @@ #include "gtest/gtest.h" #include "rxmesh/attribute.h" -#include "rxmesh/matrix/matrix_operation.cuh" +#include "rxmesh/matrix/dense_matrix.cuh" +#include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/query.cuh" #include "rxmesh/rxmesh_static.h" @@ -11,7 +12,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, IndexT* vet_degree) { using namespace rxmesh; - auto compute_valence = [&](VertexHandle& v_id, const VertexIterator& iter) { auto ids = v_id.unpack(); uint32_t patch_id = ids.first; @@ -72,9 +72,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = 1; //dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; // dist(v_coord, vi_coord); - arr_ref[row_index] += 1; //dist(v_coord, vi_coord); + arr_ref[row_index] += 1; // dist(v_coord, vi_coord); } }; @@ -283,8 +283,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); + RXMeshStatic rxmesh(rxmesh_args.obj_file_name, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -319,9 +318,6 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix array multiply - // spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host @@ -355,9 +351,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; // STRINGIFY(INPUT_DIR) "dragon.obj"; - // "/home/ericycc/data/120628.obj"; - RXMeshStatic rxmesh(obj_path); + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); uint32_t 
num_vertices = rxmesh.get_num_vertices(); @@ -383,7 +378,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) print_diag<<<1, 1>>>(A_mat, num_vertices); - spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); // timing begins for spmm GPUTimer timer; From 8de4288263e275e04332c717648627196eb0579c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 11 Feb 2023 13:00:57 -0800 Subject: [PATCH 07/44] init --- apps/MCF/mcf.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 49ceb94d..cdedf01c 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -54,6 +54,9 @@ TEST(App, MCF) // RXMesh Impl mcf_rxmesh(rxmesh, ground_truth); + + // RXMesh cusolver Impl + mcf_rxmesh_solver(rxmesh, ground_truth); } int main(int argc, char** argv) From c6abf7431760f9de9ea88970f5a448505d5ba292 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 11 Feb 2023 13:02:48 -0800 Subject: [PATCH 08/44] add error checker --- include/rxmesh/matrix/sparse_matrix.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 3ab55334..8123df8b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -298,7 +298,7 @@ struct SparseMatrix cusparseDnMatDescr_t matC = C_mat.m_dendescr; void* dBuffer = NULL; - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); // allocate an external buffer if needed CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, @@ -331,7 +331,7 @@ struct SparseMatrix cusparseDnMatDescr_t matC = C_mat.m_dendescr; void* dBuffer = NULL; - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); // allocate an external buffer if needed if (m_spmm_buffer_size == 0) { @@ -402,7 +402,7 @@ struct SparseMatrix CUSPARSE_ERROR( cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); if (m_spmv_buffer_size == 0) { RXMESH_WARN( From e4a1137bf639d478ffce9e286c703466060825e7 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 12:48:13 -0800 Subject: [PATCH 09/44] uniform done --- CMakeLists.txt | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 78 +++++++++++++++++++++--- include/rxmesh/matrix/dense_matrix.cuh | 14 +++-- tests/RXMesh_test/test_sparse_matrix.cuh | 2 +- 4 files changed, 80 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..c5fffbd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2193b965..3b948228 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -6,10 +6,11 @@ template -__global__ static void mcf_A_B_setup( +__global__ static void mcf_A_X_B_setup( const rxmesh::Context context, rxmesh::VertexAttribute coords, // for non-uniform rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, rxmesh::DenseMatrix B_mat, const bool use_uniform_laplace, // for non-uniform const T time_step) @@ 
-25,28 +26,89 @@ __global__ static void mcf_A_B_setup( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = A_mat.m_d_patch_ptr_v[r_patch_id] + r_local_id; + uint32_t row_index = + A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + // set up B matrix B_mat(row_index, 0) = coords(v_id, 0) * v_weight; B_mat(row_index, 1) = coords(v_id, 1) * v_weight; B_mat(row_index, 2) = coords(v_id, 2) * v_weight; + // set up initial X matrix + X_mat(row_index, 0) = coords(v_id, 0); + X_mat(row_index, 1) = coords(v_id, 1); + X_mat(row_index, 2) = coords(v_id, 2); + Vector<3, float> vi_coord( coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; - + T e_weight = 1; + e_weight *= time_step; sum_e_weight += e_weight; + + A_mat(v_id, iter[v]) = -e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + sum_e_weight; }; auto block = cooperative_groups::this_thread_block(); Query query(context); ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, compute_valence); + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); } -void mcf_rxmesh_solver() {} \ No newline at end of file +template +void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) +{ + using namespace rxmesh; + constexpr uint32_t blockThreads = 256; + + uint32_t num_vertices = rxmesh.get_num_vertices(); + auto coords = rxmesh.get_input_vertex_coordinates(); + + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); + + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + + mcf_A_X_B_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + B_mat, + true, + Arg.time_step); + + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + + + std::vector h_X_mat(num_vertices); + cudaMemcpy(h_X_mat.data(), + X_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost); + + const T tol = 0.1f; + // for (uint32_t i = 0; i < num_vertices; ++i) { + // for (uint32_t j = 0; j < 3; ++j) { + // EXPECT_NEAR(h_X_mat[i][j], ground_truth[i][j], tol); + // } + // } +} \ No newline at end of file diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 66b403a5..889a201f 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -19,12 +19,12 @@ struct DenseMatrix CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, - m_row_size, - m_col_size, - m_row_size, // leading dim - m_d_val, - CUDA_R_32F, - CUSPARSE_ORDER_COL)); + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } void set_ones() @@ -66,6 +66,8 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } + // TODO: something like attribute->move() + cusparseDnMatDescr_t m_dendescr; IndexT m_row_size; IndexT m_col_size; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index f29ff86a..b2acce4b 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -377,7 +377,7 @@ TEST(RXMeshStatic, 
SparseMatrixSimpleSolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); print_diag<<<1, 1>>>(A_mat, num_vertices); - + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); // timing begins for spmm From e4f57e281d97a07205d3bf4b861b56d2469b4c9c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 15:19:02 -0800 Subject: [PATCH 10/44] singularity check works --- CMakeLists.txt | 2 +- tests/RXMesh_test/test_sparse_matrix.cuh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..c5fffbd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index f29ff86a..4b203583 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -144,6 +144,10 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + iter.size() * iter.size() + 1000000; + + if (row_index % 10 == 0) { + A_mat(v_id, v_id) = 0; + } }; auto block = cooperative_groups::this_thread_block(); @@ -378,7 +382,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) print_diag<<<1, 1>>>(A_mat, num_vertices); - A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // timing begins for spmm GPUTimer timer; @@ -400,10 +404,6 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) num_vertices * 3 * sizeof(float), cudaMemcpyDeviceToHost); - // auto ps_mesh = rxmesh.get_polyscope_mesh(); - // ps_mesh->addVertexColorQuantity("ret_mat", h_ret_mat); - // ps_mesh->addVertexColorQuantity("B_mat", h_B_mat); - for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); From a6a7d844f6696080bada0b6021980487455c2b30 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 15:32:26 -0800 Subject: [PATCH 11/44] finish warpping up sparse matrix code --- CMakeLists.txt | 1 - tests/RXMesh_test/test_sparse_matrix.cuh | 36 +++++++++--------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5fffbd6..ad56a02e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 4b203583..00053aaa 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -125,9 +125,9 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint32_t row_index = A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = iter.size() * 7; - B_mat(row_index, 1) = iter.size() * 2; - B_mat(row_index, 2) = iter.size() * 10; + B_mat(row_index, 0) = iter.size() * 7.4f; + B_mat(row_index, 1) = iter.size() * 2.6f; + B_mat(row_index, 2) = iter.size() * 10.3f; X_mat(row_index, 0) = coords(v_id, 0) * v_weight; X_mat(row_index, 1) = coords(v_id, 1) * v_weight; @@ 
-144,10 +144,6 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + iter.size() * iter.size() + 1000000; - - if (row_index % 10 == 0) { - A_mat(v_id, v_id) = 0; - } }; auto block = cooperative_groups::this_thread_block(); @@ -156,20 +152,8 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } -template -__global__ static void print_diag(rxmesh::SparseMatrix A_mat, int num_vet) -{ - int t_idx = threadIdx.x + blockDim.x * blockIdx.x; - if (t_idx == 0) { - - for (int i = 0; i < num_vet; ++i) { - if (A_mat.direct_access(i, i) < 1) { - printf("A_mat diag %d: %f \n", i, A_mat.direct_access(i, i)); - } - } - } -} +/* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -216,6 +200,7 @@ TEST(RXMeshStatic, SparseMatrix) rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_test); + // test kernel sparse_mat_test <<); + // test kernel sparse_mat_query_test <<>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - print_diag<<<1, 1>>>(A_mat, num_vertices); - A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // timing begins for spmm From a4bcaa0a089aca4c76e5a6250846a48089c08252 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 16:00:29 -0800 Subject: [PATCH 12/44] fix mcf build --- CMakeLists.txt | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad56a02e..9fcf5984 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2193b965..dae89726 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -46,7 +46,7 @@ __global__ static void mcf_A_B_setup( auto block = cooperative_groups::this_thread_block(); Query query(context); ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, compute_valence); + query.dispatch(block, shrd_alloc, init_lambda); } void mcf_rxmesh_solver() {} \ No newline at end of file From c501804914605c1e996b5aa7ee96ee1d80ea2f26 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Tue, 21 Feb 2023 14:01:52 -0800 Subject: [PATCH 13/44] test --- apps/MCF/mcf_sparse_matrix.cuh | 90 ++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 3b948228..cfb02411 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -63,6 +63,36 @@ __global__ static void mcf_A_X_B_setup( !use_uniform_laplace); } +template +__global__ static void update_smooth_result(const rxmesh::Context context, + rxmesh::VertexAttribute smooth_X, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat) +{ + using namespace rxmesh; + auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto r_ids = v_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = + A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + + // printf("check: %f\n", X_mat(row_index, 0)); + + 
smooth_X(v_id, 0) = X_mat(row_index, 0); + smooth_X(v_id, 1) = X_mat(row_index, 1); + smooth_X(v_id, 2) = X_mat(row_index, 2); + + // printf("s_check: %f\n", smooth_X(v_id, 0)); + }; + + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, init_lambda); +} + template void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) @@ -77,14 +107,14 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); - printf("use_uniform_laplace: %d, time_step: %f\n", Arg.use_uniform_laplace, Arg.time_step); + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); + mcf_A_X_B_setup <<("smooth_X", 3, rxmesh::LOCATION_ALL); + + LaunchBox launch_box_smooth; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_smooth, + (void*)update_smooth_result); + + update_smooth_result + <<>>( + rxmesh.get_context(), *smooth_X, A_mat, X_mat); + + smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + + const T tol = 0.001; + T tmp_tol = tol; + bool passed = true; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + + for (uint32_t i = 0; i < 3; ++i) { + tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]); + if (tmp_tol > tol) { + printf("val: %f, truth: %f, tol: %f\n", + (*smooth_X)(vh, i), + ground_truth[v_id][i], + tmp_tol); + } + + if (std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]) > tol) { + passed = false; + break; + } + } + }); - std::vector h_X_mat(num_vertices); - cudaMemcpy(h_X_mat.data(), - X_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); - - const T tol = 0.1f; - // for (uint32_t i = 0; i < num_vertices; ++i) { - // for (uint32_t j = 0; j < 3; ++j) { - // EXPECT_NEAR(h_X_mat[i][j], ground_truth[i][j], tol); - // } - // } + EXPECT_TRUE(passed); } \ No newline at end of file From eeb1f989e1ac1e7bbd1c4ca1c298aaec51e7f9cc Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Feb 2023 14:22:24 -0800 Subject: [PATCH 14/44] delete off --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..572c32e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() @@ -165,4 +164,4 @@ add_subdirectory("include") include(GoogleTest) add_subdirectory(apps) -add_subdirectory(tests) \ No newline at end of file +add_subdirectory(tests) From 6bcd5a6c193a7c5b0073eb878db014112cd82919 Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Feb 2023 14:45:25 -0800 Subject: [PATCH 15/44] visual test --- CMakeLists.txt | 1 - apps/MCF/mcf_sparse_matrix.cuh | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5fffbd6..ad56a02e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index cfb02411..4cf3f2d7 100644 --- 
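
The row index used throughout these kernels (m_vertex_prefix[patch] + local id) is the same linearization that later patches in this series read back directly on the host via rxmesh.linear_id(); a sketch of that host-side equivalent (as patch 25 ends up doing):

    X_mat.move(rxmesh::DEVICE, rxmesh::HOST);
    rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) {
        uint32_t row = rxmesh.linear_id(vh);  // == m_vertex_prefix[p] + local
        float    x   = X_mat(row, 0);         // host read after the move
    });

which removes the need for a copy-back kernel like update_smooth_result above.
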
a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -130,6 +130,8 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, auto smooth_X = rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); + auto truth_X = + rxmesh.add_vertex_attribute("truth_X", 3, rxmesh::LOCATION_ALL); LaunchBox launch_box_smooth; rxmesh.prepare_launch_box({Op::VV}, @@ -143,6 +145,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + truth_X->move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; T tmp_tol = tol; @@ -151,6 +154,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, uint32_t v_id = rxmesh.map_to_global(vh); for (uint32_t i = 0; i < 3; ++i) { + (*truth_X)(vh, i) = ground_truth[v_id][i]; tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); if (tmp_tol > tol) { @@ -168,5 +172,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, } }); + auto ps_mesh = rxmesh.get_polyscope_mesh(); + ps_mesh->addVertexColorQuantity("smooth_x", *smooth_X); + ps_mesh->addVertexColorQuantity("smooth_om", *truth_X); + polyscope::show(); + EXPECT_TRUE(passed); } \ No newline at end of file From 36c1d4cb865a46c25769a23538b9cecc1a07f2b7 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 25 Feb 2023 00:47:04 -0800 Subject: [PATCH 16/44] uniform pass --- apps/MCF/mcf_rxmesh.h | 1 + apps/MCF/mcf_sparse_matrix.cuh | 179 ++++++++++++++++++++++++++------- 2 files changed, 142 insertions(+), 38 deletions(-) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index a58a62df..f17a1591 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -227,6 +227,7 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, // output to obj // rxmesh.export_obj("mcf_rxmesh.obj", *X); + rxmesh.export_obj("mcf_rxmesh_cg.obj", *X); // Verify const T tol = 0.001; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index cfb02411..08ceb8aa 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -1,55 +1,138 @@ #pragma once +#include "mcf_util.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" #include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/rxmesh_static.h" +template +__global__ static void mcf_B_setup(const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::DenseMatrix B_mat, + const bool use_uniform_laplace) +{ + using namespace rxmesh; + + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { + auto r_ids = p_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; + + if (use_uniform_laplace) { + const T valence = static_cast(iter.size()); + B_mat(row_index, 0) = coords(p_id, 0) * valence; + B_mat(row_index, 1) = coords(p_id, 1) * valence; + B_mat(row_index, 2) = coords(p_id, 2) * valence; + } else { + T v_weight = 0; + + // this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); + + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; + + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + + v_weight += (tri_area > 0) ? 
tri_area : 0.0; + + q_id = r_id; + } + v_weight = 0.5 / v_weight; + + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; + } + }; + + // With uniform Laplacian, we just need the valence, thus we + // call query and set oriented to false + auto block = cooperative_groups::this_thread_block(); + + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); +} template -__global__ static void mcf_A_X_B_setup( - const rxmesh::Context context, - rxmesh::VertexAttribute coords, // for non-uniform - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::DenseMatrix B_mat, - const bool use_uniform_laplace, // for non-uniform - const T time_step) +__global__ static void mcf_A_X_setup( + const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, + const bool use_uniform_laplace, // for non-uniform + const T time_step) { using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { T sum_e_weight(0); + T v_weight(0); - T v_weight = iter.size(); + VertexHandle q_id = iter.back(); // reference value calculation - auto r_ids = v_id.unpack(); + auto r_ids = p_id.unpack(); uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; uint32_t row_index = A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; - // set up B matrix - B_mat(row_index, 0) = coords(v_id, 0) * v_weight; - B_mat(row_index, 1) = coords(v_id, 1) * v_weight; - B_mat(row_index, 2) = coords(v_id, 2) * v_weight; - // set up initial X matrix - X_mat(row_index, 0) = coords(v_id, 0); - X_mat(row_index, 1) = coords(v_id, 1); - X_mat(row_index, 2) = coords(v_id, 2); + X_mat(row_index, 0) = coords(p_id, 0); + X_mat(row_index, 1) = coords(p_id, 1); + X_mat(row_index, 2) = coords(p_id, 2); - Vector<3, float> vi_coord( - coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); + // set up matrix A for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; + VertexHandle r_id = iter[v]; + + T e_weight = 0; + if (use_uniform_laplace) { + e_weight = 1; + } else { + VertexHandle s_id = + (v == iter.size() - 1) ? iter[0] : iter[v + 1]; + + e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords); + e_weight = (static_cast(e_weight >= 0.0)) * e_weight; + } + e_weight *= time_step; sum_e_weight += e_weight; - A_mat(v_id, iter[v]) = -e_weight; + A_mat(p_id, iter[v]) = -e_weight; + + // compute vertex weight + if (use_uniform_laplace) { + ++v_weight; + } else { + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + v_weight += (tri_area > 0) ? 
tri_area : 0; + q_id = r_id; + } + } + + // Diagonal entry + if (use_uniform_laplace) { + v_weight = 1.0 / v_weight; + } else { + v_weight = 0.5 / v_weight; } - A_mat(v_id, v_id) = v_weight + sum_e_weight; + assert(!isnan(v_weight)); + assert(!isinf(v_weight)); + + A_mat(p_id, p_id) = (1.0 / v_weight) + sum_e_weight; }; auto block = cooperative_groups::this_thread_block(); @@ -107,25 +190,40 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); + // B set up + LaunchBox launch_box_B; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box_B, (void*)mcf_B_setup); + + mcf_B_setup<<>>( + rxmesh.get_context(), *coords, B_mat, Arg.use_uniform_laplace); + printf("use_uniform_laplace: %d, time_step: %f\n", Arg.use_uniform_laplace, Arg.time_step); - LaunchBox launch_box; + // A and X set up + LaunchBox launch_box_A_X; rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); - - mcf_A_X_B_setup - <<>>(rxmesh.get_context(), - *coords, - A_mat, - X_mat, - B_mat, - true, - Arg.time_step); + {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup); + + mcf_A_X_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + Arg.use_uniform_laplace, + Arg.time_step); + + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + // Solving the linear system A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); auto smooth_X = @@ -141,9 +239,14 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, launch_box_smooth.num_threads, launch_box_smooth.smem_bytes_dyn>>>( rxmesh.get_context(), *smooth_X, A_mat, X_mat); - smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + + rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); + const T tol = 0.001; T tmp_tol = tol; bool passed = true; From 34d39fd4ddcae1149fb8b4322c140f447f45322e Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sun, 26 Feb 2023 00:33:04 -0800 Subject: [PATCH 17/44] CUSOLVER_STATUS_INTERNAL_ERROR by Invalid __shared__ write at mcf_sparse_matrix.cuh:58 --- CMakeLists.txt | 2 ++ apps/MCF/mcf_sparse_matrix.cuh | 13 ------------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad56a02e..8e548767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") +set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2a0c0a39..b169f467 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -219,10 +219,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, Arg.use_uniform_laplace, Arg.time_step); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - // Solving the linear system A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); @@ -244,10 +240,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); truth_X->move(rxmesh::DEVICE, rxmesh::HOST); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); const T tol = 0.001; @@ -275,10 +267,5 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, } }); - 
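
For the non-uniform branch, the assembled system is the cotan (Laplace-Beltrami) version of the same implicit step, (M + h L_c) x = M x^0, i.e. per vertex p with one-ring N(p):

    A_pp = m_p + h * sum_{r in N(p)} w_pr,    A_pr = -h * w_pr,    b_p = m_p * x_p^0

where w_pr = max(cot(alpha_pr) + cot(beta_pr), 0) over the two angles opposite edge (p, r) (up to the constant factor inside edge_cotan_weight), and the mass m_p comes from the accumulated partial Voronoi areas: the code stores v_weight = 0.5 / area and uses 1 / v_weight as m_p. The clamp implemented by the `e_weight >= 0.0` cast guards against negative cotan weights on obtuse triangles.
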
auto ps_mesh = rxmesh.get_polyscope_mesh(); - ps_mesh->addVertexColorQuantity("smooth_x", *smooth_X); - ps_mesh->addVertexColorQuantity("smooth_om", *truth_X); - polyscope::show(); - EXPECT_TRUE(passed); } \ No newline at end of file From eb01a05d4cb718af050f24f19a22eb67c4b7b32c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sun, 26 Feb 2023 11:29:57 -0800 Subject: [PATCH 18/44] resolve changes --- CMakeLists.txt | 5 +- include/rxmesh/matrix/sparse_matrix.cuh | 34 +++++++++- tests/RXMesh_test/test_sparse_matrix.cuh | 85 +++--------------------- 3 files changed, 43 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fcf5984..ee091248 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,5 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 75) -endif() - project(RXMesh VERSION 0.2.1 LANGUAGES C CXX CUDA) @@ -11,6 +7,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 8123df8b..0b4c1958 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -258,7 +258,7 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } - __device__ T& direct_access(IndexT x, IndexT y) + __device__ T& operator()(const IndexT x, const IndexT y) { const IndexT start = m_d_row_ptr[x]; const IndexT end = m_d_row_ptr[x + 1]; @@ -271,6 +271,35 @@ struct SparseMatrix assert(1 != 1); } + __device__ T& operator()(const IndexT x, const IndexT y) const + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __host__ __device__ IndexT& get_nnz() const { + return m_nnz; + } + + __device__ IndexT get_row_ptr_at(IndexT idx) const { + return m_d_row_ptr[idx]; + } + + __device__ IndexT get_col_idx_at(IndexT idx) const { + return m_d_col_idx[idx]; + } + + __device__ IndexT get_val_at(IndexT idx) const { + return m_d_val[idx]; + } + void free() { CUDA_ERROR(cudaFree(m_d_row_ptr)); @@ -585,7 +614,7 @@ struct SparseMatrix "for " "Solver::LU"); } - cudaDeviceSynchronize(); + CUDA_ERROR(cudaDeviceSynchronize()); if (0 <= singularity) { RXMESH_WARN( @@ -595,6 +624,7 @@ struct SparseMatrix } } + private: const Context m_context; cusparseHandle_t m_cusparse_handle; cusolverSpHandle_t m_cusolver_sphandle; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 00053aaa..169fc498 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -26,26 +26,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, query.dispatch(block, shrd_alloc, compute_valence); } -template -__global__ static void sparse_mat_query_test( - const rxmesh::Context context, - rxmesh::SparseMatrix sparse_mat) -{ - using namespace rxmesh; - auto fillin = [&](VertexHandle& v_id, const VertexIterator& iter) { - sparse_mat(v_id, v_id) = 2; - for (uint32_t v = 0; v < iter.size(); ++v) { - sparse_mat(v_id, iter[v]) = 2; - sparse_mat(iter[v], v_id) = 2; - } - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, 
shrd_alloc, fillin); -} - template __global__ static void sparse_mat_edge_len_test( const rxmesh::Context context, @@ -62,7 +42,7 @@ __global__ static void sparse_mat_edge_len_test( uint16_t r_local_id = r_ids.second; uint32_t row_index = - sparse_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -93,11 +73,11 @@ __global__ void spmat_multi_hardwired_kernel(T* vec, int tid = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0; if (tid < N) { - uint32_t start = sparse_mat.m_d_row_ptr[tid]; - uint32_t end = sparse_mat.m_d_row_ptr[tid + 1]; + uint32_t start = sparse_mat.get_row_ptr_at(tid); + uint32_t end = sparse_mat.get_row_ptr_at(tid + 1); for (int i = 0; i < end - start; i++) { - sum += vec[sparse_mat.m_d_col_idx[start + i]] * - sparse_mat.m_d_val[start + i]; + sum += vec[sparse_mat.get_col_idx_at(start + i)] * + sparse_mat.get_val_at(start + i); } out[tid] = sum; } @@ -123,7 +103,7 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint16_t r_local_id = r_ids.second; uint32_t row_index = - A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + context.m_vertex_prefix[r_patch_id] + r_local_id; B_mat(row_index, 0) = iter.size() * 7.4f; B_mat(row_index, 1) = iter.size() * 2.6f; @@ -221,51 +201,6 @@ TEST(RXMeshStatic, SparseMatrix) spmat.free(); } -/* check if the operator '()' works normally */ -TEST(RXMeshStatic, SparseMatrixQuery) -{ - using namespace rxmesh; - - // Select device - cuda_query(0); - - // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); - - uint32_t num_vertices = rxmesh.get_num_vertices(); - - const uint32_t threads = 256; - const uint32_t blocks = DIVIDE_UP(num_vertices, threads); - - SparseMatrix spmat(rxmesh); - spmat.set_ones(); - - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)sparse_mat_query_test); - - // test kernel - sparse_mat_query_test - <<>>(rxmesh.get_context(), spmat); - - std::vector h_result(spmat.m_nnz); - CUDA_ERROR(cudaMemcpy(h_result.data(), - spmat.m_d_val, - spmat.m_nnz * sizeof(int), - cudaMemcpyDeviceToHost)); - - std::vector h_ref(spmat.m_nnz, 2); - - for (int i = 0; i < spmat.m_nnz; ++i) { - EXPECT_EQ(h_result[i], h_ref[i]); - } - - spmat.free(); -} - /* First replace the sparse matrix entry with the edge length and then do spmv * with an all one array and check the result */ @@ -386,15 +321,15 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); std::vector h_ret_mat(num_vertices); - cudaMemcpy(h_ret_mat.data(), + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), ret_mat.data(), num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); std::vector h_B_mat(num_vertices); - cudaMemcpy(h_B_mat.data(), + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), B_mat.data(), num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { From 8b2f36159a315f666a7ee66ecb619b4104162bb4 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Feb 2023 17:55:55 -0800 Subject: [PATCH 19/44] test mem err --- apps/MCF/mcf.cu | 4 +-- apps/MCF/mcf_rxmesh.h | 2 ++ apps/MCF/mcf_sparse_matrix.cuh | 42 +++++++++++++------------- include/rxmesh/matrix/dense_matrix.cuh | 1 + 4 files changed, 26 insertions(+), 23 deletions(-) diff 
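
As a concrete reference for the accessor-based SpMV kernel above, a small worked CSR example (values chosen arbitrarily):

        [4 0 1]     row_ptr = [0, 2, 3, 5]
    A = [0 2 0]     col_idx = [0, 2, 1, 0, 2]
        [1 0 3]     val     = [4, 1, 2, 1, 3]

Row i's nonzeros live in val[row_ptr[i] .. row_ptr[i + 1] - 1], which is exactly the start/end window the kernel walks; e.g. y[1] = val[2] * x[col_idx[2]] = 2 * x[1].
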
--git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index cdedf01c..bfa73bdc 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); + mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); + mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index f17a1591..fa3e589d 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -119,6 +119,8 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, launch_box_init_B.smem_bytes_dyn>>>( rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace); + CUDA_ERROR(cudaDeviceSynchronize()); + // CG scalars T alpha(0), beta(0), delta_new(0), delta_old(0); diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index bda1a88f..f34dc8a5 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -26,30 +26,30 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, B_mat(row_index, 1) = coords(p_id, 1) * valence; B_mat(row_index, 2) = coords(p_id, 2) * valence; } else { - T v_weight = 0; + // T v_weight = 0; // this is the last vertex in the one-ring (before r_id) - VertexHandle q_id = iter.back(); + // VertexHandle q_id = iter.back(); - for (uint32_t v = 0; v < iter.size(); ++v) { - // the current one ring vertex - VertexHandle r_id = iter[v]; + // for (uint32_t v = 0; v < iter.size(); ++v) { + // // the current one ring vertex + // VertexHandle r_id = iter[v]; - T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + // T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); - v_weight += (tri_area > 0) ? tri_area : 0.0; + // v_weight += (tri_area > 0) ? 
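
A note on the debugging trail running from patch 17 ("Invalid __shared__ write at mcf_sparse_matrix.cuh:58") through patch 20: reports with that file/line precision typically come from NVIDIA's compute-sanitizer, e.g. (binary path assumed for this repo's test target):

    compute-sanitizer --tool memcheck ./bin/RXMesh_test --gtest_filter=App.MCF

and the CUDA_ERROR(cudaDeviceSynchronize()) calls added after kernel launches here serve a complementary purpose: they force asynchronous launch failures to surface at the offending launch rather than at some later, unrelated API call.
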
tri_area : 0.0; - q_id = r_id; - } - v_weight = 0.5 / v_weight; + // q_id = r_id; + // } + // v_weight = 0.5 / v_weight; - B_mat(row_index, 0) = coords(p_id, 0) / v_weight; - B_mat(row_index, 1) = coords(p_id, 1) / v_weight; - B_mat(row_index, 2) = coords(p_id, 2) / v_weight; - printf("check: %f, %f, %f\n", - B_mat(row_index, 0), - B_mat(row_index, 1), - B_mat(row_index, 2)); + B_mat(row_index, 0) = coords(p_id, 0); // / v_weight; + B_mat(row_index, 1) = coords(p_id, 1); // / v_weight; + B_mat(row_index, 2) = coords(p_id, 2); // / v_weight; + // printf("check: %f, %f, %f\n", + // B_mat(row_index, 0), + // B_mat(row_index, 1), + // B_mat(row_index, 2)); } }; @@ -190,6 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + // B set up LaunchBox launch_box_B; rxmesh.prepare_launch_box( @@ -202,10 +206,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, CUDA_ERROR(cudaDeviceSynchronize()); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - // A and X set up LaunchBox launch_box_A_X; rxmesh.prepare_launch_box( diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 889a201f..a290e56c 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -43,6 +43,7 @@ struct DenseMatrix __device__ T& operator()(const uint32_t row, const uint32_t col) { + //assert(col < m_col_size); return m_d_val[col * m_row_size + row]; } From fcfb4855bfb74f970a22c5d29634c944bdf548ba Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Feb 2023 19:40:09 -0800 Subject: [PATCH 20/44] fix illegal memory access --- apps/MCF/mcf_rxmesh.h | 5 ++--- apps/MCF/mcf_sparse_matrix.cuh | 38 ++++++++++++++-------------------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index fa3e589d..362416c6 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -106,11 +106,11 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_init_B, (void*)init_B, - true); + !Arg.use_uniform_laplace); rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_matvec, (void*)rxmesh_matvec, - true); + !Arg.use_uniform_laplace); // init kernel to initialize RHS (B) @@ -229,7 +229,6 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, // output to obj // rxmesh.export_obj("mcf_rxmesh.obj", *X); - rxmesh.export_obj("mcf_rxmesh_cg.obj", *X); // Verify const T tol = 0.001; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index f34dc8a5..101e3cbe 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -26,30 +26,26 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, B_mat(row_index, 1) = coords(p_id, 1) * valence; B_mat(row_index, 2) = coords(p_id, 2) * valence; } else { - // T v_weight = 0; + T v_weight = 0; - // this is the last vertex in the one-ring (before r_id) - // VertexHandle q_id = iter.back(); + //this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); - // for (uint32_t v = 0; v < iter.size(); ++v) { - // // the current one ring vertex - // VertexHandle r_id = iter[v]; + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; - // T 
tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); - // v_weight += (tri_area > 0) ? tri_area : 0.0; + v_weight += (tri_area > 0) ? tri_area : 0.0; - // q_id = r_id; - // } - // v_weight = 0.5 / v_weight; + q_id = r_id; + } + v_weight = 0.5 / v_weight; - B_mat(row_index, 0) = coords(p_id, 0); // / v_weight; - B_mat(row_index, 1) = coords(p_id, 1); // / v_weight; - B_mat(row_index, 2) = coords(p_id, 2); // / v_weight; - // printf("check: %f, %f, %f\n", - // B_mat(row_index, 0), - // B_mat(row_index, 1), - // B_mat(row_index, 2)); + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; } }; @@ -163,8 +159,6 @@ __global__ static void update_smooth_result(const rxmesh::Context context, uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - // printf("check: %f\n", X_mat(row_index, 0)); - smooth_X(v_id, 0) = X_mat(row_index, 0); smooth_X(v_id, 1) = X_mat(row_index, 1); smooth_X(v_id, 2) = X_mat(row_index, 2); @@ -197,7 +191,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup); + {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup); + {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Mon, 27 Feb 2023 19:52:19 -0800 Subject: [PATCH 21/44] rename the function and format the code --- apps/MCF/mcf.cu | 4 ++-- apps/MCF/mcf_rxmesh.h | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 20 ++++++++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index bfa73bdc..4ac988c0 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg + mcf_rxmesh_cg(rxmesh, ground_truth); // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index 362416c6..41ca5fab 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 101e3cbe..1a4484a2 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -28,7 +28,7 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, } else { T v_weight = 0; - //this is the last vertex in the one-ring (before r_id) + // this is the last vertex in the one-ring (before r_id) VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) { @@ -171,8 +171,8 @@ __global__ static void update_smooth_result(const rxmesh::Context context, } template -void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, - const std::vector>& ground_truth) +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) { using namespace rxmesh; constexpr uint32_t blockThreads = 
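
The substantive fix in this patch is the oriented flag: partial_voronoi_area and edge_cotan_weight consume the one-ring in order (iter.back(), iter[v + 1]), which requires an oriented VV query, and prepare_launch_box sizes the kernel's dynamic shared memory differently when orientation is requested. A plausible reading of the earlier invalid __shared__ write is a mismatch between the two call sites; the pairing that matters (names as used elsewhere in this series, dispatch template argument assumed):

    rxmesh.prepare_launch_box({Op::VV},
                              launch_box,
                              (void*)kernel,
                              !Arg.use_uniform_laplace);  // oriented only for cotan

    // ...inside the kernel...
    query.dispatch<Op::VV>(block,
                           shrd_alloc,
                           lambda,
                           [](VertexHandle) { return true; },
                           !use_uniform_laplace);  // must agree with the launch box
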
256; @@ -190,8 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Mon, 27 Feb 2023 19:52:19 -0800 Subject: [PATCH 22/44] rename the function and format the code --- apps/MCF/mcf.cu | 4 ++-- apps/MCF/mcf_rxmesh.h | 4 +--- apps/MCF/mcf_sparse_matrix.cuh | 20 ++++++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index bfa73bdc..4ac988c0 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg + mcf_rxmesh_cg(rxmesh, ground_truth); // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index 362416c6..bb2c4ab4 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; @@ -119,8 +119,6 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, launch_box_init_B.smem_bytes_dyn>>>( rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace); - CUDA_ERROR(cudaDeviceSynchronize()); - // CG scalars T alpha(0), beta(0), delta_new(0), delta_old(0); diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 101e3cbe..1a4484a2 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -28,7 +28,7 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, } else { T v_weight = 0; - //this is the last vertex in the one-ring (before r_id) + // this is the last vertex in the one-ring (before r_id) VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) { @@ -171,8 +171,8 @@ __global__ static void update_smooth_result(const rxmesh::Context context, } template -void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, - const std::vector>& ground_truth) +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -190,8 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Fri, 3 Mar 2023 13:55:55 -0800 Subject: [PATCH 23/44] remove test code --- CMakeLists.txt | 2 -- apps/MCF/mcf_sparse_matrix.cuh 
| 13 +++---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d78ad40f..d54b4340 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") - if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 1a4484a2..d2ed2752 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -184,7 +184,7 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); - printf("use_uniform_laplace: %d, time_step: %f\n", + RXMESH_INFO("use_uniform_laplace: {}, time_step: {}", Arg.use_uniform_laplace, Arg.time_step); @@ -224,8 +224,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, auto smooth_X = rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); - auto truth_X = - rxmesh.add_vertex_attribute("truth_X", 3, rxmesh::LOCATION_ALL); LaunchBox launch_box_smooth; rxmesh.prepare_launch_box({Op::VV}, @@ -238,7 +236,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, launch_box_smooth.smem_bytes_dyn>>>( rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - truth_X->move(rxmesh::DEVICE, rxmesh::HOST); rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); @@ -249,18 +246,14 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, uint32_t v_id = rxmesh.map_to_global(vh); for (uint32_t i = 0; i < 3; ++i) { - (*truth_X)(vh, i) = ground_truth[v_id][i]; tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); + if (tmp_tol > tol) { - printf("val: %f, truth: %f, tol: %f\n", + RXMESH_WARN("val: {}, truth: {}, tol: {}\n", (*smooth_X)(vh, i), ground_truth[v_id][i], tmp_tol); - } - - if (std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / - ground_truth[v_id][i]) > tol) { passed = false; break; } From 2597f187c614882f62da4b7cd5de38e5c0757534 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:33:27 -0800 Subject: [PATCH 24/44] dense matrix host-device --- apps/MCF/mcf_sparse_matrix.cuh | 8 +- include/rxmesh/matrix/dense_matrix.cuh | 116 +++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 8 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index d2ed2752..f9244cfb 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -237,13 +237,19 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); + X_mat.move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; T tmp_tol = tol; bool passed = true; rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { uint32_t v_id = rxmesh.map_to_global(vh); + uint32_t v_linear_id = rxmesh.linear_id(vh); + + T a = X_mat(v_linear_id, 0); + T b = (*smooth_X)(vh, 0); + + EXPECT_EQ(X_mat(v_linear_id, 0), (*smooth_X)(vh, 0)); for (uint32_t i = 0; i < 3; ++i) { tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index a290e56c..fb70b322 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -9,12 
+9,15 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added -// only col major is supportedF +// only col major is supported template struct DenseMatrix { DenseMatrix(IndexT row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL) + : m_row_size(row_size), + m_col_size(col_size), + m_dendescr(NULL), + m_allocated(LOCATION_NONE) { CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); @@ -41,15 +44,29 @@ struct DenseMatrix return m_row_size; } - __device__ T& operator()(const uint32_t row, const uint32_t col) + __host__ __device__ T& operator()(const uint32_t row, const uint32_t col) { - //assert(col < m_col_size); + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ return m_d_val[col * m_row_size + row]; +#else + return m_h_val[col * m_row_size + row]; +#endif } - __device__ T& operator()(const uint32_t row, const uint32_t col) const + __host__ __device__ T& operator()(const uint32_t row, + const uint32_t col) const { + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ return m_d_val[col * m_row_size + row]; +#else + return m_h_val[col * m_row_size + row]; +#endif } T* data() const @@ -57,9 +74,18 @@ struct DenseMatrix return m_d_val; } - T* col_data(const uint32_t ld_idx) const + T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { - return m_d_val + ld_idx * lead_dim(); + if ((location & HOST) == HOST) { + return m_h_val + ld_idx * lead_dim(); + } + + if ((location & DEVICE) == DEVICE) { + return m_d_val + ld_idx * lead_dim(); + } + + assert(1 != 1); + return 0; } IndexT bytes() const @@ -67,12 +93,88 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + + printf("aaaaaaaaaaaaaaaaaaa\n"); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync( + m_d_val, m_h_val, bytes(), cudaMemcpyHostToDevice, stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync( + m_h_val, m_d_val, bytes(), cudaMemcpyDeviceToHost, stream)); + } + } + + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + m_allocated = m_allocated & (~DEVICE); + } + } + + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(bytes())); + + printf("aaaaaaaaaaaaaaaaaaa\n"); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + m_allocated = m_allocated | 
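
With move(), allocate(), and release() in place, the intended round trip is (a minimal sketch; the constructor already allocates the device side):

    DenseMatrix<float> X_mat(num_vertices, 3);  // device buffer, col-major
    // ... kernels fill X_mat on the device ...
    X_mat.move(rxmesh::DEVICE, rxmesh::HOST);   // allocates the host copy lazily
    float x = X_mat(0, 0);                      // operator() reads m_h_val on host

move() deliberately warns and allocates rather than failing when the target side has not been allocated yet, which is what makes the one-liner above safe.
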
DEVICE; + } + } + // TODO: something like attribute->move() cusparseDnMatDescr_t m_dendescr; + locationT m_allocated; IndexT m_row_size; IndexT m_col_size; T* m_d_val; + T* m_h_val; }; } // namespace rxmesh From 80a7fafae45cb2c8fb8d6a966dbc3f40fa8db4fe Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:41:50 -0800 Subject: [PATCH 25/44] update test and detete smooth attribute --- apps/MCF/mcf_sparse_matrix.cuh | 47 ++------------------------ include/rxmesh/matrix/dense_matrix.cuh | 4 --- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index f9244cfb..4dbae51a 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -145,31 +145,6 @@ __global__ static void mcf_A_X_setup( !use_uniform_laplace); } -template -__global__ static void update_smooth_result(const rxmesh::Context context, - rxmesh::VertexAttribute smooth_X, - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix X_mat) -{ - using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { - auto r_ids = v_id.unpack(); - uint32_t r_patch_id = r_ids.first; - uint16_t r_local_id = r_ids.second; - - uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - - smooth_X(v_id, 0) = X_mat(row_index, 0); - smooth_X(v_id, 1) = X_mat(row_index, 1); - smooth_X(v_id, 2) = X_mat(row_index, 2); - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, init_lambda); -} - template void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) @@ -222,21 +197,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, // Solving the linear system using chol factorization and no reordering A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); - auto smooth_X = - rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); - - LaunchBox launch_box_smooth; - rxmesh.prepare_launch_box({Op::VV}, - launch_box_smooth, - (void*)update_smooth_result); - - update_smooth_result - <<>>( - rxmesh.get_context(), *smooth_X, A_mat, X_mat); - smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - X_mat.move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; @@ -247,17 +207,14 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, uint32_t v_linear_id = rxmesh.linear_id(vh); T a = X_mat(v_linear_id, 0); - T b = (*smooth_X)(vh, 0); - - EXPECT_EQ(X_mat(v_linear_id, 0), (*smooth_X)(vh, 0)); for (uint32_t i = 0; i < 3; ++i) { - tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + tmp_tol = std::abs((X_mat(v_linear_id, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); if (tmp_tol > tol) { RXMESH_WARN("val: {}, truth: {}, tol: {}\n", - (*smooth_X)(vh, i), + X_mat(v_linear_id, i), ground_truth[v_id][i], tmp_tol); passed = false; diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index fb70b322..1ae1c59a 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -117,8 +117,6 @@ struct DenseMatrix RXMESH_WARN( "DenseMatrix::move() allocating target before moving to {}", location_to_string(target)); - - printf("aaaaaaaaaaaaaaaaaaa\n"); allocate(target); } @@ -153,8 +151,6 @@ struct DenseMatrix m_h_val = static_cast(malloc(bytes())); - printf("aaaaaaaaaaaaaaaaaaa\n"); - m_allocated = m_allocated | HOST; } From 
d8096b157c6d82e60856c132d8cca0bd43fbcd6c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:50:29 -0800 Subject: [PATCH 26/44] minor --- include/rxmesh/matrix/dense_matrix.cuh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 1ae1c59a..c3317be4 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -71,7 +71,16 @@ struct DenseMatrix T* data() const { - return m_d_val; + if ((location & HOST) == HOST) { + return m_h_val; + } + + if ((location & DEVICE) == DEVICE) { + return m_d_val; + } + + assert(1 != 1); + return 0; } T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const From 122e2475ba2f253a6e192a09ea5cc41ead87ef27 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 03:13:20 -0800 Subject: [PATCH 27/44] fix error --- include/rxmesh/matrix/dense_matrix.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index c3317be4..f264d426 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -69,7 +69,7 @@ struct DenseMatrix #endif } - T* data() const + T* data(locationT location = DEVICE) const { if ((location & HOST) == HOST) { return m_h_val; From 2051e6d60a4ee32aa8ab438b9c4ffbd27f798562 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 20 Mar 2023 23:44:31 -0700 Subject: [PATCH 28/44] low level init --- include/rxmesh/matrix/sparse_matrix.cuh | 249 +++++++++++++++++++++++- 1 file changed, 245 insertions(+), 4 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index e185fec7..b11f9f18 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,6 +6,7 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "cusolverSp_LOWLEVEL_PREVIEW.h" #include "rxmesh/matrix/dense_matrix.cuh" namespace rxmesh { @@ -284,19 +285,23 @@ struct SparseMatrix assert(1 != 1); } - __host__ __device__ IndexT& get_nnz() const { + __host__ __device__ IndexT& get_nnz() const + { return m_nnz; } - __device__ IndexT& get_row_ptr_at(IndexT idx) const { + __device__ IndexT& get_row_ptr_at(IndexT idx) const + { return m_d_row_ptr[idx]; } - __device__ IndexT& get_col_idx_at(IndexT idx) const { + __device__ IndexT& get_col_idx_at(IndexT idx) const + { return m_d_col_idx[idx]; } - __device__ T& get_val_at(IndexT idx) const { + __device__ T& get_val_at(IndexT idx) const + { return m_d_val[idx]; } @@ -310,6 +315,7 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); } + /* ----- CUSPARSE SPMM & SPMV ----- */ /** * @brief wrap up the cusparse api for sparse matrix dense matrix @@ -471,6 +477,10 @@ struct SparseMatrix } } + /* ----- SOLVER ----- */ + + /* --- HIGH LEVEL API --- */ + /** * @brief solve the Ax=b for x where x and b are all array */ @@ -624,6 +634,225 @@ struct SparseMatrix } } + /* --- LOW LEVEL API --- */ + + void spmat_chol_analysis() + { + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); + m_internalDataInBytes = 0; + m_workspaceInBytes = 0; + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_row_ptr, + m_d_col_idx, + m_chol_info)); + } + + void spmat_chol_buffer_alloc() + { + if constexpr 
(std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes)); + } + + void spmat_chol_buffer_free() + { + CUDA_ERROR(cudaFree(m_chol_buffer)); + } + + void spmat_chol_factor() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + m_chol_buffer)); + } + + double tol = 1.0e-8; + int singularity; + + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + void spmat_chol_solve(T* d_b, T* d_x) + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_b, + d_x, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_b, + d_x, + m_chol_info, + m_chol_buffer)); + } + } + + /* Host compatibility */ + + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync(m_d_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, + m_h_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync(m_h_val, + m_d_val, + m_nnz * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, + m_d_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, + m_d_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + } + } + + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_h_row_ptr = nullptr; + m_h_col_idx = 
nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + GPU_FREE(m_h_row_ptr); + GPU_FREE(m_h_col_idx); + m_allocated = m_allocated & (~DEVICE); + } + } + + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = static_cast(malloc(m_row_size * sizeof(IndexT))); + m_d_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + CUDA_ERROR( + cudaMalloc((void**)&m_h_row_ptr, m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | DEVICE; + } + } + + private: const Context m_context; cusparseHandle_t m_cusparse_handle; @@ -640,6 +869,18 @@ struct SparseMatrix IndexT m_row_size; IndexT m_col_size; IndexT m_nnz; + + // host data parameters + locationT m_allocated; + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // lower level API parameters + csrcholInfo_t m_chol_info; + size_t m_internalDataInBytes; + size_t m_workspaceInBytes; + void* m_chol_buffer; }; } // namespace rxmesh \ No newline at end of file From d7b9c33d6392a0ded0520ed94d768a4f905bf4aa Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Mar 2023 17:08:05 -0700 Subject: [PATCH 29/44] host-device compatibility --- include/rxmesh/matrix/sparse_matrix.cuh | 27 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index b11f9f18..0877822b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -860,27 +860,34 @@ struct SparseMatrix cusparseSpMatDescr_t m_spdescr; cusparseMatDescr_t m_descr; - size_t m_spmm_buffer_size; - size_t m_spmv_buffer_size; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; - // host data parameters - locationT m_allocated; - IndexT* m_h_row_ptr; - IndexT* m_h_col_idx; - T* m_h_val; + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; // lower level API parameters csrcholInfo_t m_chol_info; size_t m_internalDataInBytes; size_t m_workspaceInBytes; void* m_chol_buffer; + + // purmutation + IndexT* m_h_permute; + IndexT* m_d_permute; + + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file From daac342461932a8c68b0d2ed1259a60fe633bac1 Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Mar 2023 17:08:05 -0700 Subject: [PATCH 30/44] host-device compatibility --- include/rxmesh/matrix/sparse_matrix.cuh | 34 +++++++++++++++++-------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index b11f9f18..8f05fbd7 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -860,27 +860,41 @@ struct SparseMatrix cusparseSpMatDescr_t m_spdescr; cusparseMatDescr_t m_descr; - size_t m_spmm_buffer_size; - size_t m_spmv_buffer_size; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; 
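
Taken together, the low-level Cholesky members above are meant to be called in a fixed order, with one triangular solve per right-hand-side column (a sketch using the DenseMatrix col_data() accessor, which returns device pointers by default):

    A_mat.spmat_chol_analysis();      // symbolic analysis of the CSR pattern
    A_mat.spmat_chol_buffer_alloc();  // query workspace size and allocate it
    A_mat.spmat_chol_factor();        // numeric factorization + zero-pivot check
    for (int i = 0; i < 3; ++i)       // B_mat/X_mat hold the x, y, z columns
        A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
    A_mat.spmat_chol_buffer_free();

This is the payoff of the low-level path over spmat_linear_solve(): the analysis and factorization happen once and are amortized over the three solves.
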
T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; - // host data parameters - locationT m_allocated; - IndexT* m_h_row_ptr; - IndexT* m_h_col_idx; - T* m_h_val; + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; // lower level API parameters csrcholInfo_t m_chol_info; size_t m_internalDataInBytes; size_t m_workspaceInBytes; void* m_chol_buffer; + + // purmutation array + IndexT* m_h_permute; + IndexT* m_d_permute; + + // permuted CSR matrix + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + IndexT* m_d_p_row_ptr; + IndexT* m_d_p_col_idx; + T* m_d_p_val; + + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file From 882b40693150995e344577c82ddedb5b9ac31d4b Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Mar 2023 12:06:08 -0700 Subject: [PATCH 31/44] add reordring and debugging --- include/rxmesh/matrix/sparse_matrix.cuh | 260 ++++++++++++++++++++--- tests/RXMesh_test/test_sparse_matrix.cuh | 81 ++++++- 2 files changed, 301 insertions(+), 40 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 8f05fbd7..41b31f2b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -122,7 +122,8 @@ struct SparseMatrix m_descr(NULL), m_spdescr(NULL), m_spmm_buffer_size(0), - m_spmv_buffer_size(0) + m_spmv_buffer_size(0), + m_use_reorder(false) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -636,8 +637,151 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ + void spmat_chol_reorder(rxmesh::Reorder reorder) + { + if (reorder == Reorder::NONE) { + RXMESH_INFO("None reordering is specified", + "Continue without reordering"); + m_use_reorder = false; + return; + } + + m_use_reorder = true; + + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + + if (reorder == Reorder::SYMRCM) { + CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::SYMAMD) { + CUSOLVER_ERROR(cusolverSpXcsrsymamdHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::NSTDIS) { + CUSOLVER_ERROR(cusolverSpXcsrmetisndHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + NULL, + m_h_permute)); + } + + // working space for permutation: B = A*Q*A^T + size_t size_perm = 0; + void* perm_buffer_cpu = NULL; + + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + &size_perm)); + + perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + assert(NULL != perm_buffer_cpu); + + IndexT* h_mapBfromQ = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_mapBfromQ = CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + // do the permutation which works only on the col and row indices + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + h_mapBfromQ, + perm_buffer_cpu)); + + // allocate the purmutated csr and copy 
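// ----------------------------------------------------------------------
// All three reordering backends used above (SYMRCM, SYMAMD, METIS nested
// dissection) share one host-side shape: they read a host CSR *pattern*
// and return a permutation p where, loosely, row i of the reordered
// matrix is row p[i] of the original. A standalone SYMRCM example on a
// small symmetric pattern (error checking omitted for brevity):
#include <cusolverSp.h>
#include <cusparse.h>
#include <cstdio>

int main()
{
    const int n = 4, nnz = 8;
    int row_ptr[n + 1] = {0, 2, 4, 6, 8};
    int col_idx[nnz]   = {0, 3, 1, 2, 1, 2, 0, 3};

    cusolverSpHandle_t handle;
    cusolverSpCreate(&handle);
    cusparseMatDescr_t descr;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    int p[n];
    cusolverSpXcsrsymrcmHost(handle, n, nnz, descr, row_ptr, col_idx, p);

    for (int i = 0; i < n; ++i)
        printf("reordered row %d <- original row %d\n", i, p[i]);

    cusparseDestroyMatDescr(descr);
    cusolverSpDestroy(handle);
    return 0;
}
// ----------------------------------------------------------------------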
from the host + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, + m_h_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // do the permutation for val indices + cusparseDnVecDescr_t val_values; + cusparseSpVecDescr_t val_permutation; + + CUDA_ERROR(cudaMemcpyAsync(d_mapBfromQ, + h_mapBfromQ, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + CUSPARSE_ERROR(cusparseCreateSpVec(&val_permutation, + m_nnz, + m_nnz, + d_mapBfromQ, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &val_values, m_nnz, m_d_solver_val, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, val_permutation, val_values)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permutation)); + + free(perm_buffer_cpu); + free(h_mapBfromQ); + CUDA_ERROR(cudaFree(d_mapBfromQ)); + + // restore the host data back to the original + if (on_host) { + move(DEVICE, HOST); + } else { + release(HOST); + } + } + void spmat_chol_analysis() { + if (!m_use_reorder) { + m_d_solver_row_ptr = m_d_row_ptr; + m_d_solver_col_idx = m_d_col_idx; + m_d_solver_val = m_d_val; + } + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); m_internalDataInBytes = 0; m_workspaceInBytes = 0; @@ -645,8 +789,8 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info)); } @@ -657,9 +801,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, &m_internalDataInBytes, &m_workspaceInBytes)); @@ -670,9 +814,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, &m_internalDataInBytes, &m_workspaceInBytes)); @@ -693,9 +837,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, m_chol_buffer)); } @@ -704,9 +848,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, m_chol_buffer)); } @@ -726,25 +870,75 @@ struct SparseMatrix void spmat_chol_solve(T* d_b, T* d_x) { + + T* d_solver_b; + T* d_solver_x; + + if (m_use_reorder) { + /* purmute b and x*/ + CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + + cusparseDnVecDescr_t b_values; + cusparseSpVecDescr_t b_permutation; + + CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, + m_row_size, + m_row_size, + m_d_permute, + d_b, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&b_values, m_nnz, d_solver_b, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, b_permutation, b_values)); + 
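// ----------------------------------------------------------------------
// What the csrperm pair above computes, in isolation:
// cusolverSpXcsrpermHost permutes only the sparsity pattern, in place;
// the values travel separately through `map`, which the caller seeds with
// the identity (the cuSOLVER low-level contract) and which comes back
// holding the mapping valB[k] = valA[map[k]]. Sketch assuming an existing
// handle/descr and host CSR arrays h_row_ptr/h_col_idx/h_val of an
// n x n matrix, with permutation h_p:
size_t perm_buf_size = 0;
cusolverSpXcsrperm_bufferSizeHost(handle, n, n, nnz, descr,
                                  h_row_ptr, h_col_idx, h_p, h_p,
                                  &perm_buf_size);
void* perm_buf = malloc(perm_buf_size);

int* map = (int*)malloc(nnz * sizeof(int));
for (int k = 0; k < nnz; ++k)
    map[k] = k;  // identity; overwritten with the value mapping

cusolverSpXcsrpermHost(handle, n, n, nnz, descr,
                       h_row_ptr, h_col_idx,  // overwritten: pattern of B
                       h_p, h_p, map, perm_buf);

float* valB = (float*)malloc(nnz * sizeof(float));
for (int k = 0; k < nnz; ++k)
    valB[k] = h_val[map[k]];  // apply the recorded mapping to the values

free(perm_buf);
free(map);

// Dense reference for the same operation, B = A(p, p), i.e.
// B(i, j) = A(p[i], p[j]); handy for verifying the CSR path in a test:
void sym_permute_dense(const float* A, const int* p, float* B, int n)
{
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            B[i * n + j] = A[p[i] * n + p[j]];
}
// ----------------------------------------------------------------------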
CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); + + cusparseDnVecDescr_t x_values; + cusparseSpVecDescr_t x_permutation; + + CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, + m_col_size, + m_col_size, + m_d_permute, + d_x, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&x_values, m_nnz, d_solver_x, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, x_permutation, x_values)); + CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(x_permutation)); + } else { + d_solver_b = d_b; + d_solver_x = d_x; + } + if constexpr (std::is_same_v) { CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, m_row_size, - d_b, - d_x, + d_solver_b, + d_solver_x, m_chol_info, m_chol_buffer)); } + if constexpr (std::is_same_v) { CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, m_row_size, - d_b, - d_x, + d_solver_b, + d_solver_x, m_chol_info, m_chol_buffer)); } } - /* Host compatibility */ + /* Host data compatibility */ void move(locationT source, locationT target, cudaStream_t stream = NULL) { @@ -812,6 +1006,8 @@ struct SparseMatrix { if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { free(m_h_val); + free(m_h_row_ptr); + free(m_h_col_idx); m_h_val = nullptr; m_h_row_ptr = nullptr; m_h_col_idx = nullptr; @@ -832,9 +1028,10 @@ struct SparseMatrix if ((location & HOST) == HOST) { release(HOST); - m_h_val = static_cast(malloc(m_nnz * sizeof(T))); - m_h_row_ptr = static_cast(malloc(m_row_size * sizeof(IndexT))); - m_d_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = + static_cast(malloc(m_row_size * sizeof(IndexT))); + m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); m_allocated = m_allocated | HOST; } @@ -844,7 +1041,7 @@ struct SparseMatrix CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); CUDA_ERROR( - cudaMalloc((void**)&m_h_row_ptr, m_row_size * sizeof(IndexT))); + cudaMalloc((void**)&m_d_row_ptr, m_row_size * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); @@ -864,6 +1061,7 @@ struct SparseMatrix IndexT m_col_size; IndexT m_nnz; + // device csr data IndexT* m_d_row_ptr; IndexT* m_d_col_idx; T* m_d_val; @@ -887,13 +1085,15 @@ struct SparseMatrix IndexT* m_h_permute; IndexT* m_d_permute; - // permuted CSR matrix - // equal to the original matrix if not permutated - // only allocated as a new CSR matrix if permutated - IndexT* m_d_p_row_ptr; - IndexT* m_d_p_col_idx; - T* m_d_p_val; + // CSR matrix for solving only + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + IndexT* m_d_solver_row_ptr; + IndexT* m_d_solver_col_idx; + T* m_d_solver_val; + // flags + bool m_use_reorder; locationT m_allocated; }; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 169fc498..09ff49aa 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -41,8 +41,7 @@ __global__ static void sparse_mat_edge_len_test( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -102,8 +101,7 @@ __global__ static void simple_A_X_B_setup(const 
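// ----------------------------------------------------------------------
// Index form of the permuted solve that spmat_chol_solve implements: with
// the factorization done on B = Q*A*Q^T, solving A x = b takes three
// steps (p is the permutation used to build B):
//
//   1) gather the right-hand side :  b_perm[i] = b[p[i]]
//   2) solve                      :  B y = b_perm
//   3) scatter the solution back  :  x[p[i]]  = y[i]
//
// Host reference for steps 1 and 3 (the device code above does the same
// with permutation descriptors, later patches switch it to thrust):
for (int i = 0; i < n; ++i) b_perm[i] = b[p[i]];   // step 1 (gather)
/* ... step 2: csrchol solve on the permuted system ... */
for (int i = 0; i < n; ++i) x[p[i]] = y[i];        // step 3 (scatter)
// ----------------------------------------------------------------------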
rxmesh::Context context, uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; B_mat(row_index, 0) = iter.size() * 7.4f; B_mat(row_index, 1) = iter.size() * 2.6f; @@ -322,14 +320,77 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) std::vector h_ret_mat(num_vertices); CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free(); +} + +TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) +{ + using namespace rxmesh; + + // Select device + cuda_query(0); + + // generate rxmesh obj + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); + + uint32_t num_vertices = rxmesh.get_num_vertices(); + + const uint32_t threads = 256; + const uint32_t blocks = DIVIDE_UP(num_vertices, threads); + + auto coords = rxmesh.get_input_vertex_coordinates(); + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + DenseMatrix ret_mat(num_vertices, 3); + + float time_step = 1.f; + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)simple_A_X_B_setup); + + simple_A_X_B_setup<<>>( + rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); std::vector h_B_mat(num_vertices); CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - B_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { From 41d47f108ec9f4ffd7c7ae07e769e0917984f179 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Mar 2023 21:19:18 -0700 Subject: [PATCH 32/44] lower api work, reorder debug --- include/rxmesh/matrix/sparse_matrix.cuh | 102 +++++++++++++++++++---- tests/RXMesh_test/test_sparse_matrix.cuh | 11 ++- 2 files changed, 95 insertions(+), 18 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 41b31f2b..409c8de7 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -306,7 +306,7 @@ struct SparseMatrix return m_d_val[idx]; } - void free() + void free_mat() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); @@ -637,17 +637,90 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ + void spmat_chol_test() + { + // cholesky csr info + csrcholInfo_t chol_info; + 
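// ----------------------------------------------------------------------
// How the pieces are meant to be driven, mirroring the test above: one
// analysis, one buffer allocation and one factorization, then one solve
// per right-hand-side column. Sketch; A_mat/B_mat/X_mat as in the test:
A_mat.spmat_chol_analysis();
A_mat.spmat_chol_buffer_alloc();
A_mat.spmat_chol_factor();
for (int i = 0; i < B_mat.m_col_size; ++i)
    A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
A_mat.spmat_chol_buffer_free();
// ----------------------------------------------------------------------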
CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&chol_info)); + + // cholesky analysis + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_row_ptr, + m_d_col_idx, + chol_info)); + + size_t internalDataInBytes = 0; + size_t workspaceInBytes = 0; + + // allocate cholesky buffer + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + chol_info, + &internalDataInBytes, + &workspaceInBytes)); + + void* chol_buffer; + CUDA_ERROR(cudaMalloc((void**)&chol_buffer, workspaceInBytes)); + + + // cholesky factor + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + chol_info, + chol_buffer)); + + double tol = 1.0e-8; + int singularity; + + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, chol_info, tol, &singularity)); + if (0 <= singularity) { + printf("WARNING: the matrix is singular at row %d under tol (%E)\n", + singularity, + tol); + } + } + void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { RXMESH_INFO("None reordering is specified", "Continue without reordering"); m_use_reorder = false; + + if (m_reorder_allocated) { + GPU_FREE(m_d_val); + GPU_FREE(m_h_row_ptr); + GPU_FREE(m_h_col_idx); + m_reorder_allocated = false; + } + return; } m_use_reorder = true; + // allocate the purmutated csr + m_reorder_allocated = true; + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + /*check on host*/ bool on_host = true; if ((HOST & m_allocated) != HOST) { @@ -702,8 +775,8 @@ struct SparseMatrix IndexT* h_mapBfromQ = static_cast(malloc(m_nnz * sizeof(IndexT))); - IndexT* d_mapBfromQ = CUDA_ERROR( - cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + IndexT* d_mapBfromQ; + CUDA_ERROR(cudaMalloc((void**)&d_mapBfromQ, m_nnz * sizeof(IndexT))); // do the permutation which works only on the col and row indices CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, @@ -718,13 +791,7 @@ struct SparseMatrix h_mapBfromQ, perm_buffer_cpu)); - // allocate the purmutated csr and copy from the host - CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); - CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, - m_row_size * sizeof(IndexT))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); - + // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, m_nnz * sizeof(T), @@ -858,8 +925,14 @@ struct SparseMatrix double tol = 1.0e-8; int singularity; - CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( - m_cusolver_sphandle, m_chol_info, tol, &singularity)); + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } if (0 <= singularity) { RXMESH_WARN( "WARNING: the matrix is singular at row {} under tol ({})", @@ -1017,8 +1090,8 @@ struct SparseMatrix if (((location & DEVICE) == DEVICE) && ((m_allocated & DEVICE) == DEVICE)) { GPU_FREE(m_d_val); - GPU_FREE(m_h_row_ptr); - GPU_FREE(m_h_col_idx); + GPU_FREE(m_d_row_ptr); + GPU_FREE(m_d_col_idx); m_allocated = m_allocated & 
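// ----------------------------------------------------------------------
// The float/double branches above all follow one pattern: cuSOLVER ships
// a separate entry point per precision, and `if constexpr` picks one at
// compile time with no runtime cost. Reduced to its skeleton (a sketch;
// the helper name is hypothetical):
#include <type_traits>

template <typename T>
void chol_zero_pivot(cusolverSpHandle_t h,
                     csrcholInfo_t      info,
                     double             tol,
                     int*               position)
{
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "only float/double are supported");
    if constexpr (std::is_same_v<T, float>)
        cusolverSpScsrcholZeroPivot(h, info, (float)tol, position);
    else
        cusolverSpDcsrcholZeroPivot(h, info, tol, position);
}
// ----------------------------------------------------------------------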
(~DEVICE); } } @@ -1088,6 +1161,7 @@ struct SparseMatrix // CSR matrix for solving only // equal to the original matrix if not permutated // only allocated as a new CSR matrix if permutated + bool m_reorder_allocated; IndexT* m_d_solver_row_ptr; IndexT* m_d_solver_col_idx; T* m_d_solver_val; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 09ff49aa..e202d36d 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -196,7 +196,7 @@ TEST(RXMeshStatic, SparseMatrix) CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); CUDA_ERROR(cudaFree(vet_degree)); - spmat.free(); + spmat.free_mat(); } /* First replace the sparse matrix entry with the edge length and then do spmv @@ -267,7 +267,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) CUDA_ERROR(cudaFree(d_arr_ref)); CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); - spmat.free(); + spmat.free_mat(); } /* set up a simple AX=B system where A is a sparse matrix, B and C are dense @@ -334,7 +334,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } - A_mat.free(); + A_mat.free_mat(); } TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) @@ -371,6 +371,9 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + // A_mat.spmat_chol_test(); + A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); A_mat.spmat_chol_factor(); @@ -397,5 +400,5 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } - A_mat.free(); + A_mat.free_mat(); } \ No newline at end of file From 9bb2ac8ef24c248019923ecf9399c243c689b989 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Tue, 28 Mar 2023 05:16:30 -0700 Subject: [PATCH 33/44] sync lab PC --- include/rxmesh/matrix/sparse_matrix.cuh | 74 ++++++++++++++----------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 409c8de7..42d3a157 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -123,7 +123,8 @@ struct SparseMatrix m_spdescr(NULL), m_spmm_buffer_size(0), m_spmv_buffer_size(0), - m_use_reorder(false) + m_use_reorder(false), + m_allocated(LOCATION_NONE) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -711,6 +712,13 @@ struct SparseMatrix return; } + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + m_use_reorder = true; // allocate the purmutated csr @@ -721,12 +729,9 @@ struct SparseMatrix CUDA_ERROR( cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); - /*check on host*/ - bool on_host = true; - if ((HOST & m_allocated) != HOST) { - move(DEVICE, HOST); - on_host = false; - } + m_h_permute = (IndexT*)malloc(m_row_size * sizeof(IndexT)); + CUDA_ERROR( + cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, @@ -755,7 +760,19 @@ struct SparseMatrix m_h_permute)); } + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, + m_h_permute, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + // working space for permutation: B = A*Q*A^T + // 
the permutation for matrix A which works only for the col and row + // indices, the val will be done on device with the d/h_val_permute + IndexT* h_val_permute = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_val_permute; + CUDA_ERROR(cudaMalloc((void**)&d_val_permute, m_nnz * sizeof(IndexT))); + size_t size_perm = 0; void* perm_buffer_cpu = NULL; @@ -771,14 +788,7 @@ struct SparseMatrix &size_perm)); perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - assert(NULL != perm_buffer_cpu); - IndexT* h_mapBfromQ = - static_cast(malloc(m_nnz * sizeof(IndexT))); - IndexT* d_mapBfromQ; - CUDA_ERROR(cudaMalloc((void**)&d_mapBfromQ, m_nnz * sizeof(IndexT))); - - // do the permutation which works only on the col and row indices CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -788,7 +798,7 @@ struct SparseMatrix m_h_col_idx, m_h_permute, m_h_permute, - h_mapBfromQ, + h_val_permute, perm_buffer_cpu)); // copy the purmutated csr from the host @@ -805,33 +815,33 @@ struct SparseMatrix m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - // do the permutation for val indices - cusparseDnVecDescr_t val_values; - cusparseSpVecDescr_t val_permutation; + // do the permutation for val indices on device + cusparseDnVecDescr_t val_final_desc; + cusparseSpVecDescr_t val_permute_desc; - CUDA_ERROR(cudaMemcpyAsync(d_mapBfromQ, - h_mapBfromQ, + CUDA_ERROR(cudaMemcpyAsync(d_val_permute, + h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - CUSPARSE_ERROR(cusparseCreateSpVec(&val_permutation, + CUSPARSE_ERROR(cusparseCreateSpVec(&val_permute_desc, m_nnz, m_nnz, - d_mapBfromQ, + d_val_permute, m_d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR(cusparseCreateDnVec( - &val_values, m_nnz, m_d_solver_val, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, val_permutation, val_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(val_permutation)); + &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseScatter( + m_cusparse_handle, val_permute_desc, val_final_desc)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); free(perm_buffer_cpu); - free(h_mapBfromQ); - CUDA_ERROR(cudaFree(d_mapBfromQ)); + free(h_val_permute); + // CUDA_ERROR(cudaFree(d_val_permute)); // restore the host data back to the original if (on_host) { @@ -1017,7 +1027,7 @@ struct SparseMatrix { if (source == target) { RXMESH_WARN( - "DenseMatrix::move() source ({}) and target ({}) " + "SparseMatrix::move() source ({}) and target ({}) " "are the same.", location_to_string(source), location_to_string(target)); @@ -1027,7 +1037,7 @@ struct SparseMatrix if ((source == HOST || source == DEVICE) && ((source & m_allocated) != source)) { RXMESH_ERROR( - "DenseMatrix::move() moving source is not valid" + "SparseMatrix::move() moving source is not valid" " because it was not allocated on source i.e., {}", location_to_string(source)); } @@ -1035,7 +1045,7 @@ struct SparseMatrix if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && ((target & m_allocated) != target)) { RXMESH_WARN( - "DenseMatrix::move() allocating target before moving to {}", + "SparseMatrix::move() allocating target before moving to {}", location_to_string(target)); allocate(target); } From 69207d29c84c3edcd181caade9f00a40dcf11649 Mon Sep 17 00:00:00 2001 From: ericycc Date: Wed, 29 Mar 2023 12:09:44 -0700 Subject: [PATCH 34/44] test 
reordering error --- include/rxmesh/matrix/dense_matrix.cuh | 2 + include/rxmesh/matrix/sparse_matrix.cuh | 184 +++++++++++++++-------- tests/RXMesh_test/test_sparse_matrix.cuh | 58 +++---- 3 files changed, 156 insertions(+), 88 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index f264d426..19c77f7f 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -20,6 +20,7 @@ struct DenseMatrix m_allocated(LOCATION_NONE) { CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + m_allocated = m_allocated | DEVICE; CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, m_row_size, @@ -28,6 +29,7 @@ struct DenseMatrix m_d_val, CUDA_R_32F, CUSPARSE_ORDER_COL)); + } void set_ones() diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 42d3a157..c137ae6b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -191,7 +191,7 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function - CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); CUSPARSE_ERROR( @@ -213,6 +213,8 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + m_allocated = m_allocated | DEVICE; } void set_ones() @@ -638,61 +640,83 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test() + void spmat_chol_test_reorder() { - // cholesky csr info - csrcholInfo_t chol_info; - CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&chol_info)); - - // cholesky analysis - CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_row_ptr, - m_d_col_idx, - chol_info)); - - size_t internalDataInBytes = 0; - size_t workspaceInBytes = 0; - - // allocate cholesky buffer - CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, - chol_info, - &internalDataInBytes, - &workspaceInBytes)); - - void* chol_buffer; - CUDA_ERROR(cudaMalloc((void**)&chol_buffer, workspaceInBytes)); - - - // cholesky factor - CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, - chol_info, - chol_buffer)); - - double tol = 1.0e-8; - int singularity; + } - CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( - m_cusolver_sphandle, chol_info, tol, &singularity)); - if (0 <= singularity) { - printf("WARNING: the matrix is singular at row %d under tol (%E)\n", - singularity, - tol); + void spmat_chol_test_purmute() + { + // Host problem definition + int size = 8; + int nnz = 4; + int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; + float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; + float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; + //-------------------------------------------------------------------------- + // Device memory management + int* dX_indices; + float *dY, *dX_values; + CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); + CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); + CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); + + CUDA_ERROR(cudaMemcpy(dX_indices, + hX_indices, + size * sizeof(int), + 
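// ----------------------------------------------------------------------
// Semantics of the cusparseScatter call this example builds up to: for a
// sparse vector X (indices + values) and a dense vector Y it performs
//     Y[X_index[k]] = X_value[k]   for every stored entry k,
// which is exactly what the expected result in this example encodes:
//     Y[0]=5, Y[3]=6, Y[4]=7, Y[7]=8, Y[1]=1, Y[2]=2, Y[5]=3, Y[6]=4.
// Host reference loop over the example's arrays:
for (int k = 0; k < size; ++k)
    hY_ref[hX_indices[k]] = hX_values[k];
// ----------------------------------------------------------------------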
cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpy(dX_values, + hX_values, + size * sizeof(float), + cudaMemcpyHostToDevice)); + CUDA_ERROR( + cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); + //-------------------------------------------------------------------------- + // CUSPARSE APIs + cusparseHandle_t handle = NULL; + cusparseSpVecDescr_t vecX; + cusparseDnVecDescr_t vecY; + CUSPARSE_ERROR(cusparseCreate(&handle)); + // Create sparse vector X + CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, + size, + size, + dX_indices, + dX_values, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + // Create dense vector y + CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); + + // execute Scatter + CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); + + // destroy matrix/vector descriptors + CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); + CUSPARSE_ERROR(cusparseDestroy(handle)); + //-------------------------------------------------------------------------- + // device result check + CUDA_ERROR( + cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); + int correct = 1; + for (int i = 0; i < size; i++) { + RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); + if (hY[i] != hY_result[i]) { + correct = 0; + break; + } } + if (correct) + printf("scatter_example test PASSED\n"); + else + printf("scatter_example test FAILED: wrong result\n"); + //-------------------------------------------------------------------------- + // device memory deallocation + CUDA_ERROR(cudaFree(dX_indices)); + CUDA_ERROR(cudaFree(dX_values)); + CUDA_ERROR(cudaFree(dY)); } void spmat_chol_reorder(rxmesh::Reorder reorder) @@ -703,9 +727,9 @@ struct SparseMatrix m_use_reorder = false; if (m_reorder_allocated) { - GPU_FREE(m_d_val); - GPU_FREE(m_h_row_ptr); - GPU_FREE(m_h_col_idx); + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); m_reorder_allocated = false; } @@ -733,6 +757,27 @@ struct SparseMatrix CUDA_ERROR( cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + RXMESH_INFO("nnz {} - size_row {} - size_col {}", + m_nnz, + m_row_size, + m_col_size); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_row_ptr[0], + m_h_row_ptr[1], + m_h_row_ptr[2], + m_h_row_ptr[3], + m_h_row_ptr[4]); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_col_idx[0], + m_h_col_idx[1], + m_h_col_idx[2], + m_h_col_idx[3], + m_h_col_idx[4]); + if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, m_row_size, @@ -760,6 +805,13 @@ struct SparseMatrix m_h_permute)); } + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_permute[0], + m_h_permute[1], + m_h_permute[2], + m_h_permute[3], + m_h_permute[4]); + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, m_row_size * sizeof(IndexT), @@ -789,6 +841,11 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + RXMESH_INFO("size_perm {} - size_row {} - size_col {}", + size_perm, + m_row_size, + m_col_size); + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -801,6 +858,14 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + h_val_permute[0], + h_val_permute[1], + h_val_permute[2], + h_val_permute[3], + h_val_permute[4]); + // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, @@ -816,8 +881,8 @@ 
struct SparseMatrix cudaMemcpyHostToDevice)); // do the permutation for val indices on device - cusparseDnVecDescr_t val_final_desc; cusparseSpVecDescr_t val_permute_desc; + cusparseDnVecDescr_t val_final_desc; CUDA_ERROR(cudaMemcpyAsync(d_val_permute, h_val_permute, @@ -836,12 +901,13 @@ struct SparseMatrix &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); CUSPARSE_ERROR(cusparseScatter( m_cusparse_handle, val_permute_desc, val_final_desc)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); free(perm_buffer_cpu); free(h_val_permute); - // CUDA_ERROR(cudaFree(d_val_permute)); + CUDA_ERROR(cudaFree(d_val_permute)); // restore the host data back to the original if (on_host) { diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index e202d36d..6d7db9e8 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -372,33 +372,33 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - // A_mat.spmat_chol_test(); - A_mat.spmat_chol_reorder(Reorder::NSTDIS); - A_mat.spmat_chol_analysis(); - A_mat.spmat_chol_buffer_alloc(); - A_mat.spmat_chol_factor(); - - for (int i = 0; i < B_mat.m_col_size; ++i) { - A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); - } - - A_mat.denmat_mul(X_mat, ret_mat); - - std::vector h_ret_mat(num_vertices); - CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); - std::vector h_B_mat(num_vertices); - CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - B_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); - - for (uint32_t i = 0; i < num_vertices; ++i) { - for (uint32_t j = 0; j < 3; ++j) { - EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); - } - } - A_mat.free_mat(); + A_mat.spmat_chol_test_purmute(); + A_mat.spmat_chol_reorder(Reorder::SYMRCM); + // A_mat.spmat_chol_analysis(); + // A_mat.spmat_chol_buffer_alloc(); + // A_mat.spmat_chol_factor(); + + // for (int i = 0; i < B_mat.m_col_size; ++i) { + // A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + // } + + // A_mat.denmat_mul(X_mat, ret_mat); + + // std::vector h_ret_mat(num_vertices); + // CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + // ret_mat.data(), + // num_vertices * 3 * sizeof(float), + // cudaMemcpyDeviceToHost)); + // std::vector h_B_mat(num_vertices); + // CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + // B_mat.data(), + // num_vertices * 3 * sizeof(float), + // cudaMemcpyDeviceToHost)); + + // for (uint32_t i = 0; i < num_vertices; ++i) { + // for (uint32_t j = 0; j < 3; ++j) { + // EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + // } + // } + // A_mat.free_mat(); } \ No newline at end of file From 41fb9d7b7257a26447cf9064714c20c0978e2201 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Wed, 29 Mar 2023 21:38:25 -0700 Subject: [PATCH 35/44] val_permute_bug --- include/rxmesh/matrix/sparse_matrix.cuh | 52 +++++++++++++-------- tests/RXMesh_test/test_sparse_matrix.cuh | 58 ++++++++++++------------ 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index c137ae6b..c424d1a3 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -640,7 +640,7 @@ struct 
SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_reorder() + void spmat_chol_test_val_reorder() { } @@ -749,7 +749,7 @@ struct SparseMatrix m_reorder_allocated = true; CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, - m_row_size * sizeof(IndexT))); + (m_row_size + 1) * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); @@ -764,14 +764,14 @@ struct SparseMatrix m_row_size, m_col_size); - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", m_h_row_ptr[0], m_h_row_ptr[1], m_h_row_ptr[2], m_h_row_ptr[3], m_h_row_ptr[4]); - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", m_h_col_idx[0], m_h_col_idx[1], m_h_col_idx[2], @@ -805,7 +805,7 @@ struct SparseMatrix m_h_permute)); } - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", m_h_permute[0], m_h_permute[1], m_h_permute[2], @@ -828,6 +828,20 @@ struct SparseMatrix size_t size_perm = 0; void* perm_buffer_cpu = NULL; + RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + m_h_row_ptr[0], + m_h_row_ptr[1], + m_h_row_ptr[2], + m_h_row_ptr[3], + m_h_row_ptr[4]); + + RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + m_h_col_idx[0], + m_h_col_idx[1], + m_h_col_idx[2], + m_h_col_idx[3], + m_h_col_idx[4]); + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -841,10 +855,11 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - RXMESH_INFO("size_perm {} - size_row {} - size_col {}", - size_perm, - m_row_size, - m_col_size); + RXMESH_INFO("size_perm {}", size_perm); + + for (int j = 0; j < m_nnz; j++) { + h_val_permute[j] = j; + } CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, @@ -858,8 +873,7 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", h_val_permute[0], h_val_permute[1], h_val_permute[2], @@ -873,7 +887,7 @@ struct SparseMatrix cudaMemcpyHostToDevice)); CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, m_h_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyHostToDevice)); CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, m_h_col_idx, @@ -1040,7 +1054,7 @@ struct SparseMatrix CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR( - cusparseCreateDnVec(&b_values, m_nnz, d_solver_b, CUDA_R_32F)); + cusparseCreateDnVec(&b_values, m_row_size, d_solver_b, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, b_permutation, b_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); @@ -1058,7 +1072,7 @@ struct SparseMatrix CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR( - cusparseCreateDnVec(&x_values, m_nnz, d_solver_x, CUDA_R_32F)); + cusparseCreateDnVec(&x_values, m_col_size, d_solver_x, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, x_permutation, x_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); @@ -1124,7 +1138,7 @@ struct SparseMatrix stream)); CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, m_h_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyHostToDevice, stream)); CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, @@ -1140,7 +1154,7 @@ struct SparseMatrix stream)); CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, m_d_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyDeviceToHost, 
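// ----------------------------------------------------------------------
// The (m_row_size + 1) fixes above all follow from the CSR invariant:
// row_ptr has one entry per row plus a final sentinel equal to nnz, so
// that row i occupies the half-open range [row_ptr[i], row_ptr[i+1]).
// For a 3x3 matrix whose 4 nonzeros sit in rows {0, 0, 2, 2}:
int row_ptr[3 + 1] = {0, 2, 2, 4};  // 4 entries for 3 rows; last == nnz
// ----------------------------------------------------------------------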
stream)); CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, @@ -1179,7 +1193,7 @@ struct SparseMatrix m_h_val = static_cast(malloc(m_nnz * sizeof(T))); m_h_row_ptr = - static_cast(malloc(m_row_size * sizeof(IndexT))); + static_cast(malloc((m_row_size + 1) * sizeof(IndexT))); m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); m_allocated = m_allocated | HOST; @@ -1189,8 +1203,8 @@ struct SparseMatrix release(DEVICE); CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_row_ptr, m_row_size * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 6d7db9e8..5df3b8e2 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -372,33 +372,33 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - A_mat.spmat_chol_test_purmute(); - A_mat.spmat_chol_reorder(Reorder::SYMRCM); - // A_mat.spmat_chol_analysis(); - // A_mat.spmat_chol_buffer_alloc(); - // A_mat.spmat_chol_factor(); - - // for (int i = 0; i < B_mat.m_col_size; ++i) { - // A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); - // } - - // A_mat.denmat_mul(X_mat, ret_mat); - - // std::vector h_ret_mat(num_vertices); - // CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - // ret_mat.data(), - // num_vertices * 3 * sizeof(float), - // cudaMemcpyDeviceToHost)); - // std::vector h_B_mat(num_vertices); - // CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - // B_mat.data(), - // num_vertices * 3 * sizeof(float), - // cudaMemcpyDeviceToHost)); - - // for (uint32_t i = 0; i < num_vertices; ++i) { - // for (uint32_t j = 0; j < 3; ++j) { - // EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); - // } - // } - // A_mat.free_mat(); + // A_mat.spmat_chol_test_purmute(); + // A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free_mat(); } \ No newline at end of file From a58e54a0b2b3f9be9f580be8cf53e9142776291a Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 31 Mar 2023 12:44:40 -0700 Subject: [PATCH 36/44] request debug --- include/rxmesh/matrix/sparse_matrix.cuh | 138 ++++++++++++++--------- tests/RXMesh_test/test_sparse_matrix.cuh | 29 ++++- 2 files changed, 111 insertions(+), 56 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index c424d1a3..87115869 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -640,10 +640,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_val_reorder() - { - } - void 
spmat_chol_test_purmute() { // Host problem definition @@ -759,24 +755,31 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); - RXMESH_INFO("nnz {} - size_row {} - size_col {}", - m_nnz, - m_row_size, - m_col_size); - - RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - m_h_row_ptr[0], - m_h_row_ptr[1], - m_h_row_ptr[2], - m_h_row_ptr[3], - m_h_row_ptr[4]); - - RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - m_h_col_idx[0], - m_h_col_idx[1], - m_h_col_idx[2], - m_h_col_idx[3], - m_h_col_idx[4]); + // RXMESH_INFO("nnz {} - size_row {} - size_col {}", + // m_nnz, + // m_row_size, + // m_col_size); + + // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + // m_h_row_ptr[0], + // m_h_row_ptr[1], + // m_h_row_ptr[2], + // m_h_row_ptr[3], + // m_h_row_ptr[4]); + + // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + // m_h_col_idx[0], + // m_h_col_idx[1], + // m_h_col_idx[2], + // m_h_col_idx[3], + // m_h_col_idx[4]); + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, @@ -805,12 +808,12 @@ struct SparseMatrix m_h_permute)); } - RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", - m_h_permute[0], - m_h_permute[1], - m_h_permute[2], - m_h_permute[3], - m_h_permute[4]); + // RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", + // m_h_permute[0], + // m_h_permute[1], + // m_h_permute[2], + // m_h_permute[3], + // m_h_permute[4]); CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, @@ -828,20 +831,6 @@ struct SparseMatrix size_t size_perm = 0; void* perm_buffer_cpu = NULL; - RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - m_h_row_ptr[0], - m_h_row_ptr[1], - m_h_row_ptr[2], - m_h_row_ptr[3], - m_h_row_ptr[4]); - - RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - m_h_col_idx[0], - m_h_col_idx[1], - m_h_col_idx[2], - m_h_col_idx[3], - m_h_col_idx[4]); - CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -873,12 +862,51 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", - h_val_permute[0], - h_val_permute[1], - h_val_permute[2], - h_val_permute[3], - h_val_permute[4]); + // RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", + // h_val_permute[0], + // h_val_permute[1], + // h_val_permute[2], + // h_val_permute[3], + // h_val_permute[4]); + + // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + // m_h_row_ptr[0], + // m_h_row_ptr[1], + // m_h_row_ptr[2], + // m_h_row_ptr[3], + // m_h_row_ptr[4]); + + // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + // m_h_col_idx[0], + // m_h_col_idx[1], + // m_h_col_idx[2], + // m_h_col_idx[3], + // m_h_col_idx[4]); + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); + + + // host test val permute + // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); + + // for (int j = 0; j < m_nnz; j++) { + // tmp_h_val[j] = m_h_val[j]; + // } + // for (int j = 0; j < m_nnz; j++) { + // m_h_val[j] = tmp_h_val[h_val_permute[j]]; + // } + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, @@ -1042,26 +1070,26 @@ struct SparseMatrix CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * 
sizeof(T))); - cusparseDnVecDescr_t b_values; cusparseSpVecDescr_t b_permutation; + cusparseDnVecDescr_t b_values; CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, m_row_size, m_row_size, m_d_permute, - d_b, + static_cast(d_b), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseCreateDnVec(&b_values, m_row_size, d_solver_b, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &b_values, m_row_size, d_solver_b, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, b_permutation, b_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); - cusparseDnVecDescr_t x_values; cusparseSpVecDescr_t x_permutation; + cusparseDnVecDescr_t x_values; CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, m_col_size, @@ -1071,8 +1099,8 @@ struct SparseMatrix CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseCreateDnVec(&x_values, m_col_size, d_solver_x, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &x_values, m_col_size, d_solver_x, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, x_permutation, x_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 5df3b8e2..c4063aef 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -130,6 +130,21 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +template +__global__ static void check_diag(const rxmesh::Context context, + rxmesh::SparseMatrix A_mat) +{ + using namespace rxmesh; + auto init = [&](VertexHandle& v_id, const VertexIterator& iter) { + printf("%lf\n", A_mat(v_id, v_id)); + }; + + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, init); +} + /* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) @@ -370,10 +385,22 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + LaunchBox test_launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, test_launch_box, (void*)check_diag); + + check_diag + <<>>(rxmesh.get_context(), A_mat); + + cudaDeviceSynchronize(); + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // A_mat.spmat_chol_test_purmute(); - // A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); A_mat.spmat_chol_factor(); From bdbe290065800dc6d46fbe8a611afc9607d3587c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:07:42 -0700 Subject: [PATCH 37/44] worksfine --- include/rxmesh/matrix/dense_matrix.cuh | 53 ++++--- include/rxmesh/matrix/sparse_matrix.cuh | 183 +++++++---------------- tests/RXMesh_test/test_sparse_matrix.cuh | 16 +- 3 files changed, 96 insertions(+), 156 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 19c77f7f..66ff6de1 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -10,18 +10,27 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added // only col major is supported -template +template struct DenseMatrix { 
DenseMatrix(IndexT row_size, IndexT col_size) : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL), - m_allocated(LOCATION_NONE) + m_allocated(LOCATION_NONE), + m_col_pad_bytes(0), + m_col_pad_idx(0) { - CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); m_allocated = m_allocated | DEVICE; + IndexT col_data_bytes = m_row_size * sizeof(T); + if (MemAlignSize != 0 && col_data_bytes % MemAlignSize != 0) { + m_col_pad_bytes = MemAlignSize - (col_data_bytes % MemAlignSize); + m_col_pad_idx = m_col_pad_bytes / sizeof(T); + } + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, m_row_size, m_col_size, @@ -29,17 +38,16 @@ struct DenseMatrix m_d_val, CUDA_R_32F, CUSPARSE_ORDER_COL)); - } - void set_ones() - { - std::vector init_tmp_arr(m_row_size * m_col_size, 1); - CUDA_ERROR(cudaMemcpy(m_d_val, - init_tmp_arr.data(), - bytes() * sizeof(T), - cudaMemcpyHostToDevice)); - } + // void set_ones() + // { + // std::vector init_tmp_arr(m_row_size * m_col_size, 1); + // CUDA_ERROR(cudaMemcpy(m_d_val, + // init_tmp_arr.data(), + // bytes() * sizeof(T), + // cudaMemcpyHostToDevice)); + // } IndexT lead_dim() const { @@ -52,9 +60,9 @@ struct DenseMatrix assert(col < m_col_size); #ifdef __CUDA_ARCH__ - return m_d_val[col * m_row_size + row]; + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; #else - return m_h_val[col * m_row_size + row]; + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; #endif } @@ -65,9 +73,9 @@ struct DenseMatrix assert(col < m_col_size); #ifdef __CUDA_ARCH__ - return m_d_val[col * m_row_size + row]; + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; #else - return m_h_val[col * m_row_size + row]; + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; #endif } @@ -88,11 +96,15 @@ struct DenseMatrix T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { if ((location & HOST) == HOST) { - return m_h_val + ld_idx * lead_dim(); + return m_h_val + ld_idx * (m_row_size + m_col_pad_idx); } if ((location & DEVICE) == DEVICE) { - return m_d_val + ld_idx * lead_dim(); + return m_d_val + ld_idx * (m_row_size + m_col_pad_idx); + } + + if ((location & m_allocated) == location) { + RXMESH_ERROR("Requested data not allocated on {}", location); } assert(1 != 1); @@ -101,7 +113,7 @@ struct DenseMatrix IndexT bytes() const { - return m_row_size * m_col_size * sizeof(T); + return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T); } void move(locationT source, locationT target, cudaStream_t stream = NULL) @@ -182,6 +194,9 @@ struct DenseMatrix IndexT m_col_size; T* m_d_val; T* m_h_val; + + IndexT m_col_pad_bytes; + IndexT m_col_pad_idx; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 87115869..1d11ee0e 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,6 +6,11 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "thrust/device_ptr.h" +#include "thrust/execution_policy.h" +#include "thrust/gather.h" +#include "thrust/scatter.h" + #include "cusolverSp_LOWLEVEL_PREVIEW.h" #include "rxmesh/matrix/dense_matrix.cuh" @@ -102,8 +107,43 @@ __global__ static void sparse_mat_col_fill(const rxmesh::Context context, query.dispatch(block, shrd_alloc, col_fillin); } +// d_out[d_p[i]] = d_in[i] +template +void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); 
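// ----------------------------------------------------------------------
// Worked example of the padding arithmetic above, for an assumed
// MemAlignSize of 32 bytes: a 100-row float column is 400 bytes,
// 400 % 32 == 16, so 16 pad bytes (4 floats) round every column up to a
// 32-byte boundary, and element (row, col) lives at
//     col * (m_row_size + m_col_pad_idx) + row.
// (The real constructor skips padding when the remainder is zero.)
int row_size       = 100;
int col_data_bytes = row_size * (int)sizeof(float);       // 400
int col_pad_bytes  = 32 - (col_data_bytes % 32);          // 16
int col_pad_idx    = col_pad_bytes / (int)sizeof(float);  // 4
// ----------------------------------------------------------------------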
+ + thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); +} + +// d_out[i] = d_in[d_p[i]] +template +void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); +} + } // namespace detail +__global__ void check_arr_int(int* arr, int size) { + for (int i = 0; i < size; ++i) { + printf("%d ", arr[i]); + } + printf("\n "); +} + +__global__ void check_arr_float(float* arr, int size) { + for (int i = 0; i < size; ++i) { + printf("%f ", arr[i]); + } + printf("\n "); +} // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? @@ -755,32 +795,6 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); - // RXMESH_INFO("nnz {} - size_row {} - size_col {}", - // m_nnz, - // m_row_size, - // m_col_size); - - // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - // m_h_row_ptr[0], - // m_h_row_ptr[1], - // m_h_row_ptr[2], - // m_h_row_ptr[3], - // m_h_row_ptr[4]); - - // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - // m_h_col_idx[0], - // m_h_col_idx[1], - // m_h_col_idx[2], - // m_h_col_idx[3], - // m_h_col_idx[4]); - - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, m_row_size, @@ -808,13 +822,6 @@ struct SparseMatrix m_h_permute)); } - // RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", - // m_h_permute[0], - // m_h_permute[1], - // m_h_permute[2], - // m_h_permute[3], - // m_h_permute[4]); - CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, m_row_size * sizeof(IndexT), @@ -844,8 +851,6 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - RXMESH_INFO("size_perm {}", size_perm); - for (int j = 0; j < m_nnz; j++) { h_val_permute[j] = j; } @@ -862,36 +867,8 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - // RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", - // h_val_permute[0], - // h_val_permute[1], - // h_val_permute[2], - // h_val_permute[3], - // h_val_permute[4]); - - // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - // m_h_row_ptr[0], - // m_h_row_ptr[1], - // m_h_row_ptr[2], - // m_h_row_ptr[3], - // m_h_row_ptr[4]); - - // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - // m_h_col_idx[0], - // m_h_col_idx[1], - // m_h_col_idx[2], - // m_h_col_idx[3], - // m_h_col_idx[4]); - - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - - - // host test val permute + + // do the permutation for val indices on host // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); // for (int j = 0; j < m_nnz; j++) { @@ -901,13 +878,6 @@ struct SparseMatrix // m_h_val[j] = tmp_h_val[h_val_permute[j]]; // } - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, @@ -923,33 +893,15 @@ struct SparseMatrix cudaMemcpyHostToDevice)); // do the permutation for val indices on device - cusparseSpVecDescr_t val_permute_desc; - cusparseDnVecDescr_t val_final_desc; - CUDA_ERROR(cudaMemcpyAsync(d_val_permute, h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); + + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); - 
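// ----------------------------------------------------------------------
// permute_scatter and permute_gather above are exact inverses under the
// same permutation, which is what lets the solver gather b into the
// permuted system and scatter x back out. Standalone round-trip check
// (values are arbitrary):
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>
#include <vector>

int main()
{
    std::vector<int>   hp = {2, 0, 3, 1};
    std::vector<float> hv = {10.f, 20.f, 30.f, 40.f};

    thrust::device_vector<int>   p(hp.begin(), hp.end());
    thrust::device_vector<float> in(hv.begin(), hv.end());
    thrust::device_vector<float> mid(4), out(4);

    // mid[p[i]] = in[i]  ->  mid = {20, 40, 10, 30}
    thrust::scatter(thrust::device, in.begin(), in.end(), p.begin(),
                    mid.begin());

    // out[i] = mid[p[i]] ->  out = {10, 20, 30, 40}, the original order
    thrust::gather(thrust::device, p.begin(), p.end(), mid.begin(),
                   out.begin());
    return 0;
}
// ----------------------------------------------------------------------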
CUSPARSE_ERROR(cusparseCreateSpVec(&val_permute_desc, - m_nnz, - m_nnz, - d_val_permute, - m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); - CUSPARSE_ERROR(cusparseScatter( - m_cusparse_handle, val_permute_desc, val_final_desc)); - - CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); - - free(perm_buffer_cpu); free(h_val_permute); - CUDA_ERROR(cudaFree(d_val_permute)); + GPU_FREE(d_val_permute); // restore the host data back to the original if (on_host) { @@ -1068,43 +1020,10 @@ struct SparseMatrix if (m_use_reorder) { /* purmute b and x*/ CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); - CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_b, d_solver_b, m_row_size); - cusparseSpVecDescr_t b_permutation; - cusparseDnVecDescr_t b_values; - - CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, - m_row_size, - m_row_size, - m_d_permute, - static_cast(d_b), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &b_values, m_row_size, d_solver_b, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, b_permutation, b_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); - - cusparseSpVecDescr_t x_permutation; - cusparseDnVecDescr_t x_values; - - CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, - m_col_size, - m_col_size, - m_d_permute, - d_x, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &x_values, m_col_size, d_solver_x, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, x_permutation, x_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(x_permutation)); + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_x, d_solver_x, m_row_size); } else { d_solver_b = d_b; d_solver_x = d_x; @@ -1127,6 +1046,12 @@ struct SparseMatrix m_chol_info, m_chol_buffer)); } + + if (m_use_reorder) { + detail::permute_scatter(m_d_permute, d_solver_x, d_x, m_row_size); + GPU_FREE(d_solver_b); + GPU_FREE(d_solver_x); + } } /* Host data compatibility */ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index c4063aef..4e6ab2bd 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -386,16 +386,16 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - LaunchBox test_launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, test_launch_box, (void*)check_diag); + // LaunchBox test_launch_box; + // rxmesh.prepare_launch_box( + // {Op::VV}, test_launch_box, (void*)check_diag); - check_diag - <<>>(rxmesh.get_context(), A_mat); + // check_diag + // <<>>(rxmesh.get_context(), A_mat); - cudaDeviceSynchronize(); + // cudaDeviceSynchronize(); // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); From a53ab72c7f4683e3a57a7fecd374b5e4801dcff9 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:21:01 -0700 Subject: [PATCH 38/44] clean up --- include/rxmesh/matrix/sparse_matrix.cuh | 107 +++-------------------- tests/RXMesh_test/test_sparse_matrix.cuh | 15 +--- 
2 files changed, 15 insertions(+), 107 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 1d11ee0e..4f73e405 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -112,8 +112,8 @@ template void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); } @@ -123,28 +123,14 @@ template void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); } } // namespace detail -__global__ void check_arr_int(int* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%d ", arr[i]); - } - printf("\n "); -} - -__global__ void check_arr_float(float* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%f ", arr[i]); - } - printf("\n "); -} - // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? template @@ -357,6 +343,12 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + } } /* ----- CUSPARSE SPMM & SPMV ----- */ @@ -680,81 +672,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_purmute() - { - // Host problem definition - int size = 8; - int nnz = 4; - int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; - float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; - float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; - //-------------------------------------------------------------------------- - // Device memory management - int* dX_indices; - float *dY, *dX_values; - CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); - CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); - CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); - - CUDA_ERROR(cudaMemcpy(dX_indices, - hX_indices, - size * sizeof(int), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(dX_values, - hX_values, - size * sizeof(float), - cudaMemcpyHostToDevice)); - CUDA_ERROR( - cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); - //-------------------------------------------------------------------------- - // CUSPARSE APIs - cusparseHandle_t handle = NULL; - cusparseSpVecDescr_t vecX; - cusparseDnVecDescr_t vecY; - CUSPARSE_ERROR(cusparseCreate(&handle)); - // Create sparse vector X - CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, - size, - size, - dX_indices, - dX_values, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - // Create dense vector y - CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); - - // execute Scatter - CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); - - // destroy matrix/vector descriptors - CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); - CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); - CUSPARSE_ERROR(cusparseDestroy(handle)); - 
//-------------------------------------------------------------------------- - // device result check - CUDA_ERROR( - cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); - int correct = 1; - for (int i = 0; i < size; i++) { - RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); - if (hY[i] != hY_result[i]) { - correct = 0; - break; - } - } - if (correct) - printf("scatter_example test PASSED\n"); - else - printf("scatter_example test FAILED: wrong result\n"); - //-------------------------------------------------------------------------- - // device memory deallocation - CUDA_ERROR(cudaFree(dX_indices)); - CUDA_ERROR(cudaFree(dX_values)); - CUDA_ERROR(cudaFree(dY)); - } - void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { @@ -766,6 +683,8 @@ struct SparseMatrix GPU_FREE(m_d_solver_val); GPU_FREE(m_d_solver_row_ptr); GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); m_reorder_allocated = false; } @@ -897,7 +816,7 @@ struct SparseMatrix h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); free(h_val_permute); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 4e6ab2bd..7362bf98 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -385,21 +385,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - - // LaunchBox test_launch_box; - // rxmesh.prepare_launch_box( - // {Op::VV}, test_launch_box, (void*)check_diag); - - // check_diag - // <<>>(rxmesh.get_context(), A_mat); - - // cudaDeviceSynchronize(); - // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - // A_mat.spmat_chol_test_purmute(); A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); @@ -409,6 +396,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); } + A_mat.spmat_chol_buffer_free(); + A_mat.denmat_mul(X_mat, ret_mat); std::vector h_ret_mat(num_vertices); From 8f5f563dcf36f00465a2598eb633ea5b2876e3ca Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:21:01 -0700 Subject: [PATCH 39/44] clean up --- include/rxmesh/matrix/sparse_matrix.cuh | 109 ++++------------------- tests/RXMesh_test/test_sparse_matrix.cuh | 15 +--- 2 files changed, 17 insertions(+), 107 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 1d11ee0e..c08c0ef2 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -112,8 +112,8 @@ template void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); } @@ -123,28 +123,14 @@ template void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); } } // namespace detail -__global__ void check_arr_int(int* arr, int size) { - for (int 
i = 0; i < size; ++i) { - printf("%d ", arr[i]); - } - printf("\n "); -} - -__global__ void check_arr_float(float* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%f ", arr[i]); - } - printf("\n "); -} - // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? template @@ -357,6 +343,14 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + } } /* ----- CUSPARSE SPMM & SPMV ----- */ @@ -680,81 +674,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_purmute() - { - // Host problem definition - int size = 8; - int nnz = 4; - int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; - float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; - float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; - //-------------------------------------------------------------------------- - // Device memory management - int* dX_indices; - float *dY, *dX_values; - CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); - CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); - CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); - - CUDA_ERROR(cudaMemcpy(dX_indices, - hX_indices, - size * sizeof(int), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(dX_values, - hX_values, - size * sizeof(float), - cudaMemcpyHostToDevice)); - CUDA_ERROR( - cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); - //-------------------------------------------------------------------------- - // CUSPARSE APIs - cusparseHandle_t handle = NULL; - cusparseSpVecDescr_t vecX; - cusparseDnVecDescr_t vecY; - CUSPARSE_ERROR(cusparseCreate(&handle)); - // Create sparse vector X - CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, - size, - size, - dX_indices, - dX_values, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - // Create dense vector y - CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); - - // execute Scatter - CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); - - // destroy matrix/vector descriptors - CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); - CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); - CUSPARSE_ERROR(cusparseDestroy(handle)); - //-------------------------------------------------------------------------- - // device result check - CUDA_ERROR( - cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); - int correct = 1; - for (int i = 0; i < size; i++) { - RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); - if (hY[i] != hY_result[i]) { - correct = 0; - break; - } - } - if (correct) - printf("scatter_example test PASSED\n"); - else - printf("scatter_example test FAILED: wrong result\n"); - //-------------------------------------------------------------------------- - // device memory deallocation - CUDA_ERROR(cudaFree(dX_indices)); - CUDA_ERROR(cudaFree(dX_values)); - CUDA_ERROR(cudaFree(dY)); - } - void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { @@ -766,6 +685,8 @@ struct SparseMatrix GPU_FREE(m_d_solver_val); GPU_FREE(m_d_solver_row_ptr); GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); m_reorder_allocated = false; } @@ -897,7 +818,7 @@ struct 
SparseMatrix
                                h_val_permute,
                                m_nnz * sizeof(IndexT),
                                cudaMemcpyHostToDevice));
-        
+
         detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz);
 
         free(h_val_permute);
diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh
index 4e6ab2bd..7362bf98 100644
--- a/tests/RXMesh_test/test_sparse_matrix.cuh
+++ b/tests/RXMesh_test/test_sparse_matrix.cuh
@@ -385,21 +385,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve)
         launch_box.smem_bytes_dyn>>>(
             rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step);
 
-
-    // LaunchBox test_launch_box;
-    // rxmesh.prepare_launch_box(
-    //     {Op::VV}, test_launch_box, (void*)check_diag);
-
-    // check_diag
-    //     <<>>(rxmesh.get_context(), A_mat);
-
-    // cudaDeviceSynchronize();
-
     // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS);
 
-    // A_mat.spmat_chol_test_purmute();
     A_mat.spmat_chol_reorder(Reorder::NSTDIS);
     A_mat.spmat_chol_analysis();
     A_mat.spmat_chol_buffer_alloc();
@@ -409,6 +396,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve)
         A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
     }
 
+    A_mat.spmat_chol_buffer_free();
+
     A_mat.denmat_mul(X_mat, ret_mat);
 
     std::vector h_ret_mat(num_vertices);

From b94f4b7d95e830396ad716b70a1d64ad9f23c5a1 Mon Sep 17 00:00:00 2001
From: Eric Yuan <116010276@link.cuhk.edu.cn>
Date: Mon, 3 Apr 2023 11:45:12 -0700
Subject: [PATCH 40/44] fix error

---
 include/rxmesh/matrix/sparse_matrix.cuh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index 6cce2fa7..c08c0ef2 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -348,11 +348,8 @@ struct SparseMatrix
             GPU_FREE(m_d_solver_val);
             GPU_FREE(m_d_solver_row_ptr);
             GPU_FREE(m_d_solver_col_idx);
-<<<<<<< HEAD
             GPU_FREE(m_d_permute);
             free(m_h_permute);
-=======
->>>>>>> a53ab72c7f4683e3a57a7fecd374b5e4801dcff9
         }
     }

From 4efe1e4e1d4fff81c6c38da268c042ec6a55e97e Mon Sep 17 00:00:00 2001
From: Eric Yuan <116010276@link.cuhk.edu.cn>
Date: Mon, 3 Apr 2023 22:07:49 -0700
Subject: [PATCH 41/44] add comments

---
 include/rxmesh/matrix/sparse_matrix.cuh | 29 +++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index c08c0ef2..47da78c6 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -674,6 +674,12 @@ struct SparseMatrix
 
     /* --- LOW LEVEL API --- */
 
+    /**
+     * @brief The lower level API of reordering. Specify the reordering type
+     * or simply NONE for no reordering. This should be called at the
+     * beginning of the solving process; any other call order is undefined.
+     * @param reorder: the reorder method to apply.
+     */
    void spmat_chol_reorder(rxmesh::Reorder reorder)
    {
        if (reorder == Reorder::NONE) {
@@ -832,6 +838,10 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief The lower level API of matrix analysis. Generates a member
+     * variable of type csrcholInfo_t for cusolver.
+     */
    void spmat_chol_analysis()
    {
        if (!m_use_reorder) {
@@ -852,6 +862,10 @@ struct SparseMatrix
            m_chol_info));
    }

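Taken together, patches 38-41 pin down the call order this low-level API expects, which the test further below also exercises. As an illustrative sketch only (not repository code; `A` is assumed to be an already-assembled SparseMatrix, `d_b` and `d_x` device arrays):

    // sketch: intended call order of the low-level Cholesky API
    A.spmat_chol_reorder(rxmesh::Reorder::NSTDIS);  // or Reorder::NONE
    A.spmat_chol_analysis();       // builds the csrcholInfo_t member
    A.spmat_chol_buffer_alloc();   // queries and allocates the workspace
    A.spmat_chol_factor();         // numeric factorization into the buffer
    A.spmat_chol_solve(d_b, d_x);  // repeatable while A is unchanged
    A.spmat_chol_buffer_free();    // release the workspace when done
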
+    /**
+     * @brief The lower level API of factorization buffer-size query and
+     * allocation. The buffer is a member variable.
+     */
    void spmat_chol_buffer_alloc()
    {
        if constexpr (std::is_same_v) {
@@ -883,11 +897,18 @@ struct SparseMatrix
        CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes));
    }

+    /**
+     * @brief The lower level API of factorization buffer release.
+     */
    void spmat_chol_buffer_free()
    {
        CUDA_ERROR(cudaFree(m_chol_buffer));
    }

+    /**
+     * @brief The lower level API of matrix factorization; saves the
+     * factorization result into the buffer.
+     */
    void spmat_chol_factor()
    {
        if constexpr (std::is_same_v) {
@@ -932,6 +953,14 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief The lower level API of solving the linear system after the
+     * Cholesky factorization. The format follows Ax=b, solving for x, where
+     * A is the sparse matrix and x and b are device arrays. As long as A
+     * doesn't change, this function can be called for many different b and x.
+     * @param d_b: device array of b
+     * @param d_x: device array of x
+     */
    void spmat_chol_solve(T* d_b, T* d_x)
    {

From 5503798339d6794c310353aac18320ca41f6c0c4 Mon Sep 17 00:00:00 2001
From: ericycc
Date: Tue, 11 Apr 2023 09:49:59 -0700
Subject: [PATCH 42/44] remove test code

---
 include/rxmesh/matrix/dense_matrix.cuh   | 14 +-------------
 tests/RXMesh_test/test_sparse_matrix.cuh | 16 ----------------
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh
index 66ff6de1..d618e7d4 100644
--- a/include/rxmesh/matrix/dense_matrix.cuh
+++ b/include/rxmesh/matrix/dense_matrix.cuh
@@ -7,9 +7,6 @@
 
 namespace rxmesh {
 
-// Currently this is device only
-// host/device transormation will be added
-// only col major is supported
 template
 struct DenseMatrix
 {
@@ -40,15 +37,6 @@ struct DenseMatrix
                                        CUSPARSE_ORDER_COL));
    }

-    // void set_ones()
-    // {
-    //     std::vector init_tmp_arr(m_row_size * m_col_size, 1);
-    //     CUDA_ERROR(cudaMemcpy(m_d_val,
-    //                           init_tmp_arr.data(),
-    //                           bytes() * sizeof(T),
-    //                           cudaMemcpyHostToDevice));
-    // }
-
    IndexT lead_dim() const
    {
        return m_row_size;
@@ -104,7 +92,7 @@ struct DenseMatrix
        }

        if ((location & m_allocated) == location) {
-            RXMESH_ERROR("Requested data not allocated on {}", location);
+            RXMESH_ERROR("Requested data not allocated on {}", location_to_string(location));
        }

        assert(1 != 1);
diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh
index 7362bf98..c4bd941a 100644
--- a/tests/RXMesh_test/test_sparse_matrix.cuh
+++ b/tests/RXMesh_test/test_sparse_matrix.cuh
@@ -130,22 +130,6 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context,
     query.dispatch(block, shrd_alloc, mat_setup);
 }
 
-template
-__global__ static void check_diag(const rxmesh::Context context,
-                                  rxmesh::SparseMatrix A_mat)
-{
-    using namespace rxmesh;
-    auto init = [&](VertexHandle& v_id, const VertexIterator& iter) {
-        printf("%lf\n", A_mat(v_id, v_id));
-    };
-
-    auto block = cooperative_groups::this_thread_block();
-    Query query(context);
-    ShmemAllocator shrd_alloc;
-    query.dispatch(block, shrd_alloc, init);
-}
-
-
 /* Check the access of the sparse matrix in CSR format in device */
 TEST(RXMeshStatic, SparseMatrix)
 {

From 556e2103691c6aa1a036d96426f843988d0df370 Mon Sep 17 00:00:00 2001
From: ericycc
Date: Tue, 11 Apr 2023 12:28:47 -0700
Subject: [PATCH 43/44] add comments

---
 include/rxmesh/matrix/dense_matrix.cuh  | 28 ++++++++++++++++++++++++-
 include/rxmesh/matrix/sparse_matrix.cuh | 19 ++++++++++++++---
 2 files changed, 43 insertions(+), 4 deletions(-)

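The dense_matrix.cuh half of this patch documents the col-major convention the struct commits to. As a standalone sketch of the addressing those comments describe (hypothetical helpers, ignoring the m_col_pad_idx padding; `ld` plays the role of lead_dim()):

    // col-major: element (row, col) sits at val[col * ld + row], so each
    // column is contiguous and a column pointer is just an offset into val
    template <typename T>
    T& col_major_at(T* val, int ld, int row, int col)
    {
        return val[col * ld + row];
    }

    template <typename T>
    T* col_major_col_data(T* val, int ld, int col)
    {
        return val + col * ld;  // what col_data(col) hands to the solver
    }
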
diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh
index d618e7d4..289b69df 100644
--- a/include/rxmesh/matrix/dense_matrix.cuh
+++ b/include/rxmesh/matrix/dense_matrix.cuh
@@ -7,6 +7,12 @@
 
 namespace rxmesh {
 
+/**
+ * @brief Dense matrix for use on device and host; internally it is an array.
+ * The dense matrix is initialized as col major on device.
+ * Only col major dense matrices are supported for now since that is what
+ * cusparse and cusolver expect.
+ */
 template
 struct DenseMatrix
 {
@@ -67,6 +73,9 @@ struct DenseMatrix
 #endif
    }

+    /**
+     * @brief return the raw pointer based on the location specified
+     */
    T* data(locationT location = DEVICE) const
    {
        if ((location & HOST) == HOST) {
@@ -81,6 +90,10 @@ struct DenseMatrix
        return 0;
    }

+    /**
+     * @brief return the raw pointer to a column based on the column index
+     * and the location specified
+     */
    T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const
    {
        if ((location & HOST) == HOST) {
@@ -92,18 +105,25 @@ struct DenseMatrix
        }

        if ((location & m_allocated) == location) {
-            RXMESH_ERROR("Requested data not allocated on {}", location_to_string(location));
+            RXMESH_ERROR("Requested data not allocated on {}",
+                         location_to_string(location));
        }

        assert(1 != 1);
        return 0;
    }

+    /**
+     * @brief return the total number of bytes used by the array
+     */
    IndexT bytes() const
    {
        return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T);
    }

+    /**
+     * @brief move the data between host and device
+     */
    void move(locationT source, locationT target, cudaStream_t stream = NULL)
    {
        if (source == target) {
@@ -140,6 +160,9 @@ struct DenseMatrix
        }
    }

+    /**
+     * @brief release the data on host or device
+     */
    void release(locationT location = LOCATION_ALL)
    {
        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
@@ -155,6 +178,9 @@ struct DenseMatrix
        }
    }

+    /**
+     * @brief allocate the data on host or device
+     */
    void allocate(locationT location)
    {
        if ((location & HOST) == HOST) {
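The sparse_matrix.cuh half below documents the solver, whose reorder path rests on the detail::permute_gather/permute_scatter helpers from the earlier patches. A self-contained sketch of their Thrust semantics (hypothetical demo, not repository code):

    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/gather.h>
    #include <thrust/scatter.h>
    #include <vector>

    void permute_round_trip()
    {
        std::vector<int> h_p = {2, 0, 1};  // a permutation p
        thrust::device_vector<int>   p(h_p.begin(), h_p.end());
        thrust::device_vector<float> b(3), pb(3), back(3);
        b[0] = 10.f; b[1] = 20.f; b[2] = 30.f;

        // gather: pb[i] = b[p[i]]   (how permute_gather reorders the rhs)
        thrust::gather(thrust::device, p.begin(), p.end(), b.begin(), pb.begin());
        // scatter: back[p[i]] = pb[i]  (how permute_scatter restores x);
        // scatter undoes the gather, so back ends up equal to b
        thrust::scatter(thrust::device, pb.begin(), pb.end(), p.begin(), back.begin());
    }
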
diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index 47da78c6..77e1c5fd 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -131,8 +131,13 @@ void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size)
 
 }  // namespace detail
 
-// TODO: add compatibility for EE, FF, VE......
-// TODO: purge operation?
+
+/**
+ * @brief the sparse matrix implementation and all the related functions and
+ * solvers. Right now, only VV is supported; more compatibility for
+ * EE, FF, VE...... will be added.
+ * This is device/host compatible.
+ */
 template
 struct SparseMatrix
 {
@@ -1005,7 +1010,9 @@ struct SparseMatrix
    }

    /* Host data compatibility */
-
+    /**
+     * @brief move the data between host and device
+     */
    void move(locationT source, locationT target, cudaStream_t stream = NULL)
    {
        if (source == target) {
@@ -1068,6 +1075,9 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief release the data on host or device
+     */
    void release(locationT location = LOCATION_ALL)
    {
        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
@@ -1089,6 +1099,9 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief allocate the data on host or device
+     */
    void allocate(locationT location)
    {
        if ((location & HOST) == HOST) {

From 0c126777332300fedc85ebd0905ac4365f490ef9 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud
Date: Mon, 27 Nov 2023 08:52:15 -0500
Subject: [PATCH 44/44] fix mem leak

---
 include/rxmesh/attribute.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h
index 80d7e880..cf5ba824 100644
--- a/include/rxmesh/attribute.h
+++ b/include/rxmesh/attribute.h
@@ -227,7 +227,10 @@ class Attribute : public AttributeBase
 
     Attribute(const Attribute& rhs) = default;
 
-    virtual ~Attribute() = default;
+    virtual ~Attribute()
+    {
+        free(m_name);
+    }
 
     /**
      * @brief Get the name of the attribute
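This fix assumes m_name owns a heap-allocated C string, e.g. copied with strdup/malloc at construction; that constructor is not shown in the diff, so the pairing below is an assumption. The ownership pattern the destructor completes, as a sketch:

    #include <cstdlib>
    #include <cstring>

    struct NamedThing
    {
        explicit NamedThing(const char* name)
            : m_name(strdup(name))  // heap copy owned by this object (assumed)
        {
        }

        ~NamedThing()
        {
            free(m_name);  // without this, every instance leaks its name
        }

        char* m_name;
    };

Note that with a defaulted copy constructor, copies would share m_name, so copy semantics need separate care; that detail is outside this patch.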