diff --git a/CMakeLists.txt b/CMakeLists.txt index 60ae84f8..87f45668 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,4 +172,4 @@ add_subdirectory("include") include(GoogleTest) add_subdirectory(apps) -add_subdirectory(tests) \ No newline at end of file +add_subdirectory(tests) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 2b5179e2..c5c90471 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -28,6 +28,7 @@ struct arg #include "mcf_openmesh.h" #include "mcf_rxmesh.h" +#include "mcf_sparse_matrix.cuh" TEST(App, MCF) @@ -52,7 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); + mcf_rxmesh_cg(rxmesh, ground_truth); + + // RXMesh cusolver Impl + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index a58a62df..bb2c4ab4 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; @@ -106,11 +106,11 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_init_B, (void*)init_B, - true); + !Arg.use_uniform_laplace); rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_matvec, (void*)rxmesh_matvec, - true); + !Arg.use_uniform_laplace); // init kernel to initialize RHS (B) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index daf1ed4f..4dbae51a 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -1,47 +1,227 @@ #pragma once +#include "mcf_util.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" #include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/rxmesh_static.h" +template +__global__ static void mcf_B_setup(const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::DenseMatrix B_mat, + const bool use_uniform_laplace) +{ + using namespace rxmesh; + + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { + auto r_ids = p_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; + + if (use_uniform_laplace) { + const T valence = static_cast(iter.size()); + B_mat(row_index, 0) = coords(p_id, 0) * valence; + B_mat(row_index, 1) = coords(p_id, 1) * valence; + B_mat(row_index, 2) = coords(p_id, 2) * valence; + } else { + T v_weight = 0; + + // this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); + + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; + + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + + v_weight += (tri_area > 0) ? 
tri_area : 0.0; + + q_id = r_id; + } + v_weight = 0.5 / v_weight; + + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; + } + }; + + // With uniform Laplacian, we just need the valence, thus we + // call query and set oriented to false + auto block = cooperative_groups::this_thread_block(); + + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); +} template -__global__ static void mcf_A_B_setup( - const rxmesh::Context context, - rxmesh::VertexAttribute coords, // for non-uniform - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - const bool use_uniform_laplace, // for non-uniform - const T time_step) +__global__ static void mcf_A_X_setup( + const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, + const bool use_uniform_laplace, // for non-uniform + const T time_step) { using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { T sum_e_weight(0); + T v_weight(0); - T v_weight = iter.size(); + VertexHandle q_id = iter.back(); // reference value calculation - auto r_ids = v_id.unpack(); + auto r_ids = p_id.unpack(); uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = A_mat.m_d_patch_ptr_v[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = coords(v_id, 0) * v_weight; - B_mat(row_index, 1) = coords(v_id, 1) * v_weight; - B_mat(row_index, 2) = coords(v_id, 2) * v_weight; + // set up initial X matrix + X_mat(row_index, 0) = coords(p_id, 0); + X_mat(row_index, 1) = coords(p_id, 1); + X_mat(row_index, 2) = coords(p_id, 2); - Vector<3, float> vi_coord( - coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); + // set up matrix A for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + VertexHandle r_id = iter[v]; + + T e_weight = 0; + if (use_uniform_laplace) { + e_weight = 1; + } else { + VertexHandle s_id = + (v == iter.size() - 1) ? iter[0] : iter[v + 1]; + + e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords); + e_weight = (static_cast(e_weight >= 0.0)) * e_weight; + } + e_weight *= time_step; sum_e_weight += e_weight; + + A_mat(p_id, iter[v]) = -e_weight; + + // compute vertex weight + if (use_uniform_laplace) { + ++v_weight; + } else { + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + v_weight += (tri_area > 0) ? 
tri_area : 0; + q_id = r_id; + } } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + // Diagonal entry + if (use_uniform_laplace) { + v_weight = 1.0 / v_weight; + } else { + v_weight = 0.5 / v_weight; + } + + assert(!isnan(v_weight)); + assert(!isinf(v_weight)); + + A_mat(p_id, p_id) = (1.0 / v_weight) + sum_e_weight; }; - query_block_dispatcher(context, init_lambda); + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); } + +template +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) +{ + using namespace rxmesh; + constexpr uint32_t blockThreads = 256; + + uint32_t num_vertices = rxmesh.get_num_vertices(); + auto coords = rxmesh.get_input_vertex_coordinates(); + + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + + RXMESH_INFO("use_uniform_laplace: {}, time_step: {}", + Arg.use_uniform_laplace, + Arg.time_step); + + // B set up + LaunchBox launch_box_B; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); + + mcf_B_setup<<>>( + rxmesh.get_context(), *coords, B_mat, Arg.use_uniform_laplace); + + CUDA_ERROR(cudaDeviceSynchronize()); + + // A and X set up + LaunchBox launch_box_A_X; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); + + mcf_A_X_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + Arg.use_uniform_laplace, + Arg.time_step); + + // Solving the linear system using chol factorization and no reordering + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + + X_mat.move(rxmesh::DEVICE, rxmesh::HOST); + + const T tol = 0.001; + T tmp_tol = tol; + bool passed = true; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + uint32_t v_linear_id = rxmesh.linear_id(vh); + + T a = X_mat(v_linear_id, 0); + + for (uint32_t i = 0; i < 3; ++i) { + tmp_tol = std::abs((X_mat(v_linear_id, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]); + + if (tmp_tol > tol) { + RXMESH_WARN("val: {}, truth: {}, tol: {}\n", + X_mat(v_linear_id, i), + ground_truth[v_id][i], + tmp_tol); + passed = false; + break; + } + } + }); + + EXPECT_TRUE(passed); +} \ No newline at end of file diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h index 481c3d33..6c651000 100644 --- a/include/rxmesh/attribute.h +++ b/include/rxmesh/attribute.h @@ -227,7 +227,10 @@ class Attribute : public AttributeBase Attribute(const Attribute& rhs) = default; - virtual ~Attribute() = default; + virtual ~Attribute() + { + free(m_name); + } /** * @brief Get the name of the attribute diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 41e4b81a..289b69df 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -1,93 +1,216 @@ #pragma once #include - +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/types.h" namespace rxmesh { -// Currently this is device only -// host/device transormation will be added -template +/** + * @brief dense matrix use for device and host, inside is a array. + * The dense matrix is initialized as col major on device. 
+ * We would only support col major dense matrix for now since that's what + * cusparse and cusolver wants. + */ +template struct DenseMatrix { DenseMatrix(IndexT row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size) + : m_row_size(row_size), + m_col_size(col_size), + m_dendescr(NULL), + m_allocated(LOCATION_NONE), + m_col_pad_bytes(0), + m_col_pad_idx(0) { - cudaMalloc((void**)&m_d_val, bytes()); - m_is_row_major = false; + m_allocated = m_allocated | DEVICE; + + IndexT col_data_bytes = m_row_size * sizeof(T); + if (MemAlignSize != 0 && col_data_bytes % MemAlignSize != 0) { + m_col_pad_bytes = MemAlignSize - (col_data_bytes % MemAlignSize); + m_col_pad_idx = m_col_pad_bytes / sizeof(T); + } + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } - DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) - : m_row_size(row_size), - m_col_size(col_size), - m_is_row_major(is_row_major) + IndexT lead_dim() const { - cudaMalloc((void**)&m_d_val, bytes()); + return m_row_size; } - void set_ones() + __host__ __device__ T& operator()(const uint32_t row, const uint32_t col) { - std::vector init_tmp_arr(m_row_size * m_col_size, 1); - CUDA_ERROR(cudaMemcpy(m_d_val, - init_tmp_arr.data(), - bytes() * sizeof(T), - cudaMemcpyHostToDevice)); + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; +#else + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; +#endif } - IndexT lead_dim() const + __host__ __device__ T& operator()(const uint32_t row, + const uint32_t col) const { - if (m_is_row_major) { - return m_col_size; - } else { - return m_row_size; - } + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; +#else + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; +#endif } - __device__ T& operator()(const uint32_t row, const uint32_t col) + /** + * @brief return the raw pointer based on the location specified + */ + T* data(locationT location = DEVICE) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; + if ((location & HOST) == HOST) { + return m_h_val; } + + if ((location & DEVICE) == DEVICE) { + return m_d_val; + } + + assert(1 != 1); + return 0; } - __device__ T& operator()(const uint32_t row, const uint32_t col) const + /** + * @brief return the raw pointer to columns based on column index the + * location specified and + */ + T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; + if ((location & HOST) == HOST) { + return m_h_val + ld_idx * (m_row_size + m_col_pad_idx); } + + if ((location & DEVICE) == DEVICE) { + return m_d_val + ld_idx * (m_row_size + m_col_pad_idx); + } + + if ((location & m_allocated) == location) { + RXMESH_ERROR("Requested data not allocated on {}", + location_to_string(location)); + } + + assert(1 != 1); + return 0; } - T* data() const + /** + * @brief return the total number bytes used by the array + */ + IndexT bytes() const { - return m_d_val; + return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T); } - T* ld_data(const uint32_t ld_idx) const + /** + * @brief move the data between host an 
device + */ + void move(locationT source, locationT target, cudaStream_t stream = NULL) { - return m_d_val + ld_idx * lead_dim(); + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync( + m_d_val, m_h_val, bytes(), cudaMemcpyHostToDevice, stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync( + m_h_val, m_d_val, bytes(), cudaMemcpyDeviceToHost, stream)); + } } - IndexT bytes() const + /** + * @brief release the data on host or device + */ + void release(locationT location = LOCATION_ALL) { - return m_row_size * m_col_size * sizeof(T); + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + m_allocated = m_allocated & (~DEVICE); + } } - void quick_tanspose_w_ld_trans() + /** + * @brief allocate the data on host or device + */ + void allocate(locationT location) { - std::swap(m_row_size, m_col_size); - m_is_row_major = !(m_is_row_major); + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(bytes())); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + m_allocated = m_allocated | DEVICE; + } } - bool m_is_row_major; - IndexT m_row_size; - IndexT m_col_size; - T* m_d_val; + // TODO: something like attribute->move() + + cusparseDnMatDescr_t m_dendescr; + locationT m_allocated; + IndexT m_row_size; + IndexT m_col_size; + T* m_d_val; + T* m_h_val; + + IndexT m_col_pad_bytes; + IndexT m_col_pad_idx; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh deleted file mode 100644 index b9ce48bc..00000000 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ /dev/null @@ -1,446 +0,0 @@ -#pragma once -#include "cusolverSp.h" -#include "cusparse.h" - -#include "rxmesh/attribute.h" -#include "rxmesh/matrix/dense_matrix.cuh" -#include "rxmesh/matrix/sparse_matrix.cuh" -#include "rxmesh/rxmesh_static.h" - -namespace rxmesh { - -/** - * @brief The enum class for choosing different solver types - */ -enum class Solver -{ - CHOL = 0, - LU = 1, - QR = 2 -}; - -/** - * @brief The enum class for choosing different reorder types - * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee - * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based - * on Quotient Graph, NSTDIS for Nested Dissection - */ -enum class Reorder -{ - NONE = 0, - SYMRCM = 1, - SYMAMD = 2, - NSTDIS = 3 -}; - -static int reorder_to_int(const Reorder& reorder) -{ - switch (reorder) { - case Reorder::NONE: - return 0; - case Reorder::SYMRCM: - return 1; - case Reorder::SYMAMD: - 
return 2; - case Reorder::NSTDIS: - return 3; - default: { - RXMESH_ERROR("reorder_to_int() unknown input reorder"); - return 0; - } - } -} - -/** - * @brief for transpose the dense matrix using cublas - */ -template -void denmat_transpose(rxmesh::DenseMatrix den_mat) -{ - T* d_rt_arr; - cudaMalloc(&d_rt_arr, den_mat.bytes()); - - float const alpha(1.0); - float const beta(0.0); - - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgeam(handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - den_mat.m_row_size, - den_mat.m_col_size, - &alpha, // 0 - den_mat.data(), // A_arr - den_mat.m_col_size, // ld_a - &beta, // 0 - den_mat.data(), // B_arr - den_mat.m_row_size, // ld_b - d_rt_arr, // rt_arr - den_mat.m_row_size); // ld_cm - cublasDestroy(handle); - - den_mat.m_d_val = d_rt_arr; - // TODO cont; -} - -/** - * @brief solve the Ax=b for x where x and b are all array - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - T* B_arr, - T* X_arr, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - - cusparse_linear_solver_wrapper(solver, - reorder, - handle, - cusparseHandle, - descrA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_arr, - X_arr); - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); -} - -/** - * @brief solve the AX=B for X where X and B are all dense matrix and we would - * solve it in a column wise manner - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - - for (int i = 0; i < B_mat.m_col_size; ++i) { - cusparse_linear_solver_wrapper(solver, - reorder, - handle, - cusparseHandle, - descrA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_mat.ld_data(i), - X_mat.ld_data(i)); - } - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); -} - -/** - * @brief wrap up the cusolver api for solving linear systems. 
This is a lower - * level api - */ -template -void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, - const rxmesh::Reorder reorder, - cusolverSpHandle_t handle, - cusparseHandle_t cusparseHandle, - cusparseMatDescr_t descrA, - int rowsA, - int colsA, - int nnzA, - int* d_csrRowPtrA, - int* d_csrColIndA, - T* d_csrValA, - T* d_b, - T* d_x) -{ - if constexpr ((!std::is_same_v)&&(!std::is_same_v)) { - RXMESH_ERROR( - "Unsupported type for cusparse: {}" - "Only float and double are supported", - typeid(T).name()); - } - - double tol = 1.e-12; - int singularity = 0; /* -1 if A is invertible under tol. */ - - /* solve B*z = Q*b */ - if (solver == Solver::CHOL) { - if constexpr (std::is_same_v) { - cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - if constexpr (std::is_same_v) { - cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - } else if (solver == Solver::QR) { - if constexpr (std::is_same_v) { - cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - if constexpr (std::is_same_v) { - cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - } else { - RXMESH_ERROR( - "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x for " - "Solver::LU"); - } - cudaDeviceSynchronize(); - - if (0 <= singularity) { - RXMESH_WARN("WARNING: the matrix is singular at row {} under tol ({})", - singularity, - tol); - } -} - -/** - * @brief wrap up the cusparse api for sparse matrix dense matrix - * multiplication. 
- */ -template -void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - float alpha = 1.0f; - float beta = 0.0f; - - cusparseHandle_t handle = NULL; - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; - void* dBuffer = NULL; - size_t bufferSize = 0; - - cusparseCreate(&handle); - // Create sparse matrix A in CSR format - cusparseCreateCsr(&matA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - // Create dense matrix B - cusparseCreateDnMat(&matB, - B_mat.m_row_size, - B_mat.m_col_size, - B_mat.lead_dim(), - B_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // Create dense matrix C - cusparseCreateDnMat(&matC, - C_mat.m_row_size, - C_mat.m_col_size, - C_mat.lead_dim(), - C_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // allocate an external buffer if needed - cusparseSpMM_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&dBuffer, bufferSize); - - // execute SpMM - cusparseSpMM(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - dBuffer); - - // destroy matrix/vector descriptors - cusparseDestroySpMat(matA); - cusparseDestroyDnMat(matB); - cusparseDestroyDnMat(matC); - cusparseDestroy(handle); -} - - -/** - * @brief wrap up the cusparse api for sparse matrix array - * multiplication. - */ -// only works for float -template -void spmat_arr_mul(rxmesh::SparseMatrix sp_mat, T* in_arr, T* rt_arr) -{ - const float minus_one = -1.0f; - const float one = 1.0f; - const float zero = 0.0f; - - cusparseHandle_t handle = NULL; - void* buffer = NULL; - size_t bufferSize = 0; - cusparseSpMatDescr_t sp_mat_des = NULL; - cusparseDnVecDescr_t vecx = NULL; - cusparseDnVecDescr_t vecy = NULL; - - cusparseCreate(&handle); - cusparseCreateCsr(&sp_mat_des, - sp_mat.m_row_size, - sp_mat.m_col_size, - sp_mat.m_nnz, - sp_mat.m_d_row_ptr, - sp_mat.m_d_col_idx, - sp_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - cusparseCreateDnVec(&vecx, sp_mat.m_col_size, in_arr, CUDA_R_32F); - cusparseCreateDnVec(&vecy, sp_mat.m_row_size, rt_arr, CUDA_R_32F); - - cusparseSpMV_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&buffer, bufferSize); - - cusparseSpMV(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - buffer); - - cusparseDestroySpMat(sp_mat_des); - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecy); - cusparseDestroy(handle); -} - -/** - * @brief do the sparse matrix dense matrix multiplication using sparse matrix - * array multiplication in a column wise way - */ -template -void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.ld_data(i), C_mat.ld_data(i)); - } -} - -} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 
97133fbe..77e1c5fd 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -1,11 +1,63 @@ #pragma once +#include "cusolverSp.h" +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "thrust/device_ptr.h" +#include "thrust/execution_policy.h" +#include "thrust/gather.h" +#include "thrust/scatter.h" + +#include "cusolverSp_LOWLEVEL_PREVIEW.h" +#include "rxmesh/matrix/dense_matrix.cuh" + namespace rxmesh { +/** + * @brief The enum class for choosing different solver types + */ +enum class Solver +{ + CHOL = 0, + LU = 1, + QR = 2 +}; + +/** + * @brief The enum class for choosing different reorder types + * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee + * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based + * on Quotient Graph, NSTDIS for Nested Dissection + */ +enum class Reorder +{ + NONE = 0, + SYMRCM = 1, + SYMAMD = 2, + NSTDIS = 3 +}; + +static int reorder_to_int(const Reorder& reorder) +{ + switch (reorder) { + case Reorder::NONE: + return 0; + case Reorder::SYMRCM: + return 1; + case Reorder::SYMAMD: + return 2; + case Reorder::NSTDIS: + return 3; + default: { + RXMESH_ERROR("reorder_to_int() unknown input reorder"); + return 0; + } + } +} + namespace detail { // this is the function for the CSR calculation template @@ -55,11 +107,37 @@ __global__ static void sparse_mat_col_fill(const rxmesh::Context context, query.dispatch(block, shrd_alloc, col_fillin); } +// d_out[d_p[i]] = d_in[i] +template +void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); +} + +// d_out[i] = d_in[d_p[i]] +template +void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); +} + } // namespace detail -// TODO: add compatibility for EE, FF, VE...... -// TODO: purge operation? +/** + * @brief the sparse matrix implementation and all the functions and soolvers + * related. Right now, only VV is supported and we would add more compatibility + * for EE, FF, VE...... 
+ * This is device/host compatiable + */ template struct SparseMatrix { @@ -70,7 +148,14 @@ struct SparseMatrix m_row_size(0), m_col_size(0), m_nnz(0), - m_context(rx.get_context()) + m_context(rx.get_context()), + m_cusparse_handle(NULL), + m_descr(NULL), + m_spdescr(NULL), + m_spmm_buffer_size(0), + m_spmv_buffer_size(0), + m_use_reorder(false), + m_allocated(LOCATION_NONE) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -137,7 +222,30 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function - CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + + CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); + CUSPARSE_ERROR( + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_ERROR( + cusparseSetMatIndexBase(m_descr, CUSPARSE_INDEX_BASE_ZERO)); + + CUSPARSE_ERROR(cusparseCreateCsr(&m_spdescr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + m_allocated = m_allocated | DEVICE; } void set_ones() @@ -186,20 +294,887 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } - void free() + __device__ T& operator()(const IndexT x, const IndexT y) + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __device__ T& operator()(const IndexT x, const IndexT y) const + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __host__ __device__ IndexT& get_nnz() const + { + return m_nnz; + } + + __device__ IndexT& get_row_ptr_at(IndexT idx) const + { + return m_d_row_ptr[idx]; + } + + __device__ IndexT& get_col_idx_at(IndexT idx) const + { + return m_d_col_idx[idx]; + } + + __device__ T& get_val_at(IndexT idx) const + { + return m_d_val[idx]; + } + + void free_mat() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); CUDA_ERROR(cudaFree(m_d_val)); + CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); + CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); + CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + } } - const Context m_context; - IndexT* m_d_row_ptr; - IndexT* m_d_col_idx; - T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; + /* ----- CUSPARSE SPMM & SPMV ----- */ + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication buffer size calculation. 
+ */ + void denmat_mul_buffer_size(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + // allocate an external buffer if needed + CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + &m_spmm_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication. + */ + void denmat_mul(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + // A_mat.create_cusparse_handle(); + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + // allocate an external buffer if needed + if (m_spmm_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Dense matrix multiplication buffer size not " + "initialized.", + "Calculate it now."); + denmat_mul_buffer_size(B_mat, C_mat, stream); + } + CUDA_ERROR(cudaMalloc(&dBuffer, m_spmm_buffer_size)); + + // execute SpMM + CUSPARSE_ERROR(cusparseSpMM(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + dBuffer)); + + CUDA_ERROR(cudaFree(dBuffer)); + } + + void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix array + * multiplication. + */ + void arr_mul(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + void* buffer = NULL; + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + if (m_spmv_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Array multiplication buffer size not " + "initialized." 
+ "Calculate it now."); + arr_mul_buffer_size(in_arr, rt_arr, stream); + } + + CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); + + CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + CUSPARSE_ERROR(cusparseDestroyDnVec(vecx)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecy)); + CUDA_ERROR(cudaFree(buffer)); + } + + /** + * @brief do the sparse matrix dense matrix multiplication using sparse + * matrix array multiplication in a column wise way + */ + void spmat_denmat_mul_cw(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + arr_mul(B_mat.col_data(i), C_mat.col_data(i)); + } + } + + /* ----- SOLVER ----- */ + + /* --- HIGH LEVEL API --- */ + + /** + * @brief solve the Ax=b for x where x and b are all array + */ + void spmat_linear_solve(T* B_arr, + T* X_arr, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_arr, + X_arr); + } + + /** + * @brief solve the AX=B for X where X and B are all dense matrix and we + * would solve it in a column wise manner + */ + void spmat_linear_solve(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix X_mat, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_mat.col_data(i), + X_mat.col_data(i)); + } + } + + /** + * @brief wrap up the cusolver api for solving linear systems. This is a + * lower level api + */ + void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, + const rxmesh::Reorder reorder, + cusolverSpHandle_t handle, + cusparseMatDescr_t descrA, + int rowsA, + int colsA, + int nnzA, + int* d_csrRowPtrA, + int* d_csrColIndA, + T* d_csrValA, + T* d_b, + T* d_x) + { + if constexpr ((!std::is_same_v)&&( + !std::is_same_v)) { + RXMESH_ERROR( + "Unsupported type for cusparse: {}" + "Only float and double are supported", + typeid(T).name()); + } + + double tol = 1.e-12; + int singularity = 0; /* -1 if A is invertible under tol. 
*/ + + /* solve B*z = Q*b */ + if (solver == Solver::CHOL) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + } else if (solver == Solver::QR) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + } else { + RXMESH_ERROR( + "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x " + "for " + "Solver::LU"); + } + CUDA_ERROR(cudaDeviceSynchronize()); + + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + /* --- LOW LEVEL API --- */ + + /** + * @brief The lower level api of reordering. Sprcify the reordering type or + * simply NONE for no reordering. This should be called at the begining of + * the solving process. Any other function call order would be undefined. + * @param reorder: the reorder method applied. + */ + void spmat_chol_reorder(rxmesh::Reorder reorder) + { + if (reorder == Reorder::NONE) { + RXMESH_INFO("None reordering is specified", + "Continue without reordering"); + m_use_reorder = false; + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + m_reorder_allocated = false; + } + + return; + } + + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + + m_use_reorder = true; + + // allocate the purmutated csr + m_reorder_allocated = true; + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + + m_h_permute = (IndexT*)malloc(m_row_size * sizeof(IndexT)); + CUDA_ERROR( + cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); + + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + if (reorder == Reorder::SYMRCM) { + CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::SYMAMD) { + CUSOLVER_ERROR(cusolverSpXcsrsymamdHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::NSTDIS) { + CUSOLVER_ERROR(cusolverSpXcsrmetisndHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + NULL, + m_h_permute)); + } + + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, + m_h_permute, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // working space for permutation: B = A*Q*A^T + // the permutation for matrix A which works only for the col and row + // indices, the val will be done on device with the 
d/h_val_permute + IndexT* h_val_permute = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_val_permute; + CUDA_ERROR(cudaMalloc((void**)&d_val_permute, m_nnz * sizeof(IndexT))); + + size_t size_perm = 0; + void* perm_buffer_cpu = NULL; + + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + &size_perm)); + + perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + + for (int j = 0; j < m_nnz; j++) { + h_val_permute[j] = j; + } + + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + h_val_permute, + perm_buffer_cpu)); + + + // do the permutation for val indices on host + // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); + + // for (int j = 0; j < m_nnz; j++) { + // tmp_h_val[j] = m_h_val[j]; + // } + // for (int j = 0; j < m_nnz; j++) { + // m_h_val[j] = tmp_h_val[h_val_permute[j]]; + // } + + // copy the purmutated csr from the host + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, + m_h_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // do the permutation for val indices on device + CUDA_ERROR(cudaMemcpyAsync(d_val_permute, + h_val_permute, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); + + free(h_val_permute); + GPU_FREE(d_val_permute); + + // restore the host data back to the original + if (on_host) { + move(DEVICE, HOST); + } else { + release(HOST); + } + } + + /** + * @brief The lower level api of matrix analysis. Generating a member value + * of type csrcholInfo_t for cucolver. + */ + void spmat_chol_analysis() + { + if (!m_use_reorder) { + m_d_solver_row_ptr = m_d_row_ptr; + m_d_solver_col_idx = m_d_col_idx; + m_d_solver_val = m_d_val; + } + + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); + m_internalDataInBytes = 0; + m_workspaceInBytes = 0; + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info)); + } + + /** + * @brief The lower level api of matrix factorization buffer calculation and + * allocation. The buffer is a member variable. + */ + void spmat_chol_buffer_alloc() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes)); + } + + /** + * @brief The lower level api of matrix factorization buffer release. + */ + void spmat_chol_buffer_free() + { + CUDA_ERROR(cudaFree(m_chol_buffer)); + } + + /** + * @brief The lower level api of matrix factorization and save the + * factorization result in to the buffer. 
+ */ + void spmat_chol_factor() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + m_chol_buffer)); + } + + double tol = 1.0e-8; + int singularity; + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + /** + * @brief The lower level api of solving the linear system after using + * cholesky factorization. The format follows Ax=b to solve x, where A is + * the sparse matrix, x and b are device array. As long as A doesn't change. + * This function could be called for many different b and x. + * @param d_b: device array of b + * @param d_x: device array of x + */ + void spmat_chol_solve(T* d_b, T* d_x) + { + + T* d_solver_b; + T* d_solver_x; + + if (m_use_reorder) { + /* purmute b and x*/ + CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_b, d_solver_b, m_row_size); + + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_x, d_solver_x, m_row_size); + } else { + d_solver_b = d_b; + d_solver_x = d_x; + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_solver_b, + d_solver_x, + m_chol_info, + m_chol_buffer)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_solver_b, + d_solver_x, + m_chol_info, + m_chol_buffer)); + } + + if (m_use_reorder) { + detail::permute_scatter(m_d_permute, d_solver_x, d_x, m_row_size); + GPU_FREE(d_solver_b); + GPU_FREE(d_solver_x); + } + } + + /* Host data compatibility */ + /** + * @brief move the data between host an device + */ + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "SparseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "SparseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "SparseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync(m_d_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, + m_h_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + 
} else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync(m_h_val, + m_d_val, + m_nnz * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, + m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, + m_d_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + } + } + + /** + * @brief release the data on host or device + */ + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + free(m_h_row_ptr); + free(m_h_col_idx); + m_h_val = nullptr; + m_h_row_ptr = nullptr; + m_h_col_idx = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + GPU_FREE(m_d_row_ptr); + GPU_FREE(m_d_col_idx); + m_allocated = m_allocated & (~DEVICE); + } + } + + /** + * @brief allocate the data on host or device + */ + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = + static_cast(malloc((m_row_size + 1) * sizeof(IndexT))); + m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | DEVICE; + } + } + + + private: + const Context m_context; + cusparseHandle_t m_cusparse_handle; + cusolverSpHandle_t m_cusolver_sphandle; + cusparseSpMatDescr_t m_spdescr; + cusparseMatDescr_t m_descr; + + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; + + // device csr data + IndexT* m_d_row_ptr; + IndexT* m_d_col_idx; + T* m_d_val; + + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; + + // lower level API parameters + csrcholInfo_t m_chol_info; + size_t m_internalDataInBytes; + size_t m_workspaceInBytes; + void* m_chol_buffer; + + // purmutation array + IndexT* m_h_permute; + IndexT* m_d_permute; + + // CSR matrix for solving only + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + bool m_reorder_allocated; + IndexT* m_d_solver_row_ptr; + IndexT* m_d_solver_col_idx; + T* m_d_solver_val; + + // flags + bool m_use_reorder; + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h index 8cf0e34f..0ad4be4b 100644 --- a/include/rxmesh/util/macros.h +++ b/include/rxmesh/util/macros.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include "rxmesh/util/log.h" @@ -48,6 +50,67 @@ inline void HandleError(cudaError_t err, const char* file, int line) } #define CUDA_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +inline void cusparseHandleError(cusparseStatus_t status, + const char* file, + const int line) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSPARSE ERROR: {}", + cusparseGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + 
return; +} +#define CUSPARSE_ERROR(err) (cusparseHandleError(err, __FILE__, __LINE__)) + + +static inline void cusolverHandleError(cusolverStatus_t status, + const char* file, + const int line) +{ + if (status != CUSOLVER_STATUS_SUCCESS) { + auto cusolverGetErrorString = [](cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "UNKNOWN_ERROR"; + } + }; + + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSOLVER ERROR: {}", + cusolverGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSOLVER_ERROR(err) (cusolverHandleError(err, __FILE__, __LINE__)) + + // GPU_FREE #define GPU_FREE(ptr) \ if (ptr != nullptr) { \ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 1fc4014c..c4bd941a 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -1,7 +1,8 @@ #include "gtest/gtest.h" #include "rxmesh/attribute.h" -#include "rxmesh/matrix/matrix_operation.cuh" +#include "rxmesh/matrix/dense_matrix.cuh" +#include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/query.cuh" #include "rxmesh/rxmesh_static.h" @@ -11,7 +12,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, IndexT* vet_degree) { using namespace rxmesh; - auto compute_valence = [&](VertexHandle& v_id, const VertexIterator& iter) { auto ids = v_id.unpack(); uint32_t patch_id = ids.first; @@ -26,26 +26,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, query.dispatch(block, shrd_alloc, compute_valence); } -template -__global__ static void sparse_mat_query_test( - const rxmesh::Context context, - rxmesh::SparseMatrix sparse_mat) -{ - using namespace rxmesh; - auto fillin = [&](VertexHandle& v_id, const VertexIterator& iter) { - sparse_mat(v_id, v_id) = 2; - for (uint32_t v = 0; v < iter.size(); ++v) { - sparse_mat(v_id, iter[v]) = 2; - sparse_mat(iter[v], v_id) = 2; - } - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, fillin); -} - template __global__ static void sparse_mat_edge_len_test( const rxmesh::Context context, @@ -61,8 +41,7 @@ __global__ static void sparse_mat_edge_len_test( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - sparse_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -72,9 +51,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; // dist(v_coord, vi_coord); - 
arr_ref[row_index] += dist(v_coord, vi_coord); + arr_ref[row_index] += 1; // dist(v_coord, vi_coord); } }; @@ -93,11 +72,11 @@ __global__ void spmat_multi_hardwired_kernel(T* vec, int tid = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0; if (tid < N) { - uint32_t start = sparse_mat.m_d_row_ptr[tid]; - uint32_t end = sparse_mat.m_d_row_ptr[tid + 1]; + uint32_t start = sparse_mat.get_row_ptr_at(tid); + uint32_t end = sparse_mat.get_row_ptr_at(tid + 1); for (int i = 0; i < end - start; i++) { - sum += vec[sparse_mat.m_d_col_idx[start + i]] * - sparse_mat.m_d_val[start + i]; + sum += vec[sparse_mat.get_col_idx_at(start + i)] * + sparse_mat.get_val_at(start + i); } out[tid] = sum; } @@ -122,12 +101,11 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = iter.size() * 7; - B_mat(row_index, 1) = iter.size() * 2; - B_mat(row_index, 2) = iter.size() * 10; + B_mat(row_index, 0) = iter.size() * 7.4f; + B_mat(row_index, 1) = iter.size() * 2.6f; + B_mat(row_index, 2) = iter.size() * 10.3f; X_mat(row_index, 0) = coords(v_id, 0) * v_weight; X_mat(row_index, 1) = coords(v_id, 1) * v_weight; @@ -137,12 +115,13 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + A_mat(v_id, iter[v]) = time_step * e_weight; sum_e_weight += e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + + iter.size() * iter.size() + 1000000; }; auto block = cooperative_groups::this_thread_block(); @@ -151,6 +130,7 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +/* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -197,6 +177,7 @@ TEST(RXMeshStatic, SparseMatrix) rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_test); + // test kernel sparse_mat_test << spmat(rxmesh); - spmat.set_ones(); - - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)sparse_mat_query_test); - - sparse_mat_query_test - <<>>(rxmesh.get_context(), spmat); - - std::vector h_result(spmat.m_nnz); - CUDA_ERROR(cudaMemcpy(h_result.data(), - spmat.m_d_val, - spmat.m_nnz * sizeof(int), - cudaMemcpyDeviceToHost)); - - std::vector h_ref(spmat.m_nnz, 2); - - for (int i = 0; i < spmat.m_nnz; ++i) { - EXPECT_EQ(h_result[i], h_ref[i]); - } - - spmat.free(); + spmat.free_mat(); } +/* First replace the sparse matrix entry with the edge length and then do spmv + * with an all one array and check the result + */ TEST(RXMeshStatic, SparseMatrixEdgeLen) { using namespace rxmesh; @@ -268,8 +209,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); + RXMeshStatic rxmesh(rxmesh_args.obj_file_name, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -304,11 +244,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix multiply - - 
// spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); - spmat_arr_mul(spmat, d_arr_ones, d_result); + spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -330,9 +266,13 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) CUDA_ERROR(cudaFree(d_arr_ref)); CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); - spmat.free(); + spmat.free_mat(); } +/* set up a simple AX=B system where A is a sparse matrix, B and C are dense + * matrix. Solve it using the warpped up cusolver API and check the final AX + * with B using warpped up cusparse API. + */ TEST(RXMeshStatic, SparseMatrixSimpleSolve) { using namespace rxmesh; @@ -341,8 +281,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; - RXMeshStatic rxmesh(obj_path); + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -366,21 +306,99 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + // timing begins for spmm + GPUTimer timer; + timer.start(); - spmat_denmat_mul(A_mat, X_mat, ret_mat); - // spmat_denmat_mul_cw(A_mat, X_mat, ret_mat); + A_mat.denmat_mul(X_mat, ret_mat); - std::vector h_ret_mat(num_vertices * 3); - cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3, - cudaMemcpyDeviceToHost); - std::vector h_B_mat(num_vertices * 3); - cudaMemcpy( - h_B_mat.data(), B_mat.data(), num_vertices * 3, cudaMemcpyDeviceToHost); + timer.stop(); + RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); - for (uint32_t i = 0; i < num_vertices * 3; ++i) { - EXPECT_NEAR(h_ret_mat[i], h_B_mat[i], 1e-3); + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free_mat(); +} + +TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) +{ + using namespace rxmesh; + + // Select device + cuda_query(0); + + // generate rxmesh obj + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); + + uint32_t num_vertices = rxmesh.get_num_vertices(); + + const uint32_t threads = 256; + const uint32_t blocks = DIVIDE_UP(num_vertices, threads); + + auto coords = rxmesh.get_input_vertex_coordinates(); + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + DenseMatrix ret_mat(num_vertices, 3); + + float time_step = 1.f; + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)simple_A_X_B_setup); + + simple_A_X_B_setup<<>>( + rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + 
A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.spmat_chol_buffer_free(); + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } } + A_mat.free_mat(); } \ No newline at end of file
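
Usage note (reviewer sketch, not part of the patch): the snippet below condenses the end-to-end flow that mcf_rxmesh_cusolver_chol in apps/MCF/mcf_sparse_matrix.cuh follows with the new high-level API — assemble A and B in a VV-query kernel, solve through the cuSOLVER Cholesky wrapper, and move the result to the host. The mesh path and the fill kernel are placeholders; only the SparseMatrix/DenseMatrix calls are taken from this diff.

    using namespace rxmesh;

    RXMeshStatic rx("mesh.obj");               // placeholder input mesh
    uint32_t n = rx.get_num_vertices();

    SparseMatrix<float> A(rx);                 // VV sparsity pattern of the mesh
    DenseMatrix<float>  B(n, 3), X(n, 3);

    // ... launch a VV-query kernel (prepare_launch_box + Query::dispatch) that
    // fills A and B, e.g. the mcf_A_X_setup / mcf_B_setup kernels in this patch ...

    // high-level solve: Cholesky factorization with nested-dissection reordering
    A.spmat_linear_solve(B, X, Solver::CHOL, Reorder::NSTDIS);

    X.move(DEVICE, HOST);                      // X(row, col) is now readable on the host
    A.free_mat();
    X.release();
    B.release();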
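
For repeated solves against the same matrix, the lower-level Cholesky API added to sparse_matrix.cuh factors A once and reuses the factorization for each right-hand side, mirroring TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve); a minimal sketch assuming the A, B, X objects from the snippet above:

    A.spmat_chol_reorder(Reorder::NSTDIS);   // or Reorder::NONE to solve unpermuted
    A.spmat_chol_analysis();
    A.spmat_chol_buffer_alloc();
    A.spmat_chol_factor();

    // the factorization is reused for every column of B
    for (int i = 0; i < B.m_col_size; ++i) {
        A.spmat_chol_solve(B.col_data(i), X.col_data(i));
    }

    A.spmat_chol_buffer_free();

    // optional check, as in the tests: ret = A * X should reproduce B
    DenseMatrix<float> ret(n, 3);
    A.denmat_mul(X, ret);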