From 58b4c545d226bc11d10a96de9640fef8dcf7bdc6 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Wed, 11 Jan 2023 22:39:51 -0800 Subject: [PATCH 01/44] update --- CMakeLists.txt | 2 ++ apps/SparseMatrix/dense_matrix.cuh | 10 +++++----- apps/SparseMatrix/matrix_operation.cuh | 6 ++++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2f90908b..a517dcd0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,8 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") +set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/SparseMatrix/dense_matrix.cuh b/apps/SparseMatrix/dense_matrix.cuh index 8713eef6..228582d2 100644 --- a/apps/SparseMatrix/dense_matrix.cuh +++ b/apps/SparseMatrix/dense_matrix.cuh @@ -79,11 +79,11 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - void quick_tanspose_w_ld_trans() - { - std::swap(m_row_size, m_col_size); - m_is_row_major = !(m_is_row_major); - } + // void quick_tanspose_w_ld_trans() + // { + // std::swap(m_row_size, m_col_size); + // m_is_row_major = !(m_is_row_major); + // } bool m_is_row_major; IndexT m_row_size; diff --git a/apps/SparseMatrix/matrix_operation.cuh b/apps/SparseMatrix/matrix_operation.cuh index 2dd54cd3..4153935f 100644 --- a/apps/SparseMatrix/matrix_operation.cuh +++ b/apps/SparseMatrix/matrix_operation.cuh @@ -99,7 +99,9 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, T* B_arr, T* X_arr, rxmesh::Solver solver, - rxmesh::Reorder reorder) + rxmesh::Reorder reorder + //cudaStream_t stream = null + ) { cusolverSpHandle_t handle = NULL; cusparseHandle_t cusparseHandle = NULL; @@ -326,7 +328,7 @@ void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, cusparseCreateDnMat(&matB, B_mat.m_row_size, B_mat.m_col_size, - B_mat.lead_dim(), + B_mat.lead_dim(), // lead_dim < row_size B_mat.data(), CUDA_R_32F, CUSPARSE_ORDER_COL); From 0b16207d54e3a46637b1ebcb754108e8f90b859c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Thu, 12 Jan 2023 00:17:30 -0800 Subject: [PATCH 02/44] col major --- CMakeLists.txt | 1 + include/rxmesh/matrix/dense_matrix.cuh | 29 ++++------------------ include/rxmesh/matrix/matrix_operation.cuh | 6 ++--- 3 files changed, 9 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a517dcd0..77844bf4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,6 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) + project(RXMesh VERSION 0.2.1 LANGUAGES C CXX CUDA) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index b48d1a7f..fba446fe 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -9,6 +9,7 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added +// only col major is supportedF template struct DenseMatrix { @@ -16,7 +17,6 @@ struct DenseMatrix : m_row_size(row_size), m_col_size(col_size) { cudaMalloc((void**)&m_d_val, bytes()); - m_is_row_major = false; } DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) @@ -38,29 +38,17 @@ struct DenseMatrix IndexT lead_dim() const { - if (m_is_row_major) { - return m_col_size; - } else { - return m_row_size; - } + return m_row_size; } __device__ T& operator()(const uint32_t row, const uint32_t col) { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + 
row]; - } + return m_d_val[col * m_row_size + row]; } __device__ T& operator()(const uint32_t row, const uint32_t col) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; - } + return m_d_val[col * m_row_size + row]; } T* data() const @@ -68,7 +56,7 @@ struct DenseMatrix return m_d_val; } - T* ld_data(const uint32_t ld_idx) const + T* col_data(const uint32_t ld_idx) const { return m_d_val + ld_idx * lead_dim(); } @@ -78,13 +66,6 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - // void quick_tanspose_w_ld_trans() - // { - // std::swap(m_row_size, m_col_size); - // m_is_row_major = !(m_is_row_major); - // } - - bool m_is_row_major; IndexT m_row_size; IndexT m_col_size; T* m_d_val; diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh index b8575992..7e70857d 100644 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ b/include/rxmesh/matrix/matrix_operation.cuh @@ -171,8 +171,8 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, A_mat.m_d_row_ptr, A_mat.m_d_col_idx, A_mat.m_d_val, - B_mat.ld_data(i), - X_mat.ld_data(i)); + B_mat.col_data(i), + X_mat.col_data(i)); } cusolverSpDestroy(handle); @@ -441,7 +441,7 @@ void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, rxmesh::DenseMatrix C_mat) { for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.ld_data(i), C_mat.ld_data(i)); + spmat_arr_mul(A_mat, B_mat.col_data(i), C_mat.col_data(i)); } } From e418fab6e193e2e09ecce27e87bcfde054d09b6f Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Thu, 12 Jan 2023 00:38:30 -0800 Subject: [PATCH 03/44] col-wise fix --- CMakeLists.txt | 3 +++ include/rxmesh/matrix/dense_matrix.cuh | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 77844bf4..f126b332 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75) +endif() project(RXMesh VERSION 0.2.1 diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index fba446fe..6940b495 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -21,8 +21,7 @@ struct DenseMatrix DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) : m_row_size(row_size), - m_col_size(col_size), - m_is_row_major(is_row_major) + m_col_size(col_size) { cudaMalloc((void**)&m_d_val, bytes()); } From f3c84d6068cca929c67cb1d9685dce4b5186deec Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 3 Feb 2023 21:27:36 -0800 Subject: [PATCH 04/44] stable version --- include/rxmesh/matrix/dense_matrix.cuh | 26 +- include/rxmesh/matrix/matrix_operation.cuh | 334 ++++----------------- include/rxmesh/matrix/sparse_matrix.cuh | 238 ++++++++++++++- include/rxmesh/util/macros.h | 63 ++++ tests/RXMesh_test/test_sparse_matrix.cuh | 73 +++-- 5 files changed, 417 insertions(+), 317 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 6940b495..66b403a5 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -1,6 +1,6 @@ #pragma once #include - +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/types.h" @@ -14,16 +14,17 @@ template struct DenseMatrix { DenseMatrix(IndexT 
row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size) + : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL) { - cudaMalloc((void**)&m_d_val, bytes()); - } + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); - DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) - : m_row_size(row_size), - m_col_size(col_size) - { - cudaMalloc((void**)&m_d_val, bytes()); + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } void set_ones() @@ -65,9 +66,10 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } - IndexT m_row_size; - IndexT m_col_size; - T* m_d_val; + cusparseDnMatDescr_t m_dendescr; + IndexT m_row_size; + IndexT m_col_size; + T* m_d_val; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh index 7e70857d..0ab008a7 100644 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ b/include/rxmesh/matrix/matrix_operation.cuh @@ -1,6 +1,4 @@ #pragma once -#include "cusolverSp.h" -#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" @@ -51,39 +49,6 @@ static int reorder_to_int(const Reorder& reorder) } } -/** - * @brief for transpose the dense matrix using cublas - */ -template -void denmat_transpose(rxmesh::DenseMatrix den_mat) -{ - T* d_rt_arr; - cudaMalloc(&d_rt_arr, den_mat.bytes()); - - float const alpha(1.0); - float const beta(0.0); - - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgeam(handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - den_mat.m_row_size, - den_mat.m_col_size, - &alpha, // 0 - den_mat.data(), // A_arr - den_mat.m_col_size, // ld_a - &beta, // 0 - den_mat.data(), // B_arr - den_mat.m_row_size, // ld_b - d_rt_arr, // rt_arr - den_mat.m_row_size); // ld_cm - cublasDestroy(handle); - - den_mat.m_d_val = d_rt_arr; - // TODO cont; -} - /** * @brief solve the Ax=b for x where x and b are all array */ @@ -92,31 +57,13 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, T* B_arr, T* X_arr, rxmesh::Solver solver, - rxmesh::Reorder reorder - //cudaStream_t stream = null - ) + rxmesh::Reorder reorder) { - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - + A_mat.create_cusolver_sphandle(); cusparse_linear_solver_wrapper(solver, reorder, - handle, - cusparseHandle, - descrA, + A_mat.m_cusolver_sphandles, + A_mat.m_descr, A_mat.m_row_size, A_mat.m_col_size, A_mat.m_nnz, @@ -125,11 +72,6 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, A_mat.m_d_val, B_arr, X_arr); - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); } /** @@ -143,28 +85,12 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, rxmesh::Solver solver, rxmesh::Reorder reorder) { - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, 
stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - + // A_mat.create_cusolver_sphandle(); for (int i = 0; i < B_mat.m_col_size; ++i) { cusparse_linear_solver_wrapper(solver, reorder, - handle, - cusparseHandle, - descrA, + A_mat.m_cusolver_sphandle, + A_mat.m_descr, A_mat.m_row_size, A_mat.m_col_size, A_mat.m_nnz, @@ -174,11 +100,6 @@ void spmat_linear_solve(rxmesh::SparseMatrix A_mat, B_mat.col_data(i), X_mat.col_data(i)); } - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); } /** @@ -189,7 +110,6 @@ template void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, const rxmesh::Reorder reorder, cusolverSpHandle_t handle, - cusparseHandle_t cusparseHandle, cusparseMatDescr_t descrA, int rowsA, int colsA, @@ -213,64 +133,64 @@ void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, /* solve B*z = Q*b */ if (solver == Solver::CHOL) { if constexpr (std::is_same_v) { - cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } if constexpr (std::is_same_v) { - cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } } else if (solver == Solver::QR) { if constexpr (std::is_same_v) { - cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } if constexpr (std::is_same_v) { - cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); } } else { RXMESH_ERROR( @@ -286,150 +206,6 @@ void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, } } -/** - * @brief wrap up the cusparse api for sparse matrix dense matrix - * multiplication. 
- */ -template -void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - float alpha = 1.0f; - float beta = 0.0f; - - cusparseHandle_t handle = NULL; - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; - void* dBuffer = NULL; - size_t bufferSize = 0; - - cusparseCreate(&handle); - // Create sparse matrix A in CSR format - cusparseCreateCsr(&matA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - // Create dense matrix B - cusparseCreateDnMat(&matB, - B_mat.m_row_size, - B_mat.m_col_size, - B_mat.lead_dim(), // lead_dim < row_size - B_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // Create dense matrix C - cusparseCreateDnMat(&matC, - C_mat.m_row_size, - C_mat.m_col_size, - C_mat.lead_dim(), - C_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // allocate an external buffer if needed - cusparseSpMM_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&dBuffer, bufferSize); - - // execute SpMM - cusparseSpMM(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - dBuffer); - - // destroy matrix/vector descriptors - cusparseDestroySpMat(matA); - cusparseDestroyDnMat(matB); - cusparseDestroyDnMat(matC); - cusparseDestroy(handle); -} - - -/** - * @brief wrap up the cusparse api for sparse matrix array - * multiplication. - */ -// only works for float -template -void spmat_arr_mul(rxmesh::SparseMatrix sp_mat, T* in_arr, T* rt_arr) -{ - const float minus_one = -1.0f; - const float one = 1.0f; - const float zero = 0.0f; - - cusparseHandle_t handle = NULL; - void* buffer = NULL; - size_t bufferSize = 0; - cusparseSpMatDescr_t sp_mat_des = NULL; - cusparseDnVecDescr_t vecx = NULL; - cusparseDnVecDescr_t vecy = NULL; - - cusparseCreate(&handle); - cusparseCreateCsr(&sp_mat_des, - sp_mat.m_row_size, - sp_mat.m_col_size, - sp_mat.m_nnz, - sp_mat.m_d_row_ptr, - sp_mat.m_d_col_idx, - sp_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - cusparseCreateDnVec(&vecx, sp_mat.m_col_size, in_arr, CUDA_R_32F); - cusparseCreateDnVec(&vecy, sp_mat.m_row_size, rt_arr, CUDA_R_32F); - - cusparseSpMV_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&buffer, bufferSize); - - cusparseSpMV(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - buffer); - - cusparseDestroySpMat(sp_mat_des); - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecy); - cusparseDestroy(handle); -} /** * @brief do the sparse matrix dense matrix multiplication using sparse matrix diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 97133fbe..2f6239cb 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -1,4 +1,6 @@ #pragma once +#include "cusolverSp.h" +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/query.cuh" @@ -70,7 +72,11 @@ struct SparseMatrix m_row_size(0), 
m_col_size(0), m_nnz(0), - m_context(rx.get_context()) + m_context(rx.get_context()), + m_cusparse_handle(NULL), + m_descr(NULL), + m_spdescr(NULL), + m_spmm_buffer_size(0) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -138,6 +144,27 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + + CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); + CUSPARSE_ERROR( + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_ERROR( + cusparseSetMatIndexBase(m_descr, CUSPARSE_INDEX_BASE_ZERO)); + + CUSPARSE_ERROR(cusparseCreateCsr(&m_spdescr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); } void set_ones() @@ -186,20 +213,215 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } + __device__ T& direct_access(IndexT x, IndexT y) + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + void free() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); CUDA_ERROR(cudaFree(m_d_val)); + CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); + CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); + CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + } + + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication buffer size calculation. + */ + void denmat_mul_buffer_size(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + cusparseSetStream(m_cusparse_handle, stream); + + // allocate an external buffer if needed + CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + &m_spmm_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication. 
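* Computes C_mat = A * B_mat through cusparseSpMM with alpha = 1 and
+     * beta = 0; the scratch buffer is allocated and freed on every call.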
+ */ + void denmat_mul(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + // A_mat.create_cusparse_handle(); + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + cusparseSetStream(m_cusparse_handle, stream); + + // allocate an external buffer if needed + if (m_spmm_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Dense matrix multiplication buffer size not " + "initialized.", + "Calculate it now."); + denmat_mul_buffer_size(B_mat, C_mat, stream); + } + CUDA_ERROR(cudaMalloc(&dBuffer, m_spmm_buffer_size)); + + // execute SpMM + CUSPARSE_ERROR(cusparseSpMM(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + dBuffer)); + + CUDA_ERROR(cudaFree(dBuffer)); } - const Context m_context; - IndexT* m_d_row_ptr; - IndexT* m_d_col_idx; - T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; + // void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + // { + // const float alpha = 1.0f; + // const float beta = 0.0f; + + // cusparseDnVecDescr_t vecx = NULL; + // cusparseDnVecDescr_t vecy = NULL; + + // printf("check\n"); + + // CUSPARSE_ERROR( + // cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + // CUSPARSE_ERROR( + // cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + // printf("check\n"); + + // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + // CUSPARSE_OPERATION_NON_TRANSPOSE, + // &alpha, + // m_spdescr, + // vecx, + // &beta, + // vecy, + // CUDA_R_32F, + // CUSPARSE_SPMV_ALG_DEFAULT, + // &m_spmv_buffer_size)); + // printf("check\n"); + // } + + /** + * @brief wrap up the cusparse api for sparse matrix array + * multiplication. 
+ */ + void arr_mul(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + printf("check\n"); + + size_t m_spmv_buffer_size = 0; + + printf("check\n"); + + void* buffer = NULL; + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + printf("check\n"); + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + cusparseSetStream(m_cusparse_handle, stream); + + // if (m_spmv_buffer_size == 0) { + // RXMESH_WARN( + // "Sparse matrix - Array multiplication buffer size not " + // "initialized.", + // "Calculate it now."); + // arr_mul_buffer_size(in_arr, rt_arr, stream); + // } + + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); + + CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + CUSPARSE_ERROR(cusparseDestroyDnVec(vecx)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecy)); + CUDA_ERROR(cudaFree(buffer)); + } + + const Context m_context; + cusparseHandle_t m_cusparse_handle; + cusolverSpHandle_t m_cusolver_sphandle; + cusparseSpMatDescr_t m_spdescr; + cusparseMatDescr_t m_descr; + + size_t m_spmm_buffer_size; + + IndexT* m_d_row_ptr; + IndexT* m_d_col_idx; + T* m_d_val; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; }; } // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h index 964bd974..fd630bf9 100644 --- a/include/rxmesh/util/macros.h +++ b/include/rxmesh/util/macros.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include "rxmesh/util/log.h" @@ -49,6 +51,67 @@ inline void HandleError(cudaError_t err, const char* file, int line) } #define CUDA_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +inline void cusparseHandleError(cusparseStatus_t status, + const char* file, + const int line) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSPARSE ERROR: {}", + cusparseGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSPARSE_ERROR(err) (cusparseHandleError(err, __FILE__, __LINE__)) + + +static inline void cusolverHandleError(cusolverStatus_t status, + const char* file, + const int line) +{ + if (status != CUSOLVER_STATUS_SUCCESS) { + auto cusolverGetErrorString = [](cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "UNKNOWN_ERROR"; + } + }; + + 
Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSOLVER ERROR: {}", + cusolverGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSOLVER_ERROR(err) (cusolverHandleError(err, __FILE__, __LINE__)) + + // GPU_FREE #define GPU_FREE(ptr) \ if (ptr != nullptr) { \ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 1fc4014c..e71621bd 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -72,9 +72,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; //dist(v_coord, vi_coord); - arr_ref[row_index] += dist(v_coord, vi_coord); + arr_ref[row_index] += 1; //dist(v_coord, vi_coord); } }; @@ -137,12 +137,13 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + A_mat(v_id, iter[v]) = time_step * e_weight; sum_e_weight += e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + + iter.size() * iter.size() + 1000000; }; auto block = cooperative_groups::this_thread_block(); @@ -151,6 +152,20 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +template +__global__ static void print_diag(rxmesh::SparseMatrix A_mat, int num_vet) +{ + int t_idx = threadIdx.x + blockDim.x * blockIdx.x; + if (t_idx == 0) { + + for (int i = 0; i < num_vet; ++i) { + if (A_mat.direct_access(i, i) < 1) { + printf("A_mat diag %d: %f \n", i, A_mat.direct_access(i, i)); + } + } + } +} + TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -292,9 +307,13 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) float* d_arr_ref; float* d_result; + printf("check1\n"); + CUDA_ERROR(cudaMalloc((void**)&d_arr_ref, (num_vertices) * sizeof(float))); CUDA_ERROR(cudaMalloc((void**)&d_result, (num_vertices) * sizeof(float))); + printf("check2\n"); + LaunchBox launch_box; rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_edge_len_test); @@ -304,11 +323,12 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix multiply + printf("check3\n"); - // spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); - spmat_arr_mul(spmat, d_arr_ones, d_result); + // Spmat matrix array multiply + spmat_multi_hardwired_kernel + <<>>(d_arr_ones, spmat, d_result, num_vertices); + // spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -341,7 +361,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; + std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; // STRINGIFY(INPUT_DIR) "dragon.obj"; + // "/home/ericycc/data/120628.obj"; RXMeshStatic rxmesh(obj_path); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -366,21 +387,37 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + print_diag<<<1, 1>>>(A_mat, 
num_vertices); + spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); - spmat_denmat_mul(A_mat, X_mat, ret_mat); - // spmat_denmat_mul_cw(A_mat, X_mat, ret_mat); + // timing begins for spmm + GPUTimer timer; + timer.start(); + + A_mat.denmat_mul(X_mat, ret_mat); + + timer.stop(); + RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); - std::vector h_ret_mat(num_vertices * 3); + std::vector h_ret_mat(num_vertices); cudaMemcpy(h_ret_mat.data(), ret_mat.data(), - num_vertices * 3, + num_vertices * 3 * sizeof(float), cudaMemcpyDeviceToHost); - std::vector h_B_mat(num_vertices * 3); - cudaMemcpy( - h_B_mat.data(), B_mat.data(), num_vertices * 3, cudaMemcpyDeviceToHost); + std::vector h_B_mat(num_vertices); + cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost); + + // auto ps_mesh = rxmesh.get_polyscope_mesh(); + // ps_mesh->addVertexColorQuantity("ret_mat", h_ret_mat); + // ps_mesh->addVertexColorQuantity("B_mat", h_B_mat); - for (uint32_t i = 0; i < num_vertices * 3; ++i) { - EXPECT_NEAR(h_ret_mat[i], h_B_mat[i], 1e-3); + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } } } \ No newline at end of file From 2574ca8b5b8d608f9b355f6a0d8e79c90278d749 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 3 Feb 2023 23:49:18 -0800 Subject: [PATCH 05/44] merge spmv --- include/rxmesh/matrix/sparse_matrix.cuh | 94 +++++++++++------------- tests/RXMesh_test/test_sparse_matrix.cuh | 13 +--- 2 files changed, 46 insertions(+), 61 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 2f6239cb..f8a32b80 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -76,7 +76,8 @@ struct SparseMatrix m_cusparse_handle(NULL), m_descr(NULL), m_spdescr(NULL), - m_spmm_buffer_size(0) + m_spmm_buffer_size(0), + m_spmv_buffer_size(0) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -314,35 +315,30 @@ struct SparseMatrix CUDA_ERROR(cudaFree(dBuffer)); } - // void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) - // { - // const float alpha = 1.0f; - // const float beta = 0.0f; - - // cusparseDnVecDescr_t vecx = NULL; - // cusparseDnVecDescr_t vecy = NULL; - - // printf("check\n"); + void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; - // CUSPARSE_ERROR( - // cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); - // CUSPARSE_ERROR( - // cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; - // printf("check\n"); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); - // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - // CUSPARSE_OPERATION_NON_TRANSPOSE, - // &alpha, - // m_spdescr, - // vecx, - // &beta, - // vecy, - // CUDA_R_32F, - // CUSPARSE_SPMV_ALG_DEFAULT, - // &m_spmv_buffer_size)); - // printf("check\n"); - // } + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + } /** * @brief wrap up the cusparse api for sparse matrix array @@ 
-353,18 +349,10 @@ struct SparseMatrix const float alpha = 1.0f; const float beta = 0.0f; - printf("check\n"); - - size_t m_spmv_buffer_size = 0; - - printf("check\n"); - void* buffer = NULL; cusparseDnVecDescr_t vecx = NULL; cusparseDnVecDescr_t vecy = NULL; - printf("check\n"); - CUSPARSE_ERROR( cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); CUSPARSE_ERROR( @@ -372,24 +360,25 @@ struct SparseMatrix cusparseSetStream(m_cusparse_handle, stream); - // if (m_spmv_buffer_size == 0) { - // RXMESH_WARN( - // "Sparse matrix - Array multiplication buffer size not " - // "initialized.", - // "Calculate it now."); - // arr_mul_buffer_size(in_arr, rt_arr, stream); - // } + if (m_spmv_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Array multiplication buffer size not " + "initialized." + "Calculate it now."); + arr_mul_buffer_size(in_arr, rt_arr, stream); + } + + // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + // CUSPARSE_OPERATION_NON_TRANSPOSE, + // &alpha, + // m_spdescr, + // vecx, + // &beta, + // vecy, + // CUDA_R_32F, + // CUSPARSE_SPMV_ALG_DEFAULT, + // &m_spmv_buffer_size)); - CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - m_spdescr, - vecx, - &beta, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &m_spmv_buffer_size)); CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, @@ -415,6 +404,7 @@ struct SparseMatrix cusparseMatDescr_t m_descr; size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index e71621bd..3f4c4a09 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -307,13 +307,9 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) float* d_arr_ref; float* d_result; - printf("check1\n"); - CUDA_ERROR(cudaMalloc((void**)&d_arr_ref, (num_vertices) * sizeof(float))); CUDA_ERROR(cudaMalloc((void**)&d_result, (num_vertices) * sizeof(float))); - printf("check2\n"); - LaunchBox launch_box; rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_edge_len_test); @@ -323,12 +319,10 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - printf("check3\n"); - // Spmat matrix array multiply - spmat_multi_hardwired_kernel - <<>>(d_arr_ones, spmat, d_result, num_vertices); - // spmat.arr_mul(d_arr_ones, d_result); + // spmat_multi_hardwired_kernel + // <<>>(d_arr_ones, spmat, d_result, num_vertices); + spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -420,4 +414,5 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } + A_mat.free(); } \ No newline at end of file From 19c5cbc1486c56be81d46f42d2846c55ba9c8848 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 10 Feb 2023 14:32:36 -0800 Subject: [PATCH 06/44] debug version --- CMakeLists.txt | 1 - apps/MCF/mcf.cu | 1 + apps/MCF/mcf_sparse_matrix.cuh | 7 +- include/rxmesh/matrix/matrix_operation.cuh | 224 --------------------- include/rxmesh/matrix/sparse_matrix.cuh | 220 +++++++++++++++++++- tests/RXMesh_test/test_sparse_matrix.cuh | 21 +- 6 files changed, 224 insertions(+), 250 deletions(-) delete mode 100644 include/rxmesh/matrix/matrix_operation.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 
f126b332..e10a467d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 02c5355f..49ceb94d 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -28,6 +28,7 @@ struct arg #include "mcf_openmesh.h" #include "mcf_rxmesh.h" +#include "mcf_sparse_matrix.cuh" TEST(App, MCF) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index daf1ed4f..2193b965 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -43,5 +43,10 @@ __global__ static void mcf_A_B_setup( A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; }; - query_block_dispatcher(context, init_lambda); + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, compute_valence); } + +void mcf_rxmesh_solver() {} \ No newline at end of file diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh deleted file mode 100644 index 0ab008a7..00000000 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ /dev/null @@ -1,224 +0,0 @@ -#pragma once - -#include "rxmesh/attribute.h" -#include "rxmesh/matrix/dense_matrix.cuh" -#include "rxmesh/matrix/sparse_matrix.cuh" -#include "rxmesh/rxmesh_static.h" - -namespace rxmesh { - -/** - * @brief The enum class for choosing different solver types - */ -enum class Solver -{ - CHOL = 0, - LU = 1, - QR = 2 -}; - -/** - * @brief The enum class for choosing different reorder types - * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee - * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based - * on Quotient Graph, NSTDIS for Nested Dissection - */ -enum class Reorder -{ - NONE = 0, - SYMRCM = 1, - SYMAMD = 2, - NSTDIS = 3 -}; - -static int reorder_to_int(const Reorder& reorder) -{ - switch (reorder) { - case Reorder::NONE: - return 0; - case Reorder::SYMRCM: - return 1; - case Reorder::SYMAMD: - return 2; - case Reorder::NSTDIS: - return 3; - default: { - RXMESH_ERROR("reorder_to_int() unknown input reorder"); - return 0; - } - } -} - -/** - * @brief solve the Ax=b for x where x and b are all array - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - T* B_arr, - T* X_arr, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - A_mat.create_cusolver_sphandle(); - cusparse_linear_solver_wrapper(solver, - reorder, - A_mat.m_cusolver_sphandles, - A_mat.m_descr, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_arr, - X_arr); -} - -/** - * @brief solve the AX=B for X where X and B are all dense matrix and we would - * solve it in a column wise manner - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - // A_mat.create_cusolver_sphandle(); - for (int i = 0; i < B_mat.m_col_size; ++i) { - cusparse_linear_solver_wrapper(solver, - reorder, - A_mat.m_cusolver_sphandle, - A_mat.m_descr, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_mat.col_data(i), - X_mat.col_data(i)); - } -} - -/** - * @brief wrap up the cusolver api for solving linear systems. 
This is a lower - * level api - */ -template -void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, - const rxmesh::Reorder reorder, - cusolverSpHandle_t handle, - cusparseMatDescr_t descrA, - int rowsA, - int colsA, - int nnzA, - int* d_csrRowPtrA, - int* d_csrColIndA, - T* d_csrValA, - T* d_b, - T* d_x) -{ - if constexpr ((!std::is_same_v)&&(!std::is_same_v)) { - RXMESH_ERROR( - "Unsupported type for cusparse: {}" - "Only float and double are supported", - typeid(T).name()); - } - - double tol = 1.e-12; - int singularity = 0; /* -1 if A is invertible under tol. */ - - /* solve B*z = Q*b */ - if (solver == Solver::CHOL) { - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - } else if (solver == Solver::QR) { - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - - if constexpr (std::is_same_v) { - CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity)); - } - } else { - RXMESH_ERROR( - "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x for " - "Solver::LU"); - } - cudaDeviceSynchronize(); - - if (0 <= singularity) { - RXMESH_WARN("WARNING: the matrix is singular at row {} under tol ({})", - singularity, - tol); - } -} - - -/** - * @brief do the sparse matrix dense matrix multiplication using sparse matrix - * array multiplication in a column wise way - */ -template -void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.col_data(i), C_mat.col_data(i)); - } -} - -} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index f8a32b80..3ab55334 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,8 +6,52 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "rxmesh/matrix/dense_matrix.cuh" + namespace rxmesh { +/** + * @brief The enum class for choosing different solver types + */ +enum class Solver +{ + CHOL = 0, + LU = 1, + QR = 2 +}; + +/** + * @brief The enum class for choosing different reorder types + * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee + * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based + * on Quotient Graph, NSTDIS for Nested Dissection + */ +enum class Reorder +{ + NONE = 0, + SYMRCM = 1, + SYMAMD = 2, + NSTDIS = 3 +}; + +static int reorder_to_int(const Reorder& reorder) +{ + switch (reorder) { + case Reorder::NONE: + return 0; + case Reorder::SYMRCM: + return 1; + case Reorder::SYMAMD: + return 2; + case Reorder::NSTDIS: + return 3; + default: { + RXMESH_ERROR("reorder_to_int() unknown input reorder"); + return 0; + } + } +} + namespace detail { // this is the function for the CSR calculation template @@ -368,17 +412,6 @@ struct 
SparseMatrix arr_mul_buffer_size(in_arr, rt_arr, stream); } - // CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, - // CUSPARSE_OPERATION_NON_TRANSPOSE, - // &alpha, - // m_spdescr, - // vecx, - // &beta, - // vecy, - // CUDA_R_32F, - // CUSPARSE_SPMV_ALG_DEFAULT, - // &m_spmv_buffer_size)); - CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, @@ -397,6 +430,171 @@ struct SparseMatrix CUDA_ERROR(cudaFree(buffer)); } + /** + * @brief do the sparse matrix dense matrix multiplication using sparse + * matrix array multiplication in a column wise way + */ + void spmat_denmat_mul_cw(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + arr_mul(B_mat.col_data(i), C_mat.col_data(i)); + } + } + + /** + * @brief solve the Ax=b for x where x and b are all array + */ + void spmat_linear_solve(T* B_arr, + T* X_arr, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_arr, + X_arr); + } + + /** + * @brief solve the AX=B for X where X and B are all dense matrix and we + * would solve it in a column wise manner + */ + void spmat_linear_solve(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix X_mat, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_mat.col_data(i), + X_mat.col_data(i)); + } + } + + /** + * @brief wrap up the cusolver api for solving linear systems. This is a + * lower level api + */ + void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, + const rxmesh::Reorder reorder, + cusolverSpHandle_t handle, + cusparseMatDescr_t descrA, + int rowsA, + int colsA, + int nnzA, + int* d_csrRowPtrA, + int* d_csrColIndA, + T* d_csrValA, + T* d_b, + T* d_x) + { + if constexpr ((!std::is_same_v)&&( + !std::is_same_v)) { + RXMESH_ERROR( + "Unsupported type for cusparse: {}" + "Only float and double are supported", + typeid(T).name()); + } + + double tol = 1.e-12; + int singularity = 0; /* -1 if A is invertible under tol. 
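Otherwise cusolverSp reports back the first row index at which A fails the tolerance check.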
*/ + + /* solve B*z = Q*b */ + if (solver == Solver::CHOL) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + } else if (solver == Solver::QR) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + } else { + RXMESH_ERROR( + "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x " + "for " + "Solver::LU"); + } + cudaDeviceSynchronize(); + + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + const Context m_context; cusparseHandle_t m_cusparse_handle; cusolverSpHandle_t m_cusolver_sphandle; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 3f4c4a09..f29ff86a 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -1,7 +1,8 @@ #include "gtest/gtest.h" #include "rxmesh/attribute.h" -#include "rxmesh/matrix/matrix_operation.cuh" +#include "rxmesh/matrix/dense_matrix.cuh" +#include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/query.cuh" #include "rxmesh/rxmesh_static.h" @@ -11,7 +12,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, IndexT* vet_degree) { using namespace rxmesh; - auto compute_valence = [&](VertexHandle& v_id, const VertexIterator& iter) { auto ids = v_id.unpack(); uint32_t patch_id = ids.first; @@ -72,9 +72,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = 1; //dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; // dist(v_coord, vi_coord); - arr_ref[row_index] += 1; //dist(v_coord, vi_coord); + arr_ref[row_index] += 1; // dist(v_coord, vi_coord); } }; @@ -283,8 +283,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); + RXMeshStatic rxmesh(rxmesh_args.obj_file_name, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -319,9 +318,6 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix array multiply - // spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host @@ -355,9 +351,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; // STRINGIFY(INPUT_DIR) "dragon.obj"; - // "/home/ericycc/data/120628.obj"; - RXMeshStatic rxmesh(obj_path); + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); uint32_t 
num_vertices = rxmesh.get_num_vertices(); @@ -383,7 +378,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) print_diag<<<1, 1>>>(A_mat, num_vertices); - spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); // timing begins for spmm GPUTimer timer; From 8de4288263e275e04332c717648627196eb0579c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 11 Feb 2023 13:00:57 -0800 Subject: [PATCH 07/44] init --- apps/MCF/mcf.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 49ceb94d..cdedf01c 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -54,6 +54,9 @@ TEST(App, MCF) // RXMesh Impl mcf_rxmesh(rxmesh, ground_truth); + + // RXMesh cusolver Impl + mcf_rxmesh_solver(rxmesh, ground_truth); } int main(int argc, char** argv) From c6abf7431760f9de9ea88970f5a448505d5ba292 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 11 Feb 2023 13:02:48 -0800 Subject: [PATCH 08/44] add error checker --- include/rxmesh/matrix/sparse_matrix.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 3ab55334..8123df8b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -298,7 +298,7 @@ struct SparseMatrix cusparseDnMatDescr_t matC = C_mat.m_dendescr; void* dBuffer = NULL; - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); // allocate an external buffer if needed CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, @@ -331,7 +331,7 @@ struct SparseMatrix cusparseDnMatDescr_t matC = C_mat.m_dendescr; void* dBuffer = NULL; - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); // allocate an external buffer if needed if (m_spmm_buffer_size == 0) { @@ -402,7 +402,7 @@ struct SparseMatrix CUSPARSE_ERROR( cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); - cusparseSetStream(m_cusparse_handle, stream); + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); if (m_spmv_buffer_size == 0) { RXMESH_WARN( From e4a1137bf639d478ffce9e286c703466060825e7 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 12:48:13 -0800 Subject: [PATCH 09/44] uniform done --- CMakeLists.txt | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 78 +++++++++++++++++++++--- include/rxmesh/matrix/dense_matrix.cuh | 14 +++-- tests/RXMesh_test/test_sparse_matrix.cuh | 2 +- 4 files changed, 80 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..c5fffbd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2193b965..3b948228 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -6,10 +6,11 @@ template -__global__ static void mcf_A_B_setup( +__global__ static void mcf_A_X_B_setup( const rxmesh::Context context, rxmesh::VertexAttribute coords, // for non-uniform rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, rxmesh::DenseMatrix B_mat, const bool use_uniform_laplace, // for non-uniform const T time_step) @@ 
-25,28 +26,89 @@ __global__ static void mcf_A_B_setup( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = A_mat.m_d_patch_ptr_v[r_patch_id] + r_local_id; + uint32_t row_index = + A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + // set up B matrix B_mat(row_index, 0) = coords(v_id, 0) * v_weight; B_mat(row_index, 1) = coords(v_id, 1) * v_weight; B_mat(row_index, 2) = coords(v_id, 2) * v_weight; + // set up initial X matrix + X_mat(row_index, 0) = coords(v_id, 0); + X_mat(row_index, 1) = coords(v_id, 1); + X_mat(row_index, 2) = coords(v_id, 2); + Vector<3, float> vi_coord( coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; - + T e_weight = 1; + e_weight *= time_step; sum_e_weight += e_weight; + + A_mat(v_id, iter[v]) = -e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + sum_e_weight; }; auto block = cooperative_groups::this_thread_block(); Query query(context); ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, compute_valence); + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); } -void mcf_rxmesh_solver() {} \ No newline at end of file +template +void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) +{ + using namespace rxmesh; + constexpr uint32_t blockThreads = 256; + + uint32_t num_vertices = rxmesh.get_num_vertices(); + auto coords = rxmesh.get_input_vertex_coordinates(); + + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); + + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + + mcf_A_X_B_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + B_mat, + true, + Arg.time_step); + + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + + + std::vector h_X_mat(num_vertices); + cudaMemcpy(h_X_mat.data(), + X_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost); + + const T tol = 0.1f; + // for (uint32_t i = 0; i < num_vertices; ++i) { + // for (uint32_t j = 0; j < 3; ++j) { + // EXPECT_NEAR(h_X_mat[i][j], ground_truth[i][j], tol); + // } + // } +} \ No newline at end of file diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 66b403a5..889a201f 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -19,12 +19,12 @@ struct DenseMatrix CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, - m_row_size, - m_col_size, - m_row_size, // leading dim - m_d_val, - CUDA_R_32F, - CUSPARSE_ORDER_COL)); + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } void set_ones() @@ -66,6 +66,8 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } + // TODO: something like attribute->move() + cusparseDnMatDescr_t m_dendescr; IndexT m_row_size; IndexT m_col_size; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index f29ff86a..b2acce4b 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -377,7 +377,7 @@ TEST(RXMeshStatic, 
SparseMatrixSimpleSolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); print_diag<<<1, 1>>>(A_mat, num_vertices); - + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); // timing begins for spmm From e4f57e281d97a07205d3bf4b861b56d2469b4c9c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 15:19:02 -0800 Subject: [PATCH 10/44] singularity check works --- CMakeLists.txt | 2 +- tests/RXMesh_test/test_sparse_matrix.cuh | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..c5fffbd6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index f29ff86a..4b203583 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -144,6 +144,10 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + iter.size() * iter.size() + 1000000; + + if (row_index % 10 == 0) { + A_mat(v_id, v_id) = 0; + } }; auto block = cooperative_groups::this_thread_block(); @@ -378,7 +382,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) print_diag<<<1, 1>>>(A_mat, num_vertices); - A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // timing begins for spmm GPUTimer timer; @@ -400,10 +404,6 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) num_vertices * 3 * sizeof(float), cudaMemcpyDeviceToHost); - // auto ps_mesh = rxmesh.get_polyscope_mesh(); - // ps_mesh->addVertexColorQuantity("ret_mat", h_ret_mat); - // ps_mesh->addVertexColorQuantity("B_mat", h_B_mat); - for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); From a6a7d844f6696080bada0b6021980487455c2b30 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 15:32:26 -0800 Subject: [PATCH 11/44] finish warpping up sparse matrix code --- CMakeLists.txt | 1 - tests/RXMesh_test/test_sparse_matrix.cuh | 36 +++++++++--------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5fffbd6..ad56a02e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 4b203583..00053aaa 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -125,9 +125,9 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint32_t row_index = A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = iter.size() * 7; - B_mat(row_index, 1) = iter.size() * 2; - B_mat(row_index, 2) = iter.size() * 10; + B_mat(row_index, 0) = iter.size() * 7.4f; + B_mat(row_index, 1) = iter.size() * 2.6f; + B_mat(row_index, 2) = iter.size() * 10.3f; X_mat(row_index, 0) = coords(v_id, 0) * v_weight; X_mat(row_index, 1) = coords(v_id, 1) * v_weight; @@ 
-144,10 +144,6 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + iter.size() * iter.size() + 1000000; - - if (row_index % 10 == 0) { - A_mat(v_id, v_id) = 0; - } }; auto block = cooperative_groups::this_thread_block(); @@ -156,20 +152,8 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } -template -__global__ static void print_diag(rxmesh::SparseMatrix A_mat, int num_vet) -{ - int t_idx = threadIdx.x + blockDim.x * blockIdx.x; - if (t_idx == 0) { - - for (int i = 0; i < num_vet; ++i) { - if (A_mat.direct_access(i, i) < 1) { - printf("A_mat diag %d: %f \n", i, A_mat.direct_access(i, i)); - } - } - } -} +/* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -216,6 +200,7 @@ TEST(RXMeshStatic, SparseMatrix) rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_test); + // test kernel sparse_mat_test <<); + // test kernel sparse_mat_query_test <<>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - print_diag<<<1, 1>>>(A_mat, num_vertices); - A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // timing begins for spmm From a4bcaa0a089aca4c76e5a6250846a48089c08252 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 13 Feb 2023 16:00:29 -0800 Subject: [PATCH 12/44] fix mcf build --- CMakeLists.txt | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad56a02e..9fcf5984 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - +set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2193b965..dae89726 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -46,7 +46,7 @@ __global__ static void mcf_A_B_setup( auto block = cooperative_groups::this_thread_block(); Query query(context); ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, compute_valence); + query.dispatch(block, shrd_alloc, init_lambda); } void mcf_rxmesh_solver() {} \ No newline at end of file From c501804914605c1e996b5aa7ee96ee1d80ea2f26 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Tue, 21 Feb 2023 14:01:52 -0800 Subject: [PATCH 13/44] test --- apps/MCF/mcf_sparse_matrix.cuh | 90 ++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 16 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 3b948228..cfb02411 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -63,6 +63,36 @@ __global__ static void mcf_A_X_B_setup( !use_uniform_laplace); } +template +__global__ static void update_smooth_result(const rxmesh::Context context, + rxmesh::VertexAttribute smooth_X, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat) +{ + using namespace rxmesh; + auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto r_ids = v_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = + A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + + // printf("check: %f\n", X_mat(row_index, 0)); + + 
smooth_X(v_id, 0) = X_mat(row_index, 0); + smooth_X(v_id, 1) = X_mat(row_index, 1); + smooth_X(v_id, 2) = X_mat(row_index, 2); + + // printf("s_check: %f\n", smooth_X(v_id, 0)); + }; + + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, init_lambda); +} + template void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) @@ -77,14 +107,14 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); - printf("use_uniform_laplace: %d, time_step: %f\n", Arg.use_uniform_laplace, Arg.time_step); + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); + mcf_A_X_B_setup <<("smooth_X", 3, rxmesh::LOCATION_ALL); + + LaunchBox launch_box_smooth; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_smooth, + (void*)update_smooth_result); + + update_smooth_result + <<>>( + rxmesh.get_context(), *smooth_X, A_mat, X_mat); + + smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + + const T tol = 0.001; + T tmp_tol = tol; + bool passed = true; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + + for (uint32_t i = 0; i < 3; ++i) { + tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]); + if (tmp_tol > tol) { + printf("val: %f, truth: %f, tol: %f\n", + (*smooth_X)(vh, i), + ground_truth[v_id][i], + tmp_tol); + } + + if (std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]) > tol) { + passed = false; + break; + } + } + }); - std::vector h_X_mat(num_vertices); - cudaMemcpy(h_X_mat.data(), - X_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); - - const T tol = 0.1f; - // for (uint32_t i = 0; i < num_vertices; ++i) { - // for (uint32_t j = 0; j < 3; ++j) { - // EXPECT_NEAR(h_X_mat[i][j], ground_truth[i][j], tol); - // } - // } + EXPECT_TRUE(passed); } \ No newline at end of file From eeb1f989e1ac1e7bbd1c4ca1c298aaec51e7f9cc Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Feb 2023 14:22:24 -0800 Subject: [PATCH 14/44] delete off --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e10a467d..572c32e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") - if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() @@ -165,4 +164,4 @@ add_subdirectory("include") include(GoogleTest) add_subdirectory(apps) -add_subdirectory(tests) \ No newline at end of file +add_subdirectory(tests) From 6bcd5a6c193a7c5b0073eb878db014112cd82919 Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Feb 2023 14:45:25 -0800 Subject: [PATCH 15/44] visual test --- CMakeLists.txt | 1 - apps/MCF/mcf_sparse_matrix.cuh | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5fffbd6..ad56a02e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index cfb02411..4cf3f2d7 100644 --- 
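
The row index used throughout these kernels (m_vertex_prefix[patch] + local id) is the same linearization that later patches in this series read back directly on the host via rxmesh.linear_id(); a sketch of that host-side equivalent (as patch 25 ends up doing):

    X_mat.move(rxmesh::DEVICE, rxmesh::HOST);
    rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) {
        uint32_t row = rxmesh.linear_id(vh);  // == m_vertex_prefix[p] + local
        float    x   = X_mat(row, 0);         // host read after the move
    });

which removes the need for a copy-back kernel like update_smooth_result above.
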
a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -130,6 +130,8 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, auto smooth_X = rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); + auto truth_X = + rxmesh.add_vertex_attribute("truth_X", 3, rxmesh::LOCATION_ALL); LaunchBox launch_box_smooth; rxmesh.prepare_launch_box({Op::VV}, @@ -143,6 +145,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + truth_X->move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; T tmp_tol = tol; @@ -151,6 +154,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, uint32_t v_id = rxmesh.map_to_global(vh); for (uint32_t i = 0; i < 3; ++i) { + (*truth_X)(vh, i) = ground_truth[v_id][i]; tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); if (tmp_tol > tol) { @@ -168,5 +172,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, } }); + auto ps_mesh = rxmesh.get_polyscope_mesh(); + ps_mesh->addVertexColorQuantity("smooth_x", *smooth_X); + ps_mesh->addVertexColorQuantity("smooth_om", *truth_X); + polyscope::show(); + EXPECT_TRUE(passed); } \ No newline at end of file From 36c1d4cb865a46c25769a23538b9cecc1a07f2b7 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sat, 25 Feb 2023 00:47:04 -0800 Subject: [PATCH 16/44] uniform pass --- apps/MCF/mcf_rxmesh.h | 1 + apps/MCF/mcf_sparse_matrix.cuh | 179 ++++++++++++++++++++++++++------- 2 files changed, 142 insertions(+), 38 deletions(-) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index a58a62df..f17a1591 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -227,6 +227,7 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, // output to obj // rxmesh.export_obj("mcf_rxmesh.obj", *X); + rxmesh.export_obj("mcf_rxmesh_cg.obj", *X); // Verify const T tol = 0.001; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index cfb02411..08ceb8aa 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -1,55 +1,138 @@ #pragma once +#include "mcf_util.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" #include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/rxmesh_static.h" +template +__global__ static void mcf_B_setup(const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::DenseMatrix B_mat, + const bool use_uniform_laplace) +{ + using namespace rxmesh; + + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { + auto r_ids = p_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; + + if (use_uniform_laplace) { + const T valence = static_cast(iter.size()); + B_mat(row_index, 0) = coords(p_id, 0) * valence; + B_mat(row_index, 1) = coords(p_id, 1) * valence; + B_mat(row_index, 2) = coords(p_id, 2) * valence; + } else { + T v_weight = 0; + + // this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); + + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; + + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + + v_weight += (tri_area > 0) ? 
tri_area : 0.0; + + q_id = r_id; + } + v_weight = 0.5 / v_weight; + + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; + } + }; + + // With uniform Laplacian, we just need the valence, thus we + // call query and set oriented to false + auto block = cooperative_groups::this_thread_block(); + + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); +} template -__global__ static void mcf_A_X_B_setup( - const rxmesh::Context context, - rxmesh::VertexAttribute coords, // for non-uniform - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::DenseMatrix B_mat, - const bool use_uniform_laplace, // for non-uniform - const T time_step) +__global__ static void mcf_A_X_setup( + const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, + const bool use_uniform_laplace, // for non-uniform + const T time_step) { using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { T sum_e_weight(0); + T v_weight(0); - T v_weight = iter.size(); + VertexHandle q_id = iter.back(); // reference value calculation - auto r_ids = v_id.unpack(); + auto r_ids = p_id.unpack(); uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; uint32_t row_index = A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; - // set up B matrix - B_mat(row_index, 0) = coords(v_id, 0) * v_weight; - B_mat(row_index, 1) = coords(v_id, 1) * v_weight; - B_mat(row_index, 2) = coords(v_id, 2) * v_weight; - // set up initial X matrix - X_mat(row_index, 0) = coords(v_id, 0); - X_mat(row_index, 1) = coords(v_id, 1); - X_mat(row_index, 2) = coords(v_id, 2); + X_mat(row_index, 0) = coords(p_id, 0); + X_mat(row_index, 1) = coords(p_id, 1); + X_mat(row_index, 2) = coords(p_id, 2); - Vector<3, float> vi_coord( - coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); + // set up matrix A for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; + VertexHandle r_id = iter[v]; + + T e_weight = 0; + if (use_uniform_laplace) { + e_weight = 1; + } else { + VertexHandle s_id = + (v == iter.size() - 1) ? iter[0] : iter[v + 1]; + + e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords); + e_weight = (static_cast(e_weight >= 0.0)) * e_weight; + } + e_weight *= time_step; sum_e_weight += e_weight; - A_mat(v_id, iter[v]) = -e_weight; + A_mat(p_id, iter[v]) = -e_weight; + + // compute vertex weight + if (use_uniform_laplace) { + ++v_weight; + } else { + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + v_weight += (tri_area > 0) ? 
tri_area : 0; + q_id = r_id; + } + } + + // Diagonal entry + if (use_uniform_laplace) { + v_weight = 1.0 / v_weight; + } else { + v_weight = 0.5 / v_weight; } - A_mat(v_id, v_id) = v_weight + sum_e_weight; + assert(!isnan(v_weight)); + assert(!isinf(v_weight)); + + A_mat(p_id, p_id) = (1.0 / v_weight) + sum_e_weight; }; auto block = cooperative_groups::this_thread_block(); @@ -107,25 +190,40 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); + // B set up + LaunchBox launch_box_B; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box_B, (void*)mcf_B_setup); + + mcf_B_setup<<>>( + rxmesh.get_context(), *coords, B_mat, Arg.use_uniform_laplace); + printf("use_uniform_laplace: %d, time_step: %f\n", Arg.use_uniform_laplace, Arg.time_step); - LaunchBox launch_box; + // A and X set up + LaunchBox launch_box_A_X; rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)mcf_A_X_B_setup); - - mcf_A_X_B_setup - <<>>(rxmesh.get_context(), - *coords, - A_mat, - X_mat, - B_mat, - true, - Arg.time_step); + {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup); + + mcf_A_X_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + Arg.use_uniform_laplace, + Arg.time_step); + + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + // Solving the linear system A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); auto smooth_X = @@ -141,9 +239,14 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, launch_box_smooth.num_threads, launch_box_smooth.smem_bytes_dyn>>>( rxmesh.get_context(), *smooth_X, A_mat, X_mat); - smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + + rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); + const T tol = 0.001; T tmp_tol = tol; bool passed = true; From 34d39fd4ddcae1149fb8b4322c140f447f45322e Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sun, 26 Feb 2023 00:33:04 -0800 Subject: [PATCH 17/44] CUSOLVER_STATUS_INTERNAL_ERROR by Invalid __shared__ write at mcf_sparse_matrix.cuh:58 --- CMakeLists.txt | 2 ++ apps/MCF/mcf_sparse_matrix.cuh | 13 ------------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad56a02e..8e548767 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") +set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 2a0c0a39..b169f467 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -219,10 +219,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, Arg.use_uniform_laplace, Arg.time_step); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - // Solving the linear system A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); @@ -244,10 +240,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); truth_X->move(rxmesh::DEVICE, rxmesh::HOST); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); const T tol = 0.001; @@ -275,10 +267,5 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, } }); - 
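
For the non-uniform branch, the assembled system is the cotan (Laplace-Beltrami) version of the same implicit step, (M + h L_c) x = M x^0, i.e. per vertex p with one-ring N(p):

    A_pp = m_p + h * sum_{r in N(p)} w_pr,    A_pr = -h * w_pr,    b_p = m_p * x_p^0

where w_pr = max(cot(alpha_pr) + cot(beta_pr), 0) over the two angles opposite edge (p, r) (up to the constant factor inside edge_cotan_weight), and the mass m_p comes from the accumulated partial Voronoi areas: the code stores v_weight = 0.5 / area and uses 1 / v_weight as m_p. The clamp implemented by the `e_weight >= 0.0` cast guards against negative cotan weights on obtuse triangles.
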
auto ps_mesh = rxmesh.get_polyscope_mesh(); - ps_mesh->addVertexColorQuantity("smooth_x", *smooth_X); - ps_mesh->addVertexColorQuantity("smooth_om", *truth_X); - polyscope::show(); - EXPECT_TRUE(passed); } \ No newline at end of file From eb01a05d4cb718af050f24f19a22eb67c4b7b32c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Sun, 26 Feb 2023 11:29:57 -0800 Subject: [PATCH 18/44] resolve changes --- CMakeLists.txt | 5 +- include/rxmesh/matrix/sparse_matrix.cuh | 34 +++++++++- tests/RXMesh_test/test_sparse_matrix.cuh | 85 +++--------------------- 3 files changed, 43 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9fcf5984..ee091248 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,5 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 75) -endif() - project(RXMesh VERSION 0.2.1 LANGUAGES C CXX CUDA) @@ -11,6 +7,7 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") set(USE_POLYSCOPE "OFF") + if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 8123df8b..0b4c1958 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -258,7 +258,7 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } - __device__ T& direct_access(IndexT x, IndexT y) + __device__ T& operator()(const IndexT x, const IndexT y) { const IndexT start = m_d_row_ptr[x]; const IndexT end = m_d_row_ptr[x + 1]; @@ -271,6 +271,35 @@ struct SparseMatrix assert(1 != 1); } + __device__ T& operator()(const IndexT x, const IndexT y) const + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __host__ __device__ IndexT& get_nnz() const { + return m_nnz; + } + + __device__ IndexT get_row_ptr_at(IndexT idx) const { + return m_d_row_ptr[idx]; + } + + __device__ IndexT get_col_idx_at(IndexT idx) const { + return m_d_col_idx[idx]; + } + + __device__ IndexT get_val_at(IndexT idx) const { + return m_d_val[idx]; + } + void free() { CUDA_ERROR(cudaFree(m_d_row_ptr)); @@ -585,7 +614,7 @@ struct SparseMatrix "for " "Solver::LU"); } - cudaDeviceSynchronize(); + CUDA_ERROR(cudaDeviceSynchronize()); if (0 <= singularity) { RXMESH_WARN( @@ -595,6 +624,7 @@ struct SparseMatrix } } + private: const Context m_context; cusparseHandle_t m_cusparse_handle; cusolverSpHandle_t m_cusolver_sphandle; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 00053aaa..169fc498 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -26,26 +26,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, query.dispatch(block, shrd_alloc, compute_valence); } -template -__global__ static void sparse_mat_query_test( - const rxmesh::Context context, - rxmesh::SparseMatrix sparse_mat) -{ - using namespace rxmesh; - auto fillin = [&](VertexHandle& v_id, const VertexIterator& iter) { - sparse_mat(v_id, v_id) = 2; - for (uint32_t v = 0; v < iter.size(); ++v) { - sparse_mat(v_id, iter[v]) = 2; - sparse_mat(iter[v], v_id) = 2; - } - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, 
shrd_alloc, fillin); -} - template __global__ static void sparse_mat_edge_len_test( const rxmesh::Context context, @@ -62,7 +42,7 @@ __global__ static void sparse_mat_edge_len_test( uint16_t r_local_id = r_ids.second; uint32_t row_index = - sparse_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -93,11 +73,11 @@ __global__ void spmat_multi_hardwired_kernel(T* vec, int tid = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0; if (tid < N) { - uint32_t start = sparse_mat.m_d_row_ptr[tid]; - uint32_t end = sparse_mat.m_d_row_ptr[tid + 1]; + uint32_t start = sparse_mat.get_row_ptr_at(tid); + uint32_t end = sparse_mat.get_row_ptr_at(tid + 1); for (int i = 0; i < end - start; i++) { - sum += vec[sparse_mat.m_d_col_idx[start + i]] * - sparse_mat.m_d_val[start + i]; + sum += vec[sparse_mat.get_col_idx_at(start + i)] * + sparse_mat.get_val_at(start + i); } out[tid] = sum; } @@ -123,7 +103,7 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint16_t r_local_id = r_ids.second; uint32_t row_index = - A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + context.m_vertex_prefix[r_patch_id] + r_local_id; B_mat(row_index, 0) = iter.size() * 7.4f; B_mat(row_index, 1) = iter.size() * 2.6f; @@ -221,51 +201,6 @@ TEST(RXMeshStatic, SparseMatrix) spmat.free(); } -/* check if the operator '()' works normally */ -TEST(RXMeshStatic, SparseMatrixQuery) -{ - using namespace rxmesh; - - // Select device - cuda_query(0); - - // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); - - uint32_t num_vertices = rxmesh.get_num_vertices(); - - const uint32_t threads = 256; - const uint32_t blocks = DIVIDE_UP(num_vertices, threads); - - SparseMatrix spmat(rxmesh); - spmat.set_ones(); - - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)sparse_mat_query_test); - - // test kernel - sparse_mat_query_test - <<>>(rxmesh.get_context(), spmat); - - std::vector h_result(spmat.m_nnz); - CUDA_ERROR(cudaMemcpy(h_result.data(), - spmat.m_d_val, - spmat.m_nnz * sizeof(int), - cudaMemcpyDeviceToHost)); - - std::vector h_ref(spmat.m_nnz, 2); - - for (int i = 0; i < spmat.m_nnz; ++i) { - EXPECT_EQ(h_result[i], h_ref[i]); - } - - spmat.free(); -} - /* First replace the sparse matrix entry with the edge length and then do spmv * with an all one array and check the result */ @@ -386,15 +321,15 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); std::vector h_ret_mat(num_vertices); - cudaMemcpy(h_ret_mat.data(), + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), ret_mat.data(), num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); std::vector h_B_mat(num_vertices); - cudaMemcpy(h_B_mat.data(), + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), B_mat.data(), num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost); + cudaMemcpyDeviceToHost)); for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { From 8b2f36159a315f666a7ee66ecb619b4104162bb4 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Feb 2023 17:55:55 -0800 Subject: [PATCH 19/44] test mem err --- apps/MCF/mcf.cu | 4 +-- apps/MCF/mcf_rxmesh.h | 2 ++ apps/MCF/mcf_sparse_matrix.cuh | 42 +++++++++++++------------- include/rxmesh/matrix/dense_matrix.cuh | 1 + 4 files changed, 26 insertions(+), 23 deletions(-) diff 
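
As a concrete reference for the accessor-based SpMV kernel above, a small worked CSR example (values chosen arbitrarily):

        [4 0 1]     row_ptr = [0, 2, 3, 5]
    A = [0 2 0]     col_idx = [0, 2, 1, 0, 2]
        [1 0 3]     val     = [4, 1, 2, 1, 3]

Row i's nonzeros live in val[row_ptr[i] .. row_ptr[i + 1] - 1], which is exactly the start/end window the kernel walks; e.g. y[1] = val[2] * x[col_idx[2]] = 2 * x[1].
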
--git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index cdedf01c..bfa73bdc 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); + mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); + mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index f17a1591..fa3e589d 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -119,6 +119,8 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, launch_box_init_B.smem_bytes_dyn>>>( rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace); + CUDA_ERROR(cudaDeviceSynchronize()); + // CG scalars T alpha(0), beta(0), delta_new(0), delta_old(0); diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index bda1a88f..f34dc8a5 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -26,30 +26,30 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, B_mat(row_index, 1) = coords(p_id, 1) * valence; B_mat(row_index, 2) = coords(p_id, 2) * valence; } else { - T v_weight = 0; + // T v_weight = 0; // this is the last vertex in the one-ring (before r_id) - VertexHandle q_id = iter.back(); + // VertexHandle q_id = iter.back(); - for (uint32_t v = 0; v < iter.size(); ++v) { - // the current one ring vertex - VertexHandle r_id = iter[v]; + // for (uint32_t v = 0; v < iter.size(); ++v) { + // // the current one ring vertex + // VertexHandle r_id = iter[v]; - T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + // T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); - v_weight += (tri_area > 0) ? tri_area : 0.0; + // v_weight += (tri_area > 0) ? 
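
A note on the debugging trail running from patch 17 ("Invalid __shared__ write at mcf_sparse_matrix.cuh:58") through patch 20: reports with that file/line precision typically come from NVIDIA's compute-sanitizer, e.g. (binary path assumed for this repo's test target):

    compute-sanitizer --tool memcheck ./bin/RXMesh_test --gtest_filter=App.MCF

and the CUDA_ERROR(cudaDeviceSynchronize()) calls added after kernel launches here serve a complementary purpose: they force asynchronous launch failures to surface at the offending launch rather than at some later, unrelated API call.
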
tri_area : 0.0; - q_id = r_id; - } - v_weight = 0.5 / v_weight; + // q_id = r_id; + // } + // v_weight = 0.5 / v_weight; - B_mat(row_index, 0) = coords(p_id, 0) / v_weight; - B_mat(row_index, 1) = coords(p_id, 1) / v_weight; - B_mat(row_index, 2) = coords(p_id, 2) / v_weight; - printf("check: %f, %f, %f\n", - B_mat(row_index, 0), - B_mat(row_index, 1), - B_mat(row_index, 2)); + B_mat(row_index, 0) = coords(p_id, 0); // / v_weight; + B_mat(row_index, 1) = coords(p_id, 1); // / v_weight; + B_mat(row_index, 2) = coords(p_id, 2); // / v_weight; + // printf("check: %f, %f, %f\n", + // B_mat(row_index, 0), + // B_mat(row_index, 1), + // B_mat(row_index, 2)); } }; @@ -190,6 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); + printf("use_uniform_laplace: %d, time_step: %f\n", + Arg.use_uniform_laplace, + Arg.time_step); + // B set up LaunchBox launch_box_B; rxmesh.prepare_launch_box( @@ -202,10 +206,6 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, CUDA_ERROR(cudaDeviceSynchronize()); - printf("use_uniform_laplace: %d, time_step: %f\n", - Arg.use_uniform_laplace, - Arg.time_step); - // A and X set up LaunchBox launch_box_A_X; rxmesh.prepare_launch_box( diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 889a201f..a290e56c 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -43,6 +43,7 @@ struct DenseMatrix __device__ T& operator()(const uint32_t row, const uint32_t col) { + //assert(col < m_col_size); return m_d_val[col * m_row_size + row]; } From fcfb4855bfb74f970a22c5d29634c944bdf548ba Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Feb 2023 19:40:09 -0800 Subject: [PATCH 20/44] fix illegal memory access --- apps/MCF/mcf_rxmesh.h | 5 ++--- apps/MCF/mcf_sparse_matrix.cuh | 38 ++++++++++++++-------------------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index fa3e589d..362416c6 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -106,11 +106,11 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_init_B, (void*)init_B, - true); + !Arg.use_uniform_laplace); rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_matvec, (void*)rxmesh_matvec, - true); + !Arg.use_uniform_laplace); // init kernel to initialize RHS (B) @@ -229,7 +229,6 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, // output to obj // rxmesh.export_obj("mcf_rxmesh.obj", *X); - rxmesh.export_obj("mcf_rxmesh_cg.obj", *X); // Verify const T tol = 0.001; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index f34dc8a5..101e3cbe 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -26,30 +26,26 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, B_mat(row_index, 1) = coords(p_id, 1) * valence; B_mat(row_index, 2) = coords(p_id, 2) * valence; } else { - // T v_weight = 0; + T v_weight = 0; - // this is the last vertex in the one-ring (before r_id) - // VertexHandle q_id = iter.back(); + //this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); - // for (uint32_t v = 0; v < iter.size(); ++v) { - // // the current one ring vertex - // VertexHandle r_id = iter[v]; + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; - // T 
tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); - // v_weight += (tri_area > 0) ? tri_area : 0.0; + v_weight += (tri_area > 0) ? tri_area : 0.0; - // q_id = r_id; - // } - // v_weight = 0.5 / v_weight; + q_id = r_id; + } + v_weight = 0.5 / v_weight; - B_mat(row_index, 0) = coords(p_id, 0); // / v_weight; - B_mat(row_index, 1) = coords(p_id, 1); // / v_weight; - B_mat(row_index, 2) = coords(p_id, 2); // / v_weight; - // printf("check: %f, %f, %f\n", - // B_mat(row_index, 0), - // B_mat(row_index, 1), - // B_mat(row_index, 2)); + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; } }; @@ -163,8 +159,6 @@ __global__ static void update_smooth_result(const rxmesh::Context context, uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - // printf("check: %f\n", X_mat(row_index, 0)); - smooth_X(v_id, 0) = X_mat(row_index, 0); smooth_X(v_id, 1) = X_mat(row_index, 1); smooth_X(v_id, 2) = X_mat(row_index, 2); @@ -197,7 +191,7 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup); + {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup); + {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Mon, 27 Feb 2023 19:52:19 -0800 Subject: [PATCH 21/44] rename the function and format the code --- apps/MCF/mcf.cu | 4 ++-- apps/MCF/mcf_rxmesh.h | 2 +- apps/MCF/mcf_sparse_matrix.cuh | 20 ++++++++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index bfa73bdc..4ac988c0 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg + mcf_rxmesh_cg(rxmesh, ground_truth); // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index 362416c6..41ca5fab 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 101e3cbe..1a4484a2 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -28,7 +28,7 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, } else { T v_weight = 0; - //this is the last vertex in the one-ring (before r_id) + // this is the last vertex in the one-ring (before r_id) VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) { @@ -171,8 +171,8 @@ __global__ static void update_smooth_result(const rxmesh::Context context, } template -void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, - const std::vector>& ground_truth) +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) { using namespace rxmesh; constexpr uint32_t blockThreads = 
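
The substantive fix in this patch is the oriented flag: partial_voronoi_area and edge_cotan_weight consume the one-ring in order (iter.back(), iter[v + 1]), which requires an oriented VV query, and prepare_launch_box sizes the kernel's dynamic shared memory differently when orientation is requested. A plausible reading of the earlier invalid __shared__ write is a mismatch between the two call sites; the pairing that matters (names as used elsewhere in this series, dispatch template argument assumed):

    rxmesh.prepare_launch_box({Op::VV},
                              launch_box,
                              (void*)kernel,
                              !Arg.use_uniform_laplace);  // oriented only for cotan

    // ...inside the kernel...
    query.dispatch<Op::VV>(block,
                           shrd_alloc,
                           lambda,
                           [](VertexHandle) { return true; },
                           !use_uniform_laplace);  // must agree with the launch box
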
256; @@ -190,8 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Mon, 27 Feb 2023 19:52:19 -0800 Subject: [PATCH 22/44] rename the function and format the code --- apps/MCF/mcf.cu | 4 ++-- apps/MCF/mcf_rxmesh.h | 4 +--- apps/MCF/mcf_sparse_matrix.cuh | 20 ++++++++++++-------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index bfa73bdc..4ac988c0 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -53,10 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); //mcf_rxmesh_cg + mcf_rxmesh_cg(rxmesh, ground_truth); // RXMesh cusolver Impl - mcf_rxmesh_solver(rxmesh, ground_truth); //mcf_rxmesh_cusolver_chol + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index 362416c6..bb2c4ab4 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; @@ -119,8 +119,6 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, launch_box_init_B.smem_bytes_dyn>>>( rxmesh.get_context(), *X, *B, Arg.use_uniform_laplace); - CUDA_ERROR(cudaDeviceSynchronize()); - // CG scalars T alpha(0), beta(0), delta_new(0), delta_old(0); diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 101e3cbe..1a4484a2 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -28,7 +28,7 @@ __global__ static void mcf_B_setup(const rxmesh::Context context, } else { T v_weight = 0; - //this is the last vertex in the one-ring (before r_id) + // this is the last vertex in the one-ring (before r_id) VertexHandle q_id = iter.back(); for (uint32_t v = 0; v < iter.size(); ++v) { @@ -171,8 +171,8 @@ __global__ static void update_smooth_result(const rxmesh::Context context, } template -void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, - const std::vector>& ground_truth) +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -190,8 +190,10 @@ void mcf_rxmesh_solver(rxmesh::RXMeshStatic& rxmesh, // B set up LaunchBox launch_box_B; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_B, (void*)mcf_B_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); mcf_B_setup<< launch_box_A_X; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box_A_X, (void*)mcf_A_X_setup, !Arg.use_uniform_laplace); + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); mcf_A_X_setup << Date: Fri, 3 Mar 2023 13:55:55 -0800 Subject: [PATCH 23/44] remove test code --- CMakeLists.txt | 2 -- apps/MCF/mcf_sparse_matrix.cuh 
| 13 +++---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d78ad40f..d54b4340 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,6 @@ project(RXMesh set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization") -set(USE_POLYSCOPE "OFF") - if(${USE_POLYSCOPE}) message(STATUS "Polyscope is enabled") else() diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index 1a4484a2..d2ed2752 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -184,7 +184,7 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, DenseMatrix X_mat(num_vertices, 3); DenseMatrix B_mat(num_vertices, 3); - printf("use_uniform_laplace: %d, time_step: %f\n", + RXMESH_INFO("use_uniform_laplace: {}, time_step: {}", Arg.use_uniform_laplace, Arg.time_step); @@ -224,8 +224,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, auto smooth_X = rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); - auto truth_X = - rxmesh.add_vertex_attribute("truth_X", 3, rxmesh::LOCATION_ALL); LaunchBox launch_box_smooth; rxmesh.prepare_launch_box({Op::VV}, @@ -238,7 +236,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, launch_box_smooth.smem_bytes_dyn>>>( rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - truth_X->move(rxmesh::DEVICE, rxmesh::HOST); rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); @@ -249,18 +246,14 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, uint32_t v_id = rxmesh.map_to_global(vh); for (uint32_t i = 0; i < 3; ++i) { - (*truth_X)(vh, i) = ground_truth[v_id][i]; tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); + if (tmp_tol > tol) { - printf("val: %f, truth: %f, tol: %f\n", + RXMESH_WARN("val: {}, truth: {}, tol: {}\n", (*smooth_X)(vh, i), ground_truth[v_id][i], tmp_tol); - } - - if (std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / - ground_truth[v_id][i]) > tol) { passed = false; break; } From 2597f187c614882f62da4b7cd5de38e5c0757534 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:33:27 -0800 Subject: [PATCH 24/44] dense matrix host-device --- apps/MCF/mcf_sparse_matrix.cuh | 8 +- include/rxmesh/matrix/dense_matrix.cuh | 116 +++++++++++++++++++++++-- 2 files changed, 116 insertions(+), 8 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index d2ed2752..f9244cfb 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -237,13 +237,19 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, rxmesh.get_context(), *smooth_X, A_mat, X_mat); smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - rxmesh.export_obj("mcf_rxmesh_solver.obj", *smooth_X); + X_mat.move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; T tmp_tol = tol; bool passed = true; rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { uint32_t v_id = rxmesh.map_to_global(vh); + uint32_t v_linear_id = rxmesh.linear_id(vh); + + T a = X_mat(v_linear_id, 0); + T b = (*smooth_X)(vh, 0); + + EXPECT_EQ(X_mat(v_linear_id, 0), (*smooth_X)(vh, 0)); for (uint32_t i = 0; i < 3; ++i) { tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index a290e56c..fb70b322 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -9,12 
+9,15 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added -// only col major is supportedF +// only col major is supported template struct DenseMatrix { DenseMatrix(IndexT row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL) + : m_row_size(row_size), + m_col_size(col_size), + m_dendescr(NULL), + m_allocated(LOCATION_NONE) { CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); @@ -41,15 +44,29 @@ struct DenseMatrix return m_row_size; } - __device__ T& operator()(const uint32_t row, const uint32_t col) + __host__ __device__ T& operator()(const uint32_t row, const uint32_t col) { - //assert(col < m_col_size); + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ return m_d_val[col * m_row_size + row]; +#else + return m_h_val[col * m_row_size + row]; +#endif } - __device__ T& operator()(const uint32_t row, const uint32_t col) const + __host__ __device__ T& operator()(const uint32_t row, + const uint32_t col) const { + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ return m_d_val[col * m_row_size + row]; +#else + return m_h_val[col * m_row_size + row]; +#endif } T* data() const @@ -57,9 +74,18 @@ struct DenseMatrix return m_d_val; } - T* col_data(const uint32_t ld_idx) const + T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { - return m_d_val + ld_idx * lead_dim(); + if ((location & HOST) == HOST) { + return m_h_val + ld_idx * lead_dim(); + } + + if ((location & DEVICE) == DEVICE) { + return m_d_val + ld_idx * lead_dim(); + } + + assert(1 != 1); + return 0; } IndexT bytes() const @@ -67,12 +93,88 @@ struct DenseMatrix return m_row_size * m_col_size * sizeof(T); } + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + + printf("aaaaaaaaaaaaaaaaaaa\n"); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync( + m_d_val, m_h_val, bytes(), cudaMemcpyHostToDevice, stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync( + m_h_val, m_d_val, bytes(), cudaMemcpyDeviceToHost, stream)); + } + } + + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + m_allocated = m_allocated & (~DEVICE); + } + } + + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(bytes())); + + printf("aaaaaaaaaaaaaaaaaaa\n"); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + m_allocated = m_allocated | 
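
With move(), allocate(), and release() in place, the intended round trip is (a minimal sketch; the constructor already allocates the device side):

    DenseMatrix<float> X_mat(num_vertices, 3);  // device buffer, col-major
    // ... kernels fill X_mat on the device ...
    X_mat.move(rxmesh::DEVICE, rxmesh::HOST);   // allocates the host copy lazily
    float x = X_mat(0, 0);                      // operator() reads m_h_val on host

move() deliberately warns and allocates rather than failing when the target side has not been allocated yet, which is what makes the one-liner above safe.
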
DEVICE; + } + } + // TODO: something like attribute->move() cusparseDnMatDescr_t m_dendescr; + locationT m_allocated; IndexT m_row_size; IndexT m_col_size; T* m_d_val; + T* m_h_val; }; } // namespace rxmesh From 80a7fafae45cb2c8fb8d6a966dbc3f40fa8db4fe Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:41:50 -0800 Subject: [PATCH 25/44] update test and detete smooth attribute --- apps/MCF/mcf_sparse_matrix.cuh | 47 ++------------------------ include/rxmesh/matrix/dense_matrix.cuh | 4 --- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index f9244cfb..4dbae51a 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -145,31 +145,6 @@ __global__ static void mcf_A_X_setup( !use_uniform_laplace); } -template -__global__ static void update_smooth_result(const rxmesh::Context context, - rxmesh::VertexAttribute smooth_X, - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix X_mat) -{ - using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { - auto r_ids = v_id.unpack(); - uint32_t r_patch_id = r_ids.first; - uint16_t r_local_id = r_ids.second; - - uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - - smooth_X(v_id, 0) = X_mat(row_index, 0); - smooth_X(v_id, 1) = X_mat(row_index, 1); - smooth_X(v_id, 2) = X_mat(row_index, 2); - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, init_lambda); -} - template void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) @@ -222,21 +197,6 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, // Solving the linear system using chol factorization and no reordering A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); - auto smooth_X = - rxmesh.add_vertex_attribute("smooth_X", 3, rxmesh::LOCATION_ALL); - - LaunchBox launch_box_smooth; - rxmesh.prepare_launch_box({Op::VV}, - launch_box_smooth, - (void*)update_smooth_result); - - update_smooth_result - <<>>( - rxmesh.get_context(), *smooth_X, A_mat, X_mat); - smooth_X->move(rxmesh::DEVICE, rxmesh::HOST); - X_mat.move(rxmesh::DEVICE, rxmesh::HOST); const T tol = 0.001; @@ -247,17 +207,14 @@ void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, uint32_t v_linear_id = rxmesh.linear_id(vh); T a = X_mat(v_linear_id, 0); - T b = (*smooth_X)(vh, 0); - - EXPECT_EQ(X_mat(v_linear_id, 0), (*smooth_X)(vh, 0)); for (uint32_t i = 0; i < 3; ++i) { - tmp_tol = std::abs(((*smooth_X)(vh, i) - ground_truth[v_id][i]) / + tmp_tol = std::abs((X_mat(v_linear_id, i) - ground_truth[v_id][i]) / ground_truth[v_id][i]); if (tmp_tol > tol) { RXMESH_WARN("val: {}, truth: {}, tol: {}\n", - (*smooth_X)(vh, i), + X_mat(v_linear_id, i), ground_truth[v_id][i], tmp_tol); passed = false; diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index fb70b322..1ae1c59a 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -117,8 +117,6 @@ struct DenseMatrix RXMESH_WARN( "DenseMatrix::move() allocating target before moving to {}", location_to_string(target)); - - printf("aaaaaaaaaaaaaaaaaaa\n"); allocate(target); } @@ -153,8 +151,6 @@ struct DenseMatrix m_h_val = static_cast(malloc(bytes())); - printf("aaaaaaaaaaaaaaaaaaa\n"); - m_allocated = m_allocated | HOST; } From 
d8096b157c6d82e60856c132d8cca0bd43fbcd6c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 02:50:29 -0800 Subject: [PATCH 26/44] minor --- include/rxmesh/matrix/dense_matrix.cuh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 1ae1c59a..c3317be4 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -71,7 +71,16 @@ struct DenseMatrix T* data() const { - return m_d_val; + if ((location & HOST) == HOST) { + return m_h_val; + } + + if ((location & DEVICE) == DEVICE) { + return m_d_val; + } + + assert(1 != 1); + return 0; } T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const From 122e2475ba2f253a6e192a09ea5cc41ead87ef27 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 6 Mar 2023 03:13:20 -0800 Subject: [PATCH 27/44] fix error --- include/rxmesh/matrix/dense_matrix.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index c3317be4..f264d426 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -69,7 +69,7 @@ struct DenseMatrix #endif } - T* data() const + T* data(locationT location = DEVICE) const { if ((location & HOST) == HOST) { return m_h_val; From 2051e6d60a4ee32aa8ab438b9c4ffbd27f798562 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 20 Mar 2023 23:44:31 -0700 Subject: [PATCH 28/44] low level init --- include/rxmesh/matrix/sparse_matrix.cuh | 249 +++++++++++++++++++++++- 1 file changed, 245 insertions(+), 4 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index e185fec7..b11f9f18 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,6 +6,7 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "cusolverSp_LOWLEVEL_PREVIEW.h" #include "rxmesh/matrix/dense_matrix.cuh" namespace rxmesh { @@ -284,19 +285,23 @@ struct SparseMatrix assert(1 != 1); } - __host__ __device__ IndexT& get_nnz() const { + __host__ __device__ IndexT& get_nnz() const + { return m_nnz; } - __device__ IndexT& get_row_ptr_at(IndexT idx) const { + __device__ IndexT& get_row_ptr_at(IndexT idx) const + { return m_d_row_ptr[idx]; } - __device__ IndexT& get_col_idx_at(IndexT idx) const { + __device__ IndexT& get_col_idx_at(IndexT idx) const + { return m_d_col_idx[idx]; } - __device__ T& get_val_at(IndexT idx) const { + __device__ T& get_val_at(IndexT idx) const + { return m_d_val[idx]; } @@ -310,6 +315,7 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); } + /* ----- CUSPARSE SPMM & SPMV ----- */ /** * @brief wrap up the cusparse api for sparse matrix dense matrix @@ -471,6 +477,10 @@ struct SparseMatrix } } + /* ----- SOLVER ----- */ + + /* --- HIGH LEVEL API --- */ + /** * @brief solve the Ax=b for x where x and b are all array */ @@ -624,6 +634,225 @@ struct SparseMatrix } } + /* --- LOW LEVEL API --- */ + + void spmat_chol_analysis() + { + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); + m_internalDataInBytes = 0; + m_workspaceInBytes = 0; + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_row_ptr, + m_d_col_idx, + m_chol_info)); + } + + void spmat_chol_buffer_alloc() + { + if constexpr 
(std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes)); + } + + void spmat_chol_buffer_free() + { + CUDA_ERROR(cudaFree(m_chol_buffer)); + } + + void spmat_chol_factor() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + m_chol_info, + m_chol_buffer)); + } + + double tol = 1.0e-8; + int singularity; + + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + void spmat_chol_solve(T* d_b, T* d_x) + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_b, + d_x, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_b, + d_x, + m_chol_info, + m_chol_buffer)); + } + } + + /* Host compatibility */ + + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync(m_d_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, + m_h_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync(m_h_val, + m_d_val, + m_nnz * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, + m_d_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, + m_d_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + } + } + + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_h_row_ptr = nullptr; + m_h_col_idx = 
nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + GPU_FREE(m_h_row_ptr); + GPU_FREE(m_h_col_idx); + m_allocated = m_allocated & (~DEVICE); + } + } + + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = static_cast(malloc(m_row_size * sizeof(IndexT))); + m_d_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + CUDA_ERROR( + cudaMalloc((void**)&m_h_row_ptr, m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | DEVICE; + } + } + + private: const Context m_context; cusparseHandle_t m_cusparse_handle; @@ -640,6 +869,18 @@ struct SparseMatrix IndexT m_row_size; IndexT m_col_size; IndexT m_nnz; + + // host data parameters + locationT m_allocated; + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // lower level API parameters + csrcholInfo_t m_chol_info; + size_t m_internalDataInBytes; + size_t m_workspaceInBytes; + void* m_chol_buffer; }; } // namespace rxmesh \ No newline at end of file From d7b9c33d6392a0ded0520ed94d768a4f905bf4aa Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Mar 2023 17:08:05 -0700 Subject: [PATCH 29/44] host-device compatibility --- include/rxmesh/matrix/sparse_matrix.cuh | 27 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index b11f9f18..0877822b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -860,27 +860,34 @@ struct SparseMatrix cusparseSpMatDescr_t m_spdescr; cusparseMatDescr_t m_descr; - size_t m_spmm_buffer_size; - size_t m_spmv_buffer_size; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; - // host data parameters - locationT m_allocated; - IndexT* m_h_row_ptr; - IndexT* m_h_col_idx; - T* m_h_val; + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; // lower level API parameters csrcholInfo_t m_chol_info; size_t m_internalDataInBytes; size_t m_workspaceInBytes; void* m_chol_buffer; + + // purmutation + IndexT* m_h_permute; + IndexT* m_d_permute; + + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file From daac342461932a8c68b0d2ed1259a60fe633bac1 Mon Sep 17 00:00:00 2001 From: ericycc Date: Tue, 21 Mar 2023 17:08:05 -0700 Subject: [PATCH 30/44] host-device compatibility --- include/rxmesh/matrix/sparse_matrix.cuh | 34 +++++++++++++++++-------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index b11f9f18..8f05fbd7 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -860,27 +860,41 @@ struct SparseMatrix cusparseSpMatDescr_t m_spdescr; cusparseMatDescr_t m_descr; - size_t m_spmm_buffer_size; - size_t m_spmv_buffer_size; + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; IndexT* m_d_row_ptr; IndexT* m_d_col_idx; 
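
Taken together, the low-level Cholesky members above are meant to be called in a fixed order, with one triangular solve per right-hand-side column (a sketch using the DenseMatrix col_data() accessor, which returns device pointers by default):

    A_mat.spmat_chol_analysis();      // symbolic analysis of the CSR pattern
    A_mat.spmat_chol_buffer_alloc();  // query workspace size and allocate it
    A_mat.spmat_chol_factor();        // numeric factorization + zero-pivot check
    for (int i = 0; i < 3; ++i)       // B_mat/X_mat hold the x, y, z columns
        A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
    A_mat.spmat_chol_buffer_free();

This is the payoff of the low-level path over spmat_linear_solve(): the analysis and factorization happen once and are amortized over the three solves.
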
T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; - // host data parameters - locationT m_allocated; - IndexT* m_h_row_ptr; - IndexT* m_h_col_idx; - T* m_h_val; + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; // lower level API parameters csrcholInfo_t m_chol_info; size_t m_internalDataInBytes; size_t m_workspaceInBytes; void* m_chol_buffer; + + // purmutation array + IndexT* m_h_permute; + IndexT* m_d_permute; + + // permuted CSR matrix + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + IndexT* m_d_p_row_ptr; + IndexT* m_d_p_col_idx; + T* m_d_p_val; + + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file From 882b40693150995e344577c82ddedb5b9ac31d4b Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Mar 2023 12:06:08 -0700 Subject: [PATCH 31/44] add reordring and debugging --- include/rxmesh/matrix/sparse_matrix.cuh | 260 ++++++++++++++++++++--- tests/RXMesh_test/test_sparse_matrix.cuh | 81 ++++++- 2 files changed, 301 insertions(+), 40 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 8f05fbd7..41b31f2b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -122,7 +122,8 @@ struct SparseMatrix m_descr(NULL), m_spdescr(NULL), m_spmm_buffer_size(0), - m_spmv_buffer_size(0) + m_spmv_buffer_size(0), + m_use_reorder(false) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -636,8 +637,151 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ + void spmat_chol_reorder(rxmesh::Reorder reorder) + { + if (reorder == Reorder::NONE) { + RXMESH_INFO("None reordering is specified", + "Continue without reordering"); + m_use_reorder = false; + return; + } + + m_use_reorder = true; + + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + + if (reorder == Reorder::SYMRCM) { + CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::SYMAMD) { + CUSOLVER_ERROR(cusolverSpXcsrsymamdHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::NSTDIS) { + CUSOLVER_ERROR(cusolverSpXcsrmetisndHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + NULL, + m_h_permute)); + } + + // working space for permutation: B = A*Q*A^T + size_t size_perm = 0; + void* perm_buffer_cpu = NULL; + + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + &size_perm)); + + perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + assert(NULL != perm_buffer_cpu); + + IndexT* h_mapBfromQ = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_mapBfromQ = CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + // do the permutation which works only on the col and row indices + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + h_mapBfromQ, + perm_buffer_cpu)); + + // allocate the purmutated csr and copy 
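// ----------------------------------------------------------------------
// All three reordering backends used above (SYMRCM, SYMAMD, METIS nested
// dissection) share one host-side shape: they read a host CSR *pattern*
// and return a permutation p where, loosely, row i of the reordered
// matrix is row p[i] of the original. A standalone SYMRCM example on a
// small symmetric pattern (error checking omitted for brevity):
#include <cusolverSp.h>
#include <cusparse.h>
#include <cstdio>

int main()
{
    const int n = 4, nnz = 8;
    int row_ptr[n + 1] = {0, 2, 4, 6, 8};
    int col_idx[nnz]   = {0, 3, 1, 2, 1, 2, 0, 3};

    cusolverSpHandle_t handle;
    cusolverSpCreate(&handle);
    cusparseMatDescr_t descr;
    cusparseCreateMatDescr(&descr);
    cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO);

    int p[n];
    cusolverSpXcsrsymrcmHost(handle, n, nnz, descr, row_ptr, col_idx, p);

    for (int i = 0; i < n; ++i)
        printf("reordered row %d <- original row %d\n", i, p[i]);

    cusparseDestroyMatDescr(descr);
    cusolverSpDestroy(handle);
    return 0;
}
// ----------------------------------------------------------------------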
from the host + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, + m_h_row_ptr, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // do the permutation for val indices + cusparseDnVecDescr_t val_values; + cusparseSpVecDescr_t val_permutation; + + CUDA_ERROR(cudaMemcpyAsync(d_mapBfromQ, + h_mapBfromQ, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + CUSPARSE_ERROR(cusparseCreateSpVec(&val_permutation, + m_nnz, + m_nnz, + d_mapBfromQ, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &val_values, m_nnz, m_d_solver_val, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, val_permutation, val_values)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permutation)); + + free(perm_buffer_cpu); + free(h_mapBfromQ); + CUDA_ERROR(cudaFree(d_mapBfromQ)); + + // restore the host data back to the original + if (on_host) { + move(DEVICE, HOST); + } else { + release(HOST); + } + } + void spmat_chol_analysis() { + if (!m_use_reorder) { + m_d_solver_row_ptr = m_d_row_ptr; + m_d_solver_col_idx = m_d_col_idx; + m_d_solver_val = m_d_val; + } + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); m_internalDataInBytes = 0; m_workspaceInBytes = 0; @@ -645,8 +789,8 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info)); } @@ -657,9 +801,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, &m_internalDataInBytes, &m_workspaceInBytes)); @@ -670,9 +814,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, &m_internalDataInBytes, &m_workspaceInBytes)); @@ -693,9 +837,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, m_chol_buffer)); } @@ -704,9 +848,9 @@ struct SparseMatrix m_row_size, m_nnz, m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, m_chol_info, m_chol_buffer)); } @@ -726,25 +870,75 @@ struct SparseMatrix void spmat_chol_solve(T* d_b, T* d_x) { + + T* d_solver_b; + T* d_solver_x; + + if (m_use_reorder) { + /* purmute b and x*/ + CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + + cusparseDnVecDescr_t b_values; + cusparseSpVecDescr_t b_permutation; + + CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, + m_row_size, + m_row_size, + m_d_permute, + d_b, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&b_values, m_nnz, d_solver_b, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, b_permutation, b_values)); + 
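// ----------------------------------------------------------------------
// What the csrperm pair above computes, in isolation:
// cusolverSpXcsrpermHost permutes only the sparsity pattern, in place;
// the values travel separately through `map`, which the caller seeds with
// the identity (the cuSOLVER low-level contract) and which comes back
// holding the mapping valB[k] = valA[map[k]]. Sketch assuming an existing
// handle/descr and host CSR arrays h_row_ptr/h_col_idx/h_val of an
// n x n matrix, with permutation h_p:
size_t perm_buf_size = 0;
cusolverSpXcsrperm_bufferSizeHost(handle, n, n, nnz, descr,
                                  h_row_ptr, h_col_idx, h_p, h_p,
                                  &perm_buf_size);
void* perm_buf = malloc(perm_buf_size);

int* map = (int*)malloc(nnz * sizeof(int));
for (int k = 0; k < nnz; ++k)
    map[k] = k;  // identity; overwritten with the value mapping

cusolverSpXcsrpermHost(handle, n, n, nnz, descr,
                       h_row_ptr, h_col_idx,  // overwritten: pattern of B
                       h_p, h_p, map, perm_buf);

float* valB = (float*)malloc(nnz * sizeof(float));
for (int k = 0; k < nnz; ++k)
    valB[k] = h_val[map[k]];  // apply the recorded mapping to the values

free(perm_buf);
free(map);

// Dense reference for the same operation, B = A(p, p), i.e.
// B(i, j) = A(p[i], p[j]); handy for verifying the CSR path in a test:
void sym_permute_dense(const float* A, const int* p, float* B, int n)
{
    for (int i = 0; i < n; ++i)
        for (int j = 0; j < n; ++j)
            B[i * n + j] = A[p[i] * n + p[j]];
}
// ----------------------------------------------------------------------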
CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); + + cusparseDnVecDescr_t x_values; + cusparseSpVecDescr_t x_permutation; + + CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, + m_col_size, + m_col_size, + m_d_permute, + d_x, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&x_values, m_nnz, d_solver_x, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseScatter(m_cusparse_handle, x_permutation, x_values)); + CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); + CUSPARSE_ERROR(cusparseDestroySpVec(x_permutation)); + } else { + d_solver_b = d_b; + d_solver_x = d_x; + } + if constexpr (std::is_same_v) { CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, m_row_size, - d_b, - d_x, + d_solver_b, + d_solver_x, m_chol_info, m_chol_buffer)); } + if constexpr (std::is_same_v) { CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, m_row_size, - d_b, - d_x, + d_solver_b, + d_solver_x, m_chol_info, m_chol_buffer)); } } - /* Host compatibility */ + /* Host data compatibility */ void move(locationT source, locationT target, cudaStream_t stream = NULL) { @@ -812,6 +1006,8 @@ struct SparseMatrix { if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { free(m_h_val); + free(m_h_row_ptr); + free(m_h_col_idx); m_h_val = nullptr; m_h_row_ptr = nullptr; m_h_col_idx = nullptr; @@ -832,9 +1028,10 @@ struct SparseMatrix if ((location & HOST) == HOST) { release(HOST); - m_h_val = static_cast(malloc(m_nnz * sizeof(T))); - m_h_row_ptr = static_cast(malloc(m_row_size * sizeof(IndexT))); - m_d_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = + static_cast(malloc(m_row_size * sizeof(IndexT))); + m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); m_allocated = m_allocated | HOST; } @@ -844,7 +1041,7 @@ struct SparseMatrix CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); CUDA_ERROR( - cudaMalloc((void**)&m_h_row_ptr, m_row_size * sizeof(IndexT))); + cudaMalloc((void**)&m_d_row_ptr, m_row_size * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); @@ -864,6 +1061,7 @@ struct SparseMatrix IndexT m_col_size; IndexT m_nnz; + // device csr data IndexT* m_d_row_ptr; IndexT* m_d_col_idx; T* m_d_val; @@ -887,13 +1085,15 @@ struct SparseMatrix IndexT* m_h_permute; IndexT* m_d_permute; - // permuted CSR matrix - // equal to the original matrix if not permutated - // only allocated as a new CSR matrix if permutated - IndexT* m_d_p_row_ptr; - IndexT* m_d_p_col_idx; - T* m_d_p_val; + // CSR matrix for solving only + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + IndexT* m_d_solver_row_ptr; + IndexT* m_d_solver_col_idx; + T* m_d_solver_val; + // flags + bool m_use_reorder; locationT m_allocated; }; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 169fc498..09ff49aa 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -41,8 +41,7 @@ __global__ static void sparse_mat_edge_len_test( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -102,8 +101,7 @@ __global__ static void simple_A_X_B_setup(const 
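// ----------------------------------------------------------------------
// Index form of the permuted solve that spmat_chol_solve implements: with
// the factorization done on B = Q*A*Q^T, solving A x = b takes three
// steps (p is the permutation used to build B):
//
//   1) gather the right-hand side :  b_perm[i] = b[p[i]]
//   2) solve                      :  B y = b_perm
//   3) scatter the solution back  :  x[p[i]]  = y[i]
//
// Host reference for steps 1 and 3 (the device code above does the same
// with permutation descriptors, later patches switch it to thrust):
for (int i = 0; i < n; ++i) b_perm[i] = b[p[i]];   // step 1 (gather)
/* ... step 2: csrchol solve on the permuted system ... */
for (int i = 0; i < n; ++i) x[p[i]] = y[i];        // step 3 (scatter)
// ----------------------------------------------------------------------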
rxmesh::Context context, uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; B_mat(row_index, 0) = iter.size() * 7.4f; B_mat(row_index, 1) = iter.size() * 2.6f; @@ -322,14 +320,77 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) std::vector h_ret_mat(num_vertices); CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free(); +} + +TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) +{ + using namespace rxmesh; + + // Select device + cuda_query(0); + + // generate rxmesh obj + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); + + uint32_t num_vertices = rxmesh.get_num_vertices(); + + const uint32_t threads = 256; + const uint32_t blocks = DIVIDE_UP(num_vertices, threads); + + auto coords = rxmesh.get_input_vertex_coordinates(); + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + DenseMatrix ret_mat(num_vertices, 3); + + float time_step = 1.f; + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)simple_A_X_B_setup); + + simple_A_X_B_setup<<>>( + rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); std::vector h_B_mat(num_vertices); CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - B_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); for (uint32_t i = 0; i < num_vertices; ++i) { for (uint32_t j = 0; j < 3; ++j) { From 41d47f108ec9f4ffd7c7ae07e769e0917984f179 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 27 Mar 2023 21:19:18 -0700 Subject: [PATCH 32/44] lower api work, reorder debug --- include/rxmesh/matrix/sparse_matrix.cuh | 102 +++++++++++++++++++---- tests/RXMesh_test/test_sparse_matrix.cuh | 11 ++- 2 files changed, 95 insertions(+), 18 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 41b31f2b..409c8de7 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -306,7 +306,7 @@ struct SparseMatrix return m_d_val[idx]; } - void free() + void free_mat() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); @@ -637,17 +637,90 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ + void spmat_chol_test() + { + // cholesky csr info + csrcholInfo_t chol_info; + 
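// ----------------------------------------------------------------------
// How the pieces are meant to be driven, mirroring the test above: one
// analysis, one buffer allocation and one factorization, then one solve
// per right-hand-side column. Sketch; A_mat/B_mat/X_mat as in the test:
A_mat.spmat_chol_analysis();
A_mat.spmat_chol_buffer_alloc();
A_mat.spmat_chol_factor();
for (int i = 0; i < B_mat.m_col_size; ++i)
    A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
A_mat.spmat_chol_buffer_free();
// ----------------------------------------------------------------------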
CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&chol_info)); + + // cholesky analysis + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_row_ptr, + m_d_col_idx, + chol_info)); + + size_t internalDataInBytes = 0; + size_t workspaceInBytes = 0; + + // allocate cholesky buffer + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + chol_info, + &internalDataInBytes, + &workspaceInBytes)); + + void* chol_buffer; + CUDA_ERROR(cudaMalloc((void**)&chol_buffer, workspaceInBytes)); + + + // cholesky factor + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_val, + m_d_row_ptr, + m_d_col_idx, + chol_info, + chol_buffer)); + + double tol = 1.0e-8; + int singularity; + + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, chol_info, tol, &singularity)); + if (0 <= singularity) { + printf("WARNING: the matrix is singular at row %d under tol (%E)\n", + singularity, + tol); + } + } + void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { RXMESH_INFO("None reordering is specified", "Continue without reordering"); m_use_reorder = false; + + if (m_reorder_allocated) { + GPU_FREE(m_d_val); + GPU_FREE(m_h_row_ptr); + GPU_FREE(m_h_col_idx); + m_reorder_allocated = false; + } + return; } m_use_reorder = true; + // allocate the purmutated csr + m_reorder_allocated = true; + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + m_row_size * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + /*check on host*/ bool on_host = true; if ((HOST & m_allocated) != HOST) { @@ -702,8 +775,8 @@ struct SparseMatrix IndexT* h_mapBfromQ = static_cast(malloc(m_nnz * sizeof(IndexT))); - IndexT* d_mapBfromQ = CUDA_ERROR( - cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + IndexT* d_mapBfromQ; + CUDA_ERROR(cudaMalloc((void**)&d_mapBfromQ, m_nnz * sizeof(IndexT))); // do the permutation which works only on the col and row indices CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, @@ -718,13 +791,7 @@ struct SparseMatrix h_mapBfromQ, perm_buffer_cpu)); - // allocate the purmutated csr and copy from the host - CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); - CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, - m_row_size * sizeof(IndexT))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); - + // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, m_nnz * sizeof(T), @@ -858,8 +925,14 @@ struct SparseMatrix double tol = 1.0e-8; int singularity; - CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( - m_cusolver_sphandle, m_chol_info, tol, &singularity)); + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } if (0 <= singularity) { RXMESH_WARN( "WARNING: the matrix is singular at row {} under tol ({})", @@ -1017,8 +1090,8 @@ struct SparseMatrix if (((location & DEVICE) == DEVICE) && ((m_allocated & DEVICE) == DEVICE)) { GPU_FREE(m_d_val); - GPU_FREE(m_h_row_ptr); - GPU_FREE(m_h_col_idx); + GPU_FREE(m_d_row_ptr); + GPU_FREE(m_d_col_idx); m_allocated = m_allocated & 
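// ----------------------------------------------------------------------
// The float/double branches above all follow one pattern: cuSOLVER ships
// a separate entry point per precision, and `if constexpr` picks one at
// compile time with no runtime cost. Reduced to its skeleton (a sketch;
// the helper name is hypothetical):
#include <type_traits>

template <typename T>
void chol_zero_pivot(cusolverSpHandle_t h,
                     csrcholInfo_t      info,
                     double             tol,
                     int*               position)
{
    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
                  "only float/double are supported");
    if constexpr (std::is_same_v<T, float>)
        cusolverSpScsrcholZeroPivot(h, info, (float)tol, position);
    else
        cusolverSpDcsrcholZeroPivot(h, info, tol, position);
}
// ----------------------------------------------------------------------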
(~DEVICE); } } @@ -1088,6 +1161,7 @@ struct SparseMatrix // CSR matrix for solving only // equal to the original matrix if not permutated // only allocated as a new CSR matrix if permutated + bool m_reorder_allocated; IndexT* m_d_solver_row_ptr; IndexT* m_d_solver_col_idx; T* m_d_solver_val; diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 09ff49aa..e202d36d 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -196,7 +196,7 @@ TEST(RXMeshStatic, SparseMatrix) CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); CUDA_ERROR(cudaFree(vet_degree)); - spmat.free(); + spmat.free_mat(); } /* First replace the sparse matrix entry with the edge length and then do spmv @@ -267,7 +267,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) CUDA_ERROR(cudaFree(d_arr_ref)); CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); - spmat.free(); + spmat.free_mat(); } /* set up a simple AX=B system where A is a sparse matrix, B and C are dense @@ -334,7 +334,7 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } - A_mat.free(); + A_mat.free_mat(); } TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) @@ -371,6 +371,9 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + // A_mat.spmat_chol_test(); + A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); A_mat.spmat_chol_factor(); @@ -397,5 +400,5 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); } } - A_mat.free(); + A_mat.free_mat(); } \ No newline at end of file From 9bb2ac8ef24c248019923ecf9399c243c689b989 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Tue, 28 Mar 2023 05:16:30 -0700 Subject: [PATCH 33/44] sync lab PC --- include/rxmesh/matrix/sparse_matrix.cuh | 74 ++++++++++++++----------- 1 file changed, 42 insertions(+), 32 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 409c8de7..42d3a157 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -123,7 +123,8 @@ struct SparseMatrix m_spdescr(NULL), m_spmm_buffer_size(0), m_spmv_buffer_size(0), - m_use_reorder(false) + m_use_reorder(false), + m_allocated(LOCATION_NONE) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -711,6 +712,13 @@ struct SparseMatrix return; } + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + m_use_reorder = true; // allocate the purmutated csr @@ -721,12 +729,9 @@ struct SparseMatrix CUDA_ERROR( cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); - /*check on host*/ - bool on_host = true; - if ((HOST & m_allocated) != HOST) { - move(DEVICE, HOST); - on_host = false; - } + m_h_permute = (IndexT*)malloc(m_row_size * sizeof(IndexT)); + CUDA_ERROR( + cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, @@ -755,7 +760,19 @@ struct SparseMatrix m_h_permute)); } + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, + m_h_permute, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + // working space for permutation: B = A*Q*A^T + // 
the permutation for matrix A which works only for the col and row + // indices, the val will be done on device with the d/h_val_permute + IndexT* h_val_permute = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_val_permute; + CUDA_ERROR(cudaMalloc((void**)&d_val_permute, m_nnz * sizeof(IndexT))); + size_t size_perm = 0; void* perm_buffer_cpu = NULL; @@ -771,14 +788,7 @@ struct SparseMatrix &size_perm)); perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - assert(NULL != perm_buffer_cpu); - IndexT* h_mapBfromQ = - static_cast(malloc(m_nnz * sizeof(IndexT))); - IndexT* d_mapBfromQ; - CUDA_ERROR(cudaMalloc((void**)&d_mapBfromQ, m_nnz * sizeof(IndexT))); - - // do the permutation which works only on the col and row indices CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -788,7 +798,7 @@ struct SparseMatrix m_h_col_idx, m_h_permute, m_h_permute, - h_mapBfromQ, + h_val_permute, perm_buffer_cpu)); // copy the purmutated csr from the host @@ -805,33 +815,33 @@ struct SparseMatrix m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - // do the permutation for val indices - cusparseDnVecDescr_t val_values; - cusparseSpVecDescr_t val_permutation; + // do the permutation for val indices on device + cusparseDnVecDescr_t val_final_desc; + cusparseSpVecDescr_t val_permute_desc; - CUDA_ERROR(cudaMemcpyAsync(d_mapBfromQ, - h_mapBfromQ, + CUDA_ERROR(cudaMemcpyAsync(d_val_permute, + h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - CUSPARSE_ERROR(cusparseCreateSpVec(&val_permutation, + CUSPARSE_ERROR(cusparseCreateSpVec(&val_permute_desc, m_nnz, m_nnz, - d_mapBfromQ, + d_val_permute, m_d_val, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR(cusparseCreateDnVec( - &val_values, m_nnz, m_d_solver_val, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, val_permutation, val_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(val_permutation)); + &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseScatter( + m_cusparse_handle, val_permute_desc, val_final_desc)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); free(perm_buffer_cpu); - free(h_mapBfromQ); - CUDA_ERROR(cudaFree(d_mapBfromQ)); + free(h_val_permute); + // CUDA_ERROR(cudaFree(d_val_permute)); // restore the host data back to the original if (on_host) { @@ -1017,7 +1027,7 @@ struct SparseMatrix { if (source == target) { RXMESH_WARN( - "DenseMatrix::move() source ({}) and target ({}) " + "SparseMatrix::move() source ({}) and target ({}) " "are the same.", location_to_string(source), location_to_string(target)); @@ -1027,7 +1037,7 @@ struct SparseMatrix if ((source == HOST || source == DEVICE) && ((source & m_allocated) != source)) { RXMESH_ERROR( - "DenseMatrix::move() moving source is not valid" + "SparseMatrix::move() moving source is not valid" " because it was not allocated on source i.e., {}", location_to_string(source)); } @@ -1035,7 +1045,7 @@ struct SparseMatrix if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && ((target & m_allocated) != target)) { RXMESH_WARN( - "DenseMatrix::move() allocating target before moving to {}", + "SparseMatrix::move() allocating target before moving to {}", location_to_string(target)); allocate(target); } From 69207d29c84c3edcd181caade9f00a40dcf11649 Mon Sep 17 00:00:00 2001 From: ericycc Date: Wed, 29 Mar 2023 12:09:44 -0700 Subject: [PATCH 34/44] test 
reordering error --- include/rxmesh/matrix/dense_matrix.cuh | 2 + include/rxmesh/matrix/sparse_matrix.cuh | 184 +++++++++++++++-------- tests/RXMesh_test/test_sparse_matrix.cuh | 58 +++---- 3 files changed, 156 insertions(+), 88 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index f264d426..19c77f7f 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -20,6 +20,7 @@ struct DenseMatrix m_allocated(LOCATION_NONE) { CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + m_allocated = m_allocated | DEVICE; CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, m_row_size, @@ -28,6 +29,7 @@ struct DenseMatrix m_d_val, CUDA_R_32F, CUSPARSE_ORDER_COL)); + } void set_ones() diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 42d3a157..c137ae6b 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -191,7 +191,7 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function - CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); CUSPARSE_ERROR( @@ -213,6 +213,8 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + m_allocated = m_allocated | DEVICE; } void set_ones() @@ -638,61 +640,83 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test() + void spmat_chol_test_reorder() { - // cholesky csr info - csrcholInfo_t chol_info; - CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&chol_info)); - - // cholesky analysis - CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_row_ptr, - m_d_col_idx, - chol_info)); - - size_t internalDataInBytes = 0; - size_t workspaceInBytes = 0; - - // allocate cholesky buffer - CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, - chol_info, - &internalDataInBytes, - &workspaceInBytes)); - - void* chol_buffer; - CUDA_ERROR(cudaMalloc((void**)&chol_buffer, workspaceInBytes)); - - - // cholesky factor - CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, - m_row_size, - m_nnz, - m_descr, - m_d_val, - m_d_row_ptr, - m_d_col_idx, - chol_info, - chol_buffer)); - - double tol = 1.0e-8; - int singularity; + } - CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( - m_cusolver_sphandle, chol_info, tol, &singularity)); - if (0 <= singularity) { - printf("WARNING: the matrix is singular at row %d under tol (%E)\n", - singularity, - tol); + void spmat_chol_test_purmute() + { + // Host problem definition + int size = 8; + int nnz = 4; + int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; + float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; + float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; + //-------------------------------------------------------------------------- + // Device memory management + int* dX_indices; + float *dY, *dX_values; + CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); + CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); + CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); + + CUDA_ERROR(cudaMemcpy(dX_indices, + hX_indices, + size * sizeof(int), + 
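// ----------------------------------------------------------------------
// Semantics of the cusparseScatter call this example builds up to: for a
// sparse vector X (indices + values) and a dense vector Y it performs
//     Y[X_index[k]] = X_value[k]   for every stored entry k,
// which is exactly what the expected result in this example encodes:
//     Y[0]=5, Y[3]=6, Y[4]=7, Y[7]=8, Y[1]=1, Y[2]=2, Y[5]=3, Y[6]=4.
// Host reference loop over the example's arrays:
for (int k = 0; k < size; ++k)
    hY_ref[hX_indices[k]] = hX_values[k];
// ----------------------------------------------------------------------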
cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpy(dX_values, + hX_values, + size * sizeof(float), + cudaMemcpyHostToDevice)); + CUDA_ERROR( + cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); + //-------------------------------------------------------------------------- + // CUSPARSE APIs + cusparseHandle_t handle = NULL; + cusparseSpVecDescr_t vecX; + cusparseDnVecDescr_t vecY; + CUSPARSE_ERROR(cusparseCreate(&handle)); + // Create sparse vector X + CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, + size, + size, + dX_indices, + dX_values, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + // Create dense vector y + CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); + + // execute Scatter + CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); + + // destroy matrix/vector descriptors + CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); + CUSPARSE_ERROR(cusparseDestroy(handle)); + //-------------------------------------------------------------------------- + // device result check + CUDA_ERROR( + cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); + int correct = 1; + for (int i = 0; i < size; i++) { + RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); + if (hY[i] != hY_result[i]) { + correct = 0; + break; + } } + if (correct) + printf("scatter_example test PASSED\n"); + else + printf("scatter_example test FAILED: wrong result\n"); + //-------------------------------------------------------------------------- + // device memory deallocation + CUDA_ERROR(cudaFree(dX_indices)); + CUDA_ERROR(cudaFree(dX_values)); + CUDA_ERROR(cudaFree(dY)); } void spmat_chol_reorder(rxmesh::Reorder reorder) @@ -703,9 +727,9 @@ struct SparseMatrix m_use_reorder = false; if (m_reorder_allocated) { - GPU_FREE(m_d_val); - GPU_FREE(m_h_row_ptr); - GPU_FREE(m_h_col_idx); + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); m_reorder_allocated = false; } @@ -733,6 +757,27 @@ struct SparseMatrix CUDA_ERROR( cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + RXMESH_INFO("nnz {} - size_row {} - size_col {}", + m_nnz, + m_row_size, + m_col_size); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_row_ptr[0], + m_h_row_ptr[1], + m_h_row_ptr[2], + m_h_row_ptr[3], + m_h_row_ptr[4]); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_col_idx[0], + m_h_col_idx[1], + m_h_col_idx[2], + m_h_col_idx[3], + m_h_col_idx[4]); + if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, m_row_size, @@ -760,6 +805,13 @@ struct SparseMatrix m_h_permute)); } + RXMESH_INFO("print: {}, {}, {}, {}, {}", + m_h_permute[0], + m_h_permute[1], + m_h_permute[2], + m_h_permute[3], + m_h_permute[4]); + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, m_row_size * sizeof(IndexT), @@ -789,6 +841,11 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + RXMESH_INFO("size_perm {} - size_row {} - size_col {}", + size_perm, + m_row_size, + m_col_size); + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -801,6 +858,14 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); + + RXMESH_INFO("print: {}, {}, {}, {}, {}", + h_val_permute[0], + h_val_permute[1], + h_val_permute[2], + h_val_permute[3], + h_val_permute[4]); + // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, @@ -816,8 +881,8 @@ 
struct SparseMatrix cudaMemcpyHostToDevice)); // do the permutation for val indices on device - cusparseDnVecDescr_t val_final_desc; cusparseSpVecDescr_t val_permute_desc; + cusparseDnVecDescr_t val_final_desc; CUDA_ERROR(cudaMemcpyAsync(d_val_permute, h_val_permute, @@ -836,12 +901,13 @@ struct SparseMatrix &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); CUSPARSE_ERROR(cusparseScatter( m_cusparse_handle, val_permute_desc, val_final_desc)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); + CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); + CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); free(perm_buffer_cpu); free(h_val_permute); - // CUDA_ERROR(cudaFree(d_val_permute)); + CUDA_ERROR(cudaFree(d_val_permute)); // restore the host data back to the original if (on_host) { diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index e202d36d..6d7db9e8 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -372,33 +372,33 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - // A_mat.spmat_chol_test(); - A_mat.spmat_chol_reorder(Reorder::NSTDIS); - A_mat.spmat_chol_analysis(); - A_mat.spmat_chol_buffer_alloc(); - A_mat.spmat_chol_factor(); - - for (int i = 0; i < B_mat.m_col_size; ++i) { - A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); - } - - A_mat.denmat_mul(X_mat, ret_mat); - - std::vector h_ret_mat(num_vertices); - CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); - std::vector h_B_mat(num_vertices); - CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - B_mat.data(), - num_vertices * 3 * sizeof(float), - cudaMemcpyDeviceToHost)); - - for (uint32_t i = 0; i < num_vertices; ++i) { - for (uint32_t j = 0; j < 3; ++j) { - EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); - } - } - A_mat.free_mat(); + A_mat.spmat_chol_test_purmute(); + A_mat.spmat_chol_reorder(Reorder::SYMRCM); + // A_mat.spmat_chol_analysis(); + // A_mat.spmat_chol_buffer_alloc(); + // A_mat.spmat_chol_factor(); + + // for (int i = 0; i < B_mat.m_col_size; ++i) { + // A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + // } + + // A_mat.denmat_mul(X_mat, ret_mat); + + // std::vector h_ret_mat(num_vertices); + // CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + // ret_mat.data(), + // num_vertices * 3 * sizeof(float), + // cudaMemcpyDeviceToHost)); + // std::vector h_B_mat(num_vertices); + // CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + // B_mat.data(), + // num_vertices * 3 * sizeof(float), + // cudaMemcpyDeviceToHost)); + + // for (uint32_t i = 0; i < num_vertices; ++i) { + // for (uint32_t j = 0; j < 3; ++j) { + // EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + // } + // } + // A_mat.free_mat(); } \ No newline at end of file From 41fb9d7b7257a26447cf9064714c20c0978e2201 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Wed, 29 Mar 2023 21:38:25 -0700 Subject: [PATCH 35/44] val_permute_bug --- include/rxmesh/matrix/sparse_matrix.cuh | 52 +++++++++++++-------- tests/RXMesh_test/test_sparse_matrix.cuh | 58 ++++++++++++------------ 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index c137ae6b..c424d1a3 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -640,7 +640,7 @@ struct 
SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_reorder() + void spmat_chol_test_val_reorder() { } @@ -749,7 +749,7 @@ struct SparseMatrix m_reorder_allocated = true; CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, - m_row_size * sizeof(IndexT))); + (m_row_size + 1) * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); @@ -764,14 +764,14 @@ struct SparseMatrix m_row_size, m_col_size); - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", m_h_row_ptr[0], m_h_row_ptr[1], m_h_row_ptr[2], m_h_row_ptr[3], m_h_row_ptr[4]); - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", m_h_col_idx[0], m_h_col_idx[1], m_h_col_idx[2], @@ -805,7 +805,7 @@ struct SparseMatrix m_h_permute)); } - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", m_h_permute[0], m_h_permute[1], m_h_permute[2], @@ -828,6 +828,20 @@ struct SparseMatrix size_t size_perm = 0; void* perm_buffer_cpu = NULL; + RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + m_h_row_ptr[0], + m_h_row_ptr[1], + m_h_row_ptr[2], + m_h_row_ptr[3], + m_h_row_ptr[4]); + + RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + m_h_col_idx[0], + m_h_col_idx[1], + m_h_col_idx[2], + m_h_col_idx[3], + m_h_col_idx[4]); + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -841,10 +855,11 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - RXMESH_INFO("size_perm {} - size_row {} - size_col {}", - size_perm, - m_row_size, - m_col_size); + RXMESH_INFO("size_perm {}", size_perm); + + for (int j = 0; j < m_nnz; j++) { + h_val_permute[j] = j; + } CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, m_row_size, @@ -858,8 +873,7 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - - RXMESH_INFO("print: {}, {}, {}, {}, {}", + RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", h_val_permute[0], h_val_permute[1], h_val_permute[2], @@ -873,7 +887,7 @@ struct SparseMatrix cudaMemcpyHostToDevice)); CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, m_h_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyHostToDevice)); CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, m_h_col_idx, @@ -1040,7 +1054,7 @@ struct SparseMatrix CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR( - cusparseCreateDnVec(&b_values, m_nnz, d_solver_b, CUDA_R_32F)); + cusparseCreateDnVec(&b_values, m_row_size, d_solver_b, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, b_permutation, b_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); @@ -1058,7 +1072,7 @@ struct SparseMatrix CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ERROR( - cusparseCreateDnVec(&x_values, m_nnz, d_solver_x, CUDA_R_32F)); + cusparseCreateDnVec(&x_values, m_col_size, d_solver_x, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, x_permutation, x_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); @@ -1124,7 +1138,7 @@ struct SparseMatrix stream)); CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, m_h_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyHostToDevice, stream)); CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, @@ -1140,7 +1154,7 @@ struct SparseMatrix stream)); CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, m_d_row_ptr, - m_row_size * sizeof(IndexT), + (m_row_size + 1) * sizeof(IndexT), cudaMemcpyDeviceToHost, 
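// ----------------------------------------------------------------------
// The (m_row_size + 1) fixes above all follow from the CSR invariant:
// row_ptr has one entry per row plus a final sentinel equal to nnz, so
// that row i occupies the half-open range [row_ptr[i], row_ptr[i+1]).
// For a 3x3 matrix whose 4 nonzeros sit in rows {0, 0, 2, 2}:
int row_ptr[3 + 1] = {0, 2, 2, 4};  // 4 entries for 3 rows; last == nnz
// ----------------------------------------------------------------------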
stream)); CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, @@ -1179,7 +1193,7 @@ struct SparseMatrix m_h_val = static_cast(malloc(m_nnz * sizeof(T))); m_h_row_ptr = - static_cast(malloc(m_row_size * sizeof(IndexT))); + static_cast(malloc((m_row_size + 1) * sizeof(IndexT))); m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); m_allocated = m_allocated | HOST; @@ -1189,8 +1203,8 @@ struct SparseMatrix release(DEVICE); CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); - CUDA_ERROR( - cudaMalloc((void**)&m_d_row_ptr, m_row_size * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); CUDA_ERROR( cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 6d7db9e8..5df3b8e2 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -372,33 +372,33 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - A_mat.spmat_chol_test_purmute(); - A_mat.spmat_chol_reorder(Reorder::SYMRCM); - // A_mat.spmat_chol_analysis(); - // A_mat.spmat_chol_buffer_alloc(); - // A_mat.spmat_chol_factor(); - - // for (int i = 0; i < B_mat.m_col_size; ++i) { - // A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); - // } - - // A_mat.denmat_mul(X_mat, ret_mat); - - // std::vector h_ret_mat(num_vertices); - // CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), - // ret_mat.data(), - // num_vertices * 3 * sizeof(float), - // cudaMemcpyDeviceToHost)); - // std::vector h_B_mat(num_vertices); - // CUDA_ERROR(cudaMemcpy(h_B_mat.data(), - // B_mat.data(), - // num_vertices * 3 * sizeof(float), - // cudaMemcpyDeviceToHost)); - - // for (uint32_t i = 0; i < num_vertices; ++i) { - // for (uint32_t j = 0; j < 3; ++j) { - // EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); - // } - // } - // A_mat.free_mat(); + // A_mat.spmat_chol_test_purmute(); + // A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free_mat(); } \ No newline at end of file From a58e54a0b2b3f9be9f580be8cf53e9142776291a Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Fri, 31 Mar 2023 12:44:40 -0700 Subject: [PATCH 36/44] request debug --- include/rxmesh/matrix/sparse_matrix.cuh | 138 ++++++++++++++--------- tests/RXMesh_test/test_sparse_matrix.cuh | 29 ++++- 2 files changed, 111 insertions(+), 56 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index c424d1a3..87115869 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -640,10 +640,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_val_reorder() - { - } - void 
spmat_chol_test_purmute() { // Host problem definition @@ -759,24 +755,31 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); - RXMESH_INFO("nnz {} - size_row {} - size_col {}", - m_nnz, - m_row_size, - m_col_size); - - RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - m_h_row_ptr[0], - m_h_row_ptr[1], - m_h_row_ptr[2], - m_h_row_ptr[3], - m_h_row_ptr[4]); - - RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - m_h_col_idx[0], - m_h_col_idx[1], - m_h_col_idx[2], - m_h_col_idx[3], - m_h_col_idx[4]); + // RXMESH_INFO("nnz {} - size_row {} - size_col {}", + // m_nnz, + // m_row_size, + // m_col_size); + + // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + // m_h_row_ptr[0], + // m_h_row_ptr[1], + // m_h_row_ptr[2], + // m_h_row_ptr[3], + // m_h_row_ptr[4]); + + // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + // m_h_col_idx[0], + // m_h_col_idx[1], + // m_h_col_idx[2], + // m_h_col_idx[3], + // m_h_col_idx[4]); + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, @@ -805,12 +808,12 @@ struct SparseMatrix m_h_permute)); } - RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", - m_h_permute[0], - m_h_permute[1], - m_h_permute[2], - m_h_permute[3], - m_h_permute[4]); + // RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", + // m_h_permute[0], + // m_h_permute[1], + // m_h_permute[2], + // m_h_permute[3], + // m_h_permute[4]); CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, @@ -828,20 +831,6 @@ struct SparseMatrix size_t size_perm = 0; void* perm_buffer_cpu = NULL; - RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - m_h_row_ptr[0], - m_h_row_ptr[1], - m_h_row_ptr[2], - m_h_row_ptr[3], - m_h_row_ptr[4]); - - RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - m_h_col_idx[0], - m_h_col_idx[1], - m_h_col_idx[2], - m_h_col_idx[3], - m_h_col_idx[4]); - CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, m_row_size, m_col_size, @@ -873,12 +862,51 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", - h_val_permute[0], - h_val_permute[1], - h_val_permute[2], - h_val_permute[3], - h_val_permute[4]); + // RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", + // h_val_permute[0], + // h_val_permute[1], + // h_val_permute[2], + // h_val_permute[3], + // h_val_permute[4]); + + // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", + // m_h_row_ptr[0], + // m_h_row_ptr[1], + // m_h_row_ptr[2], + // m_h_row_ptr[3], + // m_h_row_ptr[4]); + + // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", + // m_h_col_idx[0], + // m_h_col_idx[1], + // m_h_col_idx[2], + // m_h_col_idx[3], + // m_h_col_idx[4]); + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); + + + // host test val permute + // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); + + // for (int j = 0; j < m_nnz; j++) { + // tmp_h_val[j] = m_h_val[j]; + // } + // for (int j = 0; j < m_nnz; j++) { + // m_h_val[j] = tmp_h_val[h_val_permute[j]]; + // } + + // printf("m_h_val: %f %f %f %f %f\n", + // m_h_val[0], + // m_h_val[1], + // m_h_val[2], + // m_h_val[3], + // m_h_val[4]); // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, @@ -1042,26 +1070,26 @@ struct SparseMatrix CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * 
sizeof(T))); - cusparseDnVecDescr_t b_values; cusparseSpVecDescr_t b_permutation; + cusparseDnVecDescr_t b_values; CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, m_row_size, m_row_size, m_d_permute, - d_b, + static_cast(d_b), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseCreateDnVec(&b_values, m_row_size, d_solver_b, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &b_values, m_row_size, d_solver_b, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, b_permutation, b_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); - cusparseDnVecDescr_t x_values; cusparseSpVecDescr_t x_permutation; + cusparseDnVecDescr_t x_values; CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, m_col_size, @@ -1071,8 +1099,8 @@ struct SparseMatrix CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseCreateDnVec(&x_values, m_col_size, d_solver_x, CUDA_R_32F)); + CUSPARSE_ERROR(cusparseCreateDnVec( + &x_values, m_col_size, d_solver_x, CUDA_R_32F)); CUSPARSE_ERROR( cusparseScatter(m_cusparse_handle, x_permutation, x_values)); CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 5df3b8e2..c4063aef 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -130,6 +130,21 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +template +__global__ static void check_diag(const rxmesh::Context context, + rxmesh::SparseMatrix A_mat) +{ + using namespace rxmesh; + auto init = [&](VertexHandle& v_id, const VertexIterator& iter) { + printf("%lf\n", A_mat(v_id, v_id)); + }; + + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch(block, shrd_alloc, init); +} + /* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) @@ -370,10 +385,22 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + LaunchBox test_launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, test_launch_box, (void*)check_diag); + + check_diag + <<>>(rxmesh.get_context(), A_mat); + + cudaDeviceSynchronize(); + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); // A_mat.spmat_chol_test_purmute(); - // A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); A_mat.spmat_chol_factor(); From bdbe290065800dc6d46fbe8a611afc9607d3587c Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:07:42 -0700 Subject: [PATCH 37/44] worksfine --- include/rxmesh/matrix/dense_matrix.cuh | 53 ++++--- include/rxmesh/matrix/sparse_matrix.cuh | 183 +++++++---------------- tests/RXMesh_test/test_sparse_matrix.cuh | 16 +- 3 files changed, 96 insertions(+), 156 deletions(-) diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 19c77f7f..66ff6de1 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -10,18 +10,27 @@ namespace rxmesh { // Currently this is device only // host/device transormation will be added // only col major is supported -template +template struct DenseMatrix { 
DenseMatrix(IndexT row_size, IndexT col_size) : m_row_size(row_size), m_col_size(col_size), m_dendescr(NULL), - m_allocated(LOCATION_NONE) + m_allocated(LOCATION_NONE), + m_col_pad_bytes(0), + m_col_pad_idx(0) { - CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); m_allocated = m_allocated | DEVICE; + IndexT col_data_bytes = m_row_size * sizeof(T); + if (MemAlignSize != 0 && col_data_bytes % MemAlignSize != 0) { + m_col_pad_bytes = MemAlignSize - (col_data_bytes % MemAlignSize); + m_col_pad_idx = m_col_pad_bytes / sizeof(T); + } + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, m_row_size, m_col_size, @@ -29,17 +38,16 @@ struct DenseMatrix m_d_val, CUDA_R_32F, CUSPARSE_ORDER_COL)); - } - void set_ones() - { - std::vector init_tmp_arr(m_row_size * m_col_size, 1); - CUDA_ERROR(cudaMemcpy(m_d_val, - init_tmp_arr.data(), - bytes() * sizeof(T), - cudaMemcpyHostToDevice)); - } + // void set_ones() + // { + // std::vector init_tmp_arr(m_row_size * m_col_size, 1); + // CUDA_ERROR(cudaMemcpy(m_d_val, + // init_tmp_arr.data(), + // bytes() * sizeof(T), + // cudaMemcpyHostToDevice)); + // } IndexT lead_dim() const { @@ -52,9 +60,9 @@ struct DenseMatrix assert(col < m_col_size); #ifdef __CUDA_ARCH__ - return m_d_val[col * m_row_size + row]; + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; #else - return m_h_val[col * m_row_size + row]; + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; #endif } @@ -65,9 +73,9 @@ struct DenseMatrix assert(col < m_col_size); #ifdef __CUDA_ARCH__ - return m_d_val[col * m_row_size + row]; + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; #else - return m_h_val[col * m_row_size + row]; + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; #endif } @@ -88,11 +96,15 @@ struct DenseMatrix T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { if ((location & HOST) == HOST) { - return m_h_val + ld_idx * lead_dim(); + return m_h_val + ld_idx * (m_row_size + m_col_pad_idx); } if ((location & DEVICE) == DEVICE) { - return m_d_val + ld_idx * lead_dim(); + return m_d_val + ld_idx * (m_row_size + m_col_pad_idx); + } + + if ((location & m_allocated) == location) { + RXMESH_ERROR("Requested data not allocated on {}", location); } assert(1 != 1); @@ -101,7 +113,7 @@ struct DenseMatrix IndexT bytes() const { - return m_row_size * m_col_size * sizeof(T); + return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T); } void move(locationT source, locationT target, cudaStream_t stream = NULL) @@ -182,6 +194,9 @@ struct DenseMatrix IndexT m_col_size; T* m_d_val; T* m_h_val; + + IndexT m_col_pad_bytes; + IndexT m_col_pad_idx; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 87115869..1d11ee0e 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -6,6 +6,11 @@ #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "thrust/device_ptr.h" +#include "thrust/execution_policy.h" +#include "thrust/gather.h" +#include "thrust/scatter.h" + #include "cusolverSp_LOWLEVEL_PREVIEW.h" #include "rxmesh/matrix/dense_matrix.cuh" @@ -102,8 +107,43 @@ __global__ static void sparse_mat_col_fill(const rxmesh::Context context, query.dispatch(block, shrd_alloc, col_fillin); } +// d_out[d_p[i]] = d_in[i] +template +void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); 
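// ----------------------------------------------------------------------
// Worked example of the padding arithmetic above, for an assumed
// MemAlignSize of 32 bytes: a 100-row float column is 400 bytes,
// 400 % 32 == 16, so 16 pad bytes (4 floats) round every column up to a
// 32-byte boundary, and element (row, col) lives at
//     col * (m_row_size + m_col_pad_idx) + row.
// (The real constructor skips padding when the remainder is zero.)
int row_size       = 100;
int col_data_bytes = row_size * (int)sizeof(float);       // 400
int col_pad_bytes  = 32 - (col_data_bytes % 32);          // 16
int col_pad_idx    = col_pad_bytes / (int)sizeof(float);  // 4
// ----------------------------------------------------------------------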
+ + thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); +} + +// d_out[i] = d_in[d_p[i]] +template +void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); +} + } // namespace detail +__global__ void check_arr_int(int* arr, int size) { + for (int i = 0; i < size; ++i) { + printf("%d ", arr[i]); + } + printf("\n "); +} + +__global__ void check_arr_float(float* arr, int size) { + for (int i = 0; i < size; ++i) { + printf("%f ", arr[i]); + } + printf("\n "); +} // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? @@ -755,32 +795,6 @@ struct SparseMatrix CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); - // RXMESH_INFO("nnz {} - size_row {} - size_col {}", - // m_nnz, - // m_row_size, - // m_col_size); - - // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - // m_h_row_ptr[0], - // m_h_row_ptr[1], - // m_h_row_ptr[2], - // m_h_row_ptr[3], - // m_h_row_ptr[4]); - - // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - // m_h_col_idx[0], - // m_h_col_idx[1], - // m_h_col_idx[2], - // m_h_col_idx[3], - // m_h_col_idx[4]); - - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - if (reorder == Reorder::SYMRCM) { CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, m_row_size, @@ -808,13 +822,6 @@ struct SparseMatrix m_h_permute)); } - // RXMESH_INFO("m_h_permute: {}, {}, {}, {}, {}", - // m_h_permute[0], - // m_h_permute[1], - // m_h_permute[2], - // m_h_permute[3], - // m_h_permute[4]); - CUDA_ERROR(cudaMemcpyAsync(m_d_permute, m_h_permute, m_row_size * sizeof(IndexT), @@ -844,8 +851,6 @@ struct SparseMatrix perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); - RXMESH_INFO("size_perm {}", size_perm); - for (int j = 0; j < m_nnz; j++) { h_val_permute[j] = j; } @@ -862,36 +867,8 @@ struct SparseMatrix h_val_permute, perm_buffer_cpu)); - // RXMESH_INFO("h_val_permute: {}, {}, {}, {}, {}", - // h_val_permute[0], - // h_val_permute[1], - // h_val_permute[2], - // h_val_permute[3], - // h_val_permute[4]); - - // RXMESH_INFO("m_h_row_ptr: {}, {}, {}, {}, {}", - // m_h_row_ptr[0], - // m_h_row_ptr[1], - // m_h_row_ptr[2], - // m_h_row_ptr[3], - // m_h_row_ptr[4]); - - // RXMESH_INFO("m_h_col_idx: {}, {}, {}, {}, {}", - // m_h_col_idx[0], - // m_h_col_idx[1], - // m_h_col_idx[2], - // m_h_col_idx[3], - // m_h_col_idx[4]); - - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - - - // host test val permute + + // do the permutation for val indices on host // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); // for (int j = 0; j < m_nnz; j++) { @@ -901,13 +878,6 @@ struct SparseMatrix // m_h_val[j] = tmp_h_val[h_val_permute[j]]; // } - // printf("m_h_val: %f %f %f %f %f\n", - // m_h_val[0], - // m_h_val[1], - // m_h_val[2], - // m_h_val[3], - // m_h_val[4]); - // copy the purmutated csr from the host CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, m_h_val, @@ -923,33 +893,15 @@ struct SparseMatrix cudaMemcpyHostToDevice)); // do the permutation for val indices on device - cusparseSpVecDescr_t val_permute_desc; - cusparseDnVecDescr_t val_final_desc; - CUDA_ERROR(cudaMemcpyAsync(d_val_permute, h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); + + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); - 
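// ----------------------------------------------------------------------
// permute_scatter and permute_gather above are exact inverses under the
// same permutation, which is what lets the solver gather b into the
// permuted system and scatter x back out. Standalone round-trip check
// (values are arbitrary):
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/gather.h>
#include <thrust/scatter.h>
#include <vector>

int main()
{
    std::vector<int>   hp = {2, 0, 3, 1};
    std::vector<float> hv = {10.f, 20.f, 30.f, 40.f};

    thrust::device_vector<int>   p(hp.begin(), hp.end());
    thrust::device_vector<float> in(hv.begin(), hv.end());
    thrust::device_vector<float> mid(4), out(4);

    // mid[p[i]] = in[i]  ->  mid = {20, 40, 10, 30}
    thrust::scatter(thrust::device, in.begin(), in.end(), p.begin(),
                    mid.begin());

    // out[i] = mid[p[i]] ->  out = {10, 20, 30, 40}, the original order
    thrust::gather(thrust::device, p.begin(), p.end(), mid.begin(),
                   out.begin());
    return 0;
}
// ----------------------------------------------------------------------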
CUSPARSE_ERROR(cusparseCreateSpVec(&val_permute_desc, - m_nnz, - m_nnz, - d_val_permute, - m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &val_final_desc, m_nnz, m_d_solver_val, CUDA_R_32F)); - CUSPARSE_ERROR(cusparseScatter( - m_cusparse_handle, val_permute_desc, val_final_desc)); - - CUSPARSE_ERROR(cusparseDestroySpVec(val_permute_desc)); - CUSPARSE_ERROR(cusparseDestroyDnVec(val_final_desc)); - - free(perm_buffer_cpu); free(h_val_permute); - CUDA_ERROR(cudaFree(d_val_permute)); + GPU_FREE(d_val_permute); // restore the host data back to the original if (on_host) { @@ -1068,43 +1020,10 @@ struct SparseMatrix if (m_use_reorder) { /* purmute b and x*/ CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); - CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_b, d_solver_b, m_row_size); - cusparseSpVecDescr_t b_permutation; - cusparseDnVecDescr_t b_values; - - CUSPARSE_ERROR(cusparseCreateSpVec(&b_permutation, - m_row_size, - m_row_size, - m_d_permute, - static_cast(d_b), - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &b_values, m_row_size, d_solver_b, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, b_permutation, b_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(b_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(b_permutation)); - - cusparseSpVecDescr_t x_permutation; - cusparseDnVecDescr_t x_values; - - CUSPARSE_ERROR(cusparseCreateSpVec(&x_permutation, - m_col_size, - m_col_size, - m_d_permute, - d_x, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - CUSPARSE_ERROR(cusparseCreateDnVec( - &x_values, m_col_size, d_solver_x, CUDA_R_32F)); - CUSPARSE_ERROR( - cusparseScatter(m_cusparse_handle, x_permutation, x_values)); - CUSPARSE_ERROR(cusparseDestroyDnVec(x_values)); - CUSPARSE_ERROR(cusparseDestroySpVec(x_permutation)); + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_x, d_solver_x, m_row_size); } else { d_solver_b = d_b; d_solver_x = d_x; @@ -1127,6 +1046,12 @@ struct SparseMatrix m_chol_info, m_chol_buffer)); } + + if (m_use_reorder) { + detail::permute_scatter(m_d_permute, d_solver_x, d_x, m_row_size); + GPU_FREE(d_solver_b); + GPU_FREE(d_solver_x); + } } /* Host data compatibility */ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index c4063aef..4e6ab2bd 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -386,16 +386,16 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - LaunchBox test_launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, test_launch_box, (void*)check_diag); + // LaunchBox test_launch_box; + // rxmesh.prepare_launch_box( + // {Op::VV}, test_launch_box, (void*)check_diag); - check_diag - <<>>(rxmesh.get_context(), A_mat); + // check_diag + // <<>>(rxmesh.get_context(), A_mat); - cudaDeviceSynchronize(); + // cudaDeviceSynchronize(); // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); From a53ab72c7f4683e3a57a7fecd374b5e4801dcff9 Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:21:01 -0700 Subject: [PATCH 38/44] clean up --- include/rxmesh/matrix/sparse_matrix.cuh | 107 +++-------------------- tests/RXMesh_test/test_sparse_matrix.cuh | 15 +--- 
2 files changed, 15 insertions(+), 107 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 1d11ee0e..4f73e405 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -112,8 +112,8 @@ template void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); } @@ -123,28 +123,14 @@ template void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); } } // namespace detail -__global__ void check_arr_int(int* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%d ", arr[i]); - } - printf("\n "); -} - -__global__ void check_arr_float(float* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%f ", arr[i]); - } - printf("\n "); -} - // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? template @@ -357,6 +343,12 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + } } /* ----- CUSPARSE SPMM & SPMV ----- */ @@ -680,81 +672,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_purmute() - { - // Host problem definition - int size = 8; - int nnz = 4; - int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; - float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; - float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; - //-------------------------------------------------------------------------- - // Device memory management - int* dX_indices; - float *dY, *dX_values; - CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); - CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); - CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); - - CUDA_ERROR(cudaMemcpy(dX_indices, - hX_indices, - size * sizeof(int), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(dX_values, - hX_values, - size * sizeof(float), - cudaMemcpyHostToDevice)); - CUDA_ERROR( - cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); - //-------------------------------------------------------------------------- - // CUSPARSE APIs - cusparseHandle_t handle = NULL; - cusparseSpVecDescr_t vecX; - cusparseDnVecDescr_t vecY; - CUSPARSE_ERROR(cusparseCreate(&handle)); - // Create sparse vector X - CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, - size, - size, - dX_indices, - dX_values, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - // Create dense vector y - CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); - - // execute Scatter - CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); - - // destroy matrix/vector descriptors - CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); - CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); - CUSPARSE_ERROR(cusparseDestroy(handle)); - 
//-------------------------------------------------------------------------- - // device result check - CUDA_ERROR( - cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); - int correct = 1; - for (int i = 0; i < size; i++) { - RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); - if (hY[i] != hY_result[i]) { - correct = 0; - break; - } - } - if (correct) - printf("scatter_example test PASSED\n"); - else - printf("scatter_example test FAILED: wrong result\n"); - //-------------------------------------------------------------------------- - // device memory deallocation - CUDA_ERROR(cudaFree(dX_indices)); - CUDA_ERROR(cudaFree(dX_values)); - CUDA_ERROR(cudaFree(dY)); - } - void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { @@ -766,6 +683,8 @@ struct SparseMatrix GPU_FREE(m_d_solver_val); GPU_FREE(m_d_solver_row_ptr); GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); m_reorder_allocated = false; } @@ -897,7 +816,7 @@ struct SparseMatrix h_val_permute, m_nnz * sizeof(IndexT), cudaMemcpyHostToDevice)); - + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); free(h_val_permute); diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 4e6ab2bd..7362bf98 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -385,21 +385,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - - // LaunchBox test_launch_box; - // rxmesh.prepare_launch_box( - // {Op::VV}, test_launch_box, (void*)check_diag); - - // check_diag - // <<>>(rxmesh.get_context(), A_mat); - - // cudaDeviceSynchronize(); - // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); - // A_mat.spmat_chol_test_purmute(); A_mat.spmat_chol_reorder(Reorder::NSTDIS); A_mat.spmat_chol_analysis(); A_mat.spmat_chol_buffer_alloc(); @@ -409,6 +396,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); } + A_mat.spmat_chol_buffer_free(); + A_mat.denmat_mul(X_mat, ret_mat); std::vector h_ret_mat(num_vertices); From 8f5f563dcf36f00465a2598eb633ea5b2876e3ca Mon Sep 17 00:00:00 2001 From: Eric Yuan <116010276@link.cuhk.edu.cn> Date: Mon, 3 Apr 2023 11:21:01 -0700 Subject: [PATCH 39/44] clean up --- include/rxmesh/matrix/sparse_matrix.cuh | 109 ++++------------------- tests/RXMesh_test/test_sparse_matrix.cuh | 15 +--- 2 files changed, 17 insertions(+), 107 deletions(-) diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 1d11ee0e..c08c0ef2 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -112,8 +112,8 @@ template void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); } @@ -123,28 +123,14 @@ template void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) { thrust::device_ptr t_p(d_p); - thrust::device_ptr t_i(d_in); - thrust::device_ptr t_o(d_out); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); } } // namespace detail -__global__ void check_arr_int(int* arr, int size) { - for (int 
i = 0; i < size; ++i) { - printf("%d ", arr[i]); - } - printf("\n "); -} - -__global__ void check_arr_float(float* arr, int size) { - for (int i = 0; i < size; ++i) { - printf("%f ", arr[i]); - } - printf("\n "); -} - // TODO: add compatibility for EE, FF, VE...... // TODO: purge operation? template @@ -357,6 +343,14 @@ struct SparseMatrix CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + } } /* ----- CUSPARSE SPMM & SPMV ----- */ @@ -680,81 +674,6 @@ struct SparseMatrix /* --- LOW LEVEL API --- */ - void spmat_chol_test_purmute() - { - // Host problem definition - int size = 8; - int nnz = 4; - int hX_indices[] = {0, 3, 4, 7, 1, 2, 5, 6}; - float hX_values[] = {5.0f, 6.0f, 7.0f, 8.0f, 1.0f, 2.0f, 3.0f, 4.0f}; - float hY[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - float hY_result[] = {5.0f, 1.0f, 2.0f, 6.0f, 7.0f, 3.0f, 4.0f, 8.0f}; - //-------------------------------------------------------------------------- - // Device memory management - int* dX_indices; - float *dY, *dX_values; - CUDA_ERROR(cudaMalloc((void**)&dX_indices, size * sizeof(int))); - CUDA_ERROR(cudaMalloc((void**)&dX_values, size * sizeof(float))); - CUDA_ERROR(cudaMalloc((void**)&dY, size * sizeof(float))); - - CUDA_ERROR(cudaMemcpy(dX_indices, - hX_indices, - size * sizeof(int), - cudaMemcpyHostToDevice)); - CUDA_ERROR(cudaMemcpy(dX_values, - hX_values, - size * sizeof(float), - cudaMemcpyHostToDevice)); - CUDA_ERROR( - cudaMemcpy(dY, hY, size * sizeof(float), cudaMemcpyHostToDevice)); - //-------------------------------------------------------------------------- - // CUSPARSE APIs - cusparseHandle_t handle = NULL; - cusparseSpVecDescr_t vecX; - cusparseDnVecDescr_t vecY; - CUSPARSE_ERROR(cusparseCreate(&handle)); - // Create sparse vector X - CUSPARSE_ERROR(cusparseCreateSpVec(&vecX, - size, - size, - dX_indices, - dX_values, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F)); - // Create dense vector y - CUSPARSE_ERROR(cusparseCreateDnVec(&vecY, size, dY, CUDA_R_32F)); - - // execute Scatter - CUSPARSE_ERROR(cusparseScatter(handle, vecX, vecY)); - - // destroy matrix/vector descriptors - CUSPARSE_ERROR(cusparseDestroySpVec(vecX)); - CUSPARSE_ERROR(cusparseDestroyDnVec(vecY)); - CUSPARSE_ERROR(cusparseDestroy(handle)); - //-------------------------------------------------------------------------- - // device result check - CUDA_ERROR( - cudaMemcpy(hY, dY, size * sizeof(float), cudaMemcpyDeviceToHost)); - int correct = 1; - for (int i = 0; i < size; i++) { - RXMESH_INFO("hY[]: {}; hY_result[i]: {};", hY[i], hY_result[i]); - if (hY[i] != hY_result[i]) { - correct = 0; - break; - } - } - if (correct) - printf("scatter_example test PASSED\n"); - else - printf("scatter_example test FAILED: wrong result\n"); - //-------------------------------------------------------------------------- - // device memory deallocation - CUDA_ERROR(cudaFree(dX_indices)); - CUDA_ERROR(cudaFree(dX_values)); - CUDA_ERROR(cudaFree(dY)); - } - void spmat_chol_reorder(rxmesh::Reorder reorder) { if (reorder == Reorder::NONE) { @@ -766,6 +685,8 @@ struct SparseMatrix GPU_FREE(m_d_solver_val); GPU_FREE(m_d_solver_row_ptr); GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); m_reorder_allocated = false; } @@ -897,7 +818,7 @@ struct 
SparseMatrix
                                h_val_permute,
                                m_nnz * sizeof(IndexT),
                                cudaMemcpyHostToDevice));
-        
+
         detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz);
 
         free(h_val_permute);
diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh
index 4e6ab2bd..7362bf98 100644
--- a/tests/RXMesh_test/test_sparse_matrix.cuh
+++ b/tests/RXMesh_test/test_sparse_matrix.cuh
@@ -385,21 +385,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve)
         launch_box.smem_bytes_dyn>>>(
             rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step);
 
-
-    // LaunchBox test_launch_box;
-    // rxmesh.prepare_launch_box(
-    //     {Op::VV}, test_launch_box, (void*)check_diag);
-
-    // check_diag
-    //     <<>>(rxmesh.get_context(), A_mat);
-
-    // cudaDeviceSynchronize();
-
     // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS);
 
-    // A_mat.spmat_chol_test_purmute();
     A_mat.spmat_chol_reorder(Reorder::NSTDIS);
     A_mat.spmat_chol_analysis();
     A_mat.spmat_chol_buffer_alloc();
@@ -409,6 +396,8 @@ TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve)
         A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i));
     }
 
+    A_mat.spmat_chol_buffer_free();
+
     A_mat.denmat_mul(X_mat, ret_mat);
 
     std::vector h_ret_mat(num_vertices);

From b94f4b7d95e830396ad716b70a1d64ad9f23c5a1 Mon Sep 17 00:00:00 2001
From: Eric Yuan <116010276@link.cuhk.edu.cn>
Date: Mon, 3 Apr 2023 11:45:12 -0700
Subject: [PATCH 40/44] fix error

---
 include/rxmesh/matrix/sparse_matrix.cuh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index 6cce2fa7..c08c0ef2 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -348,11 +348,8 @@ struct SparseMatrix
             GPU_FREE(m_d_solver_val);
             GPU_FREE(m_d_solver_row_ptr);
             GPU_FREE(m_d_solver_col_idx);
-<<<<<<< HEAD
             GPU_FREE(m_d_permute);
             free(m_h_permute);
-=======
->>>>>>> a53ab72c7f4683e3a57a7fecd374b5e4801dcff9
         }
     }

From 4efe1e4e1d4fff81c6c38da268c042ec6a55e97e Mon Sep 17 00:00:00 2001
From: Eric Yuan <116010276@link.cuhk.edu.cn>
Date: Mon, 3 Apr 2023 22:07:49 -0700
Subject: [PATCH 41/44] add comments

---
 include/rxmesh/matrix/sparse_matrix.cuh | 29 +++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index c08c0ef2..47da78c6 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -674,6 +674,12 @@ struct SparseMatrix
 
     /* --- LOW LEVEL API --- */
 
+    /**
+     * @brief The lower level API of reordering. Specify the reordering type
+     * or simply NONE for no reordering. This should be called at the
+     * beginning of the solving process; any other call order is undefined.
+     * @param reorder: the reorder method to apply.
+     */
    void spmat_chol_reorder(rxmesh::Reorder reorder)
    {
        if (reorder == Reorder::NONE) {
@@ -832,6 +838,10 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief The lower level API of matrix analysis. Generates a member
+     * variable of type csrcholInfo_t for cusolver.
+     */
    void spmat_chol_analysis()
    {
        if (!m_use_reorder) {
@@ -852,6 +862,10 @@ struct SparseMatrix
            m_chol_info));
    }

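Taken together, patches 38-41 pin down the call order this low-level API expects, which the test further below also exercises. As an illustrative sketch only (not repository code; `A` is assumed to be an already-assembled SparseMatrix, `d_b` and `d_x` device arrays):

    // sketch: intended call order of the low-level Cholesky API
    A.spmat_chol_reorder(rxmesh::Reorder::NSTDIS);  // or Reorder::NONE
    A.spmat_chol_analysis();       // builds the csrcholInfo_t member
    A.spmat_chol_buffer_alloc();   // queries and allocates the workspace
    A.spmat_chol_factor();         // numeric factorization into the buffer
    A.spmat_chol_solve(d_b, d_x);  // repeatable while A is unchanged
    A.spmat_chol_buffer_free();    // release the workspace when done
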
+    /**
+     * @brief The lower level API of factorization buffer-size query and
+     * allocation. The buffer is a member variable.
+     */
    void spmat_chol_buffer_alloc()
    {
        if constexpr (std::is_same_v) {
@@ -883,11 +897,18 @@ struct SparseMatrix
        CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes));
    }

+    /**
+     * @brief The lower level API of factorization buffer release.
+     */
    void spmat_chol_buffer_free()
    {
        CUDA_ERROR(cudaFree(m_chol_buffer));
    }

+    /**
+     * @brief The lower level API of matrix factorization; saves the
+     * factorization result into the buffer.
+     */
    void spmat_chol_factor()
    {
        if constexpr (std::is_same_v) {
@@ -932,6 +953,14 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief The lower level API of solving the linear system after the
+     * Cholesky factorization. The format follows Ax=b, solving for x, where
+     * A is the sparse matrix and x and b are device arrays. As long as A
+     * doesn't change, this function can be called for many different b and x.
+     * @param d_b: device array of b
+     * @param d_x: device array of x
+     */
    void spmat_chol_solve(T* d_b, T* d_x)
    {

From 5503798339d6794c310353aac18320ca41f6c0c4 Mon Sep 17 00:00:00 2001
From: ericycc
Date: Tue, 11 Apr 2023 09:49:59 -0700
Subject: [PATCH 42/44] remove test code

---
 include/rxmesh/matrix/dense_matrix.cuh   | 14 +-------------
 tests/RXMesh_test/test_sparse_matrix.cuh | 16 ----------------
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh
index 66ff6de1..d618e7d4 100644
--- a/include/rxmesh/matrix/dense_matrix.cuh
+++ b/include/rxmesh/matrix/dense_matrix.cuh
@@ -7,9 +7,6 @@
 
 namespace rxmesh {
 
-// Currently this is device only
-// host/device transormation will be added
-// only col major is supported
 template
 struct DenseMatrix
 {
@@ -40,15 +37,6 @@ struct DenseMatrix
                                        CUSPARSE_ORDER_COL));
    }

-    // void set_ones()
-    // {
-    //     std::vector init_tmp_arr(m_row_size * m_col_size, 1);
-    //     CUDA_ERROR(cudaMemcpy(m_d_val,
-    //                           init_tmp_arr.data(),
-    //                           bytes() * sizeof(T),
-    //                           cudaMemcpyHostToDevice));
-    // }
-
    IndexT lead_dim() const
    {
        return m_row_size;
@@ -104,7 +92,7 @@ struct DenseMatrix
        }

        if ((location & m_allocated) == location) {
-            RXMESH_ERROR("Requested data not allocated on {}", location);
+            RXMESH_ERROR("Requested data not allocated on {}", location_to_string(location));
        }

        assert(1 != 1);
diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh
index 7362bf98..c4bd941a 100644
--- a/tests/RXMesh_test/test_sparse_matrix.cuh
+++ b/tests/RXMesh_test/test_sparse_matrix.cuh
@@ -130,22 +130,6 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context,
     query.dispatch(block, shrd_alloc, mat_setup);
 }
 
-template
-__global__ static void check_diag(const rxmesh::Context context,
-                                  rxmesh::SparseMatrix A_mat)
-{
-    using namespace rxmesh;
-    auto init = [&](VertexHandle& v_id, const VertexIterator& iter) {
-        printf("%lf\n", A_mat(v_id, v_id));
-    };
-
-    auto block = cooperative_groups::this_thread_block();
-    Query query(context);
-    ShmemAllocator shrd_alloc;
-    query.dispatch(block, shrd_alloc, init);
-}
-
-
 /* Check the access of the sparse matrix in CSR format in device */
 TEST(RXMeshStatic, SparseMatrix)
 {

From 556e2103691c6aa1a036d96426f843988d0df370 Mon Sep 17 00:00:00 2001
From: ericycc
Date: Tue, 11 Apr 2023 12:28:47 -0700
Subject: [PATCH 43/44] add comments

---
 include/rxmesh/matrix/dense_matrix.cuh  | 28 ++++++++++++++++++++++++-
 include/rxmesh/matrix/sparse_matrix.cuh | 19 ++++++++++++++---
 2 files changed, 43 insertions(+), 4 deletions(-)

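The dense_matrix.cuh half of this patch documents the col-major convention the struct commits to. As a standalone sketch of the addressing those comments describe (hypothetical helpers, ignoring the m_col_pad_idx padding; `ld` plays the role of lead_dim()):

    // col-major: element (row, col) sits at val[col * ld + row], so each
    // column is contiguous and a column pointer is just an offset into val
    template <typename T>
    T& col_major_at(T* val, int ld, int row, int col)
    {
        return val[col * ld + row];
    }

    template <typename T>
    T* col_major_col_data(T* val, int ld, int col)
    {
        return val + col * ld;  // what col_data(col) hands to the solver
    }
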
diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh
index d618e7d4..289b69df 100644
--- a/include/rxmesh/matrix/dense_matrix.cuh
+++ b/include/rxmesh/matrix/dense_matrix.cuh
@@ -7,6 +7,12 @@
 
 namespace rxmesh {
 
+/**
+ * @brief Dense matrix for use on device and host; internally it is an array.
+ * The dense matrix is initialized as col major on device.
+ * Only col major dense matrices are supported for now since that is what
+ * cusparse and cusolver expect.
+ */
 template
 struct DenseMatrix
 {
@@ -67,6 +73,9 @@ struct DenseMatrix
 #endif
    }

+    /**
+     * @brief return the raw pointer based on the location specified
+     */
    T* data(locationT location = DEVICE) const
    {
        if ((location & HOST) == HOST) {
@@ -81,6 +90,10 @@ struct DenseMatrix
        return 0;
    }

+    /**
+     * @brief return the raw pointer to a column based on the column index
+     * and the location specified
+     */
    T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const
    {
        if ((location & HOST) == HOST) {
@@ -92,18 +105,25 @@ struct DenseMatrix
        }

        if ((location & m_allocated) == location) {
-            RXMESH_ERROR("Requested data not allocated on {}", location_to_string(location));
+            RXMESH_ERROR("Requested data not allocated on {}",
+                         location_to_string(location));
        }

        assert(1 != 1);
        return 0;
    }

+    /**
+     * @brief return the total number of bytes used by the array
+     */
    IndexT bytes() const
    {
        return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T);
    }

+    /**
+     * @brief move the data between host and device
+     */
    void move(locationT source, locationT target, cudaStream_t stream = NULL)
    {
        if (source == target) {
@@ -140,6 +160,9 @@ struct DenseMatrix
        }
    }

+    /**
+     * @brief release the data on host or device
+     */
    void release(locationT location = LOCATION_ALL)
    {
        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
@@ -155,6 +178,9 @@ struct DenseMatrix
        }
    }

+    /**
+     * @brief allocate the data on host or device
+     */
    void allocate(locationT location)
    {
        if ((location & HOST) == HOST) {
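The sparse_matrix.cuh half below documents the solver, whose reorder path rests on the detail::permute_gather/permute_scatter helpers from the earlier patches. A self-contained sketch of their Thrust semantics (hypothetical demo, not repository code):

    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/gather.h>
    #include <thrust/scatter.h>
    #include <vector>

    void permute_round_trip()
    {
        std::vector<int> h_p = {2, 0, 1};  // a permutation p
        thrust::device_vector<int>   p(h_p.begin(), h_p.end());
        thrust::device_vector<float> b(3), pb(3), back(3);
        b[0] = 10.f; b[1] = 20.f; b[2] = 30.f;

        // gather: pb[i] = b[p[i]]   (how permute_gather reorders the rhs)
        thrust::gather(thrust::device, p.begin(), p.end(), b.begin(), pb.begin());
        // scatter: back[p[i]] = pb[i]  (how permute_scatter restores x);
        // scatter undoes the gather, so back ends up equal to b
        thrust::scatter(thrust::device, pb.begin(), pb.end(), p.begin(), back.begin());
    }
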
diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh
index 47da78c6..77e1c5fd 100644
--- a/include/rxmesh/matrix/sparse_matrix.cuh
+++ b/include/rxmesh/matrix/sparse_matrix.cuh
@@ -131,8 +131,13 @@ void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size)
 
 }  // namespace detail
 
-// TODO: add compatibility for EE, FF, VE......
-// TODO: purge operation?
+
+/**
+ * @brief the sparse matrix implementation and all the related functions and
+ * solvers. Right now, only VV is supported; more compatibility for
+ * EE, FF, VE...... will be added.
+ * This is device/host compatible.
+ */
 template
 struct SparseMatrix
 {
@@ -1005,7 +1010,9 @@ struct SparseMatrix
    }

    /* Host data compatibility */
-
+    /**
+     * @brief move the data between host and device
+     */
    void move(locationT source, locationT target, cudaStream_t stream = NULL)
    {
        if (source == target) {
@@ -1068,6 +1075,9 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief release the data on host or device
+     */
    void release(locationT location = LOCATION_ALL)
    {
        if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) {
@@ -1089,6 +1099,9 @@ struct SparseMatrix
        }
    }

+    /**
+     * @brief allocate the data on host or device
+     */
    void allocate(locationT location)
    {
        if ((location & HOST) == HOST) {

From 0c126777332300fedc85ebd0905ac4365f490ef9 Mon Sep 17 00:00:00 2001
From: Ahmed Mahmoud
Date: Mon, 27 Nov 2023 08:52:15 -0500
Subject: [PATCH 44/44] fix mem leak

---
 include/rxmesh/attribute.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h
index 80d7e880..cf5ba824 100644
--- a/include/rxmesh/attribute.h
+++ b/include/rxmesh/attribute.h
@@ -227,7 +227,10 @@ class Attribute : public AttributeBase
 
     Attribute(const Attribute& rhs) = default;
 
-    virtual ~Attribute() = default;
+    virtual ~Attribute()
+    {
+        free(m_name);
+    }
 
     /**
      * @brief Get the name of the attribute
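This fix assumes m_name owns a heap-allocated C string, e.g. copied with strdup/malloc at construction; that constructor is not shown in the diff, so the pairing below is an assumption. The ownership pattern the destructor completes, as a sketch:

    #include <cstdlib>
    #include <cstring>

    struct NamedThing
    {
        explicit NamedThing(const char* name)
            : m_name(strdup(name))  // heap copy owned by this object (assumed)
        {
        }

        ~NamedThing()
        {
            free(m_name);  // without this, every instance leaks its name
        }

        char* m_name;
    };

Note that with a defaulted copy constructor, copies would share m_name, so copy semantics need separate care; that detail is outside this patch.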