diff --git a/CMakeLists.txt b/CMakeLists.txt index 60ae84f8..87f45668 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,4 +172,4 @@ add_subdirectory("include") include(GoogleTest) add_subdirectory(apps) -add_subdirectory(tests) \ No newline at end of file +add_subdirectory(tests) diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu index 2b5179e2..c5c90471 100644 --- a/apps/MCF/mcf.cu +++ b/apps/MCF/mcf.cu @@ -28,6 +28,7 @@ struct arg #include "mcf_openmesh.h" #include "mcf_rxmesh.h" +#include "mcf_sparse_matrix.cuh" TEST(App, MCF) @@ -52,7 +53,10 @@ TEST(App, MCF) mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth); // RXMesh Impl - mcf_rxmesh(rxmesh, ground_truth); + mcf_rxmesh_cg(rxmesh, ground_truth); + + // RXMesh cusolver Impl + mcf_rxmesh_cusolver_chol(rxmesh, ground_truth); } int main(int argc, char** argv) diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h index a58a62df..bb2c4ab4 100644 --- a/apps/MCF/mcf_rxmesh.h +++ b/apps/MCF/mcf_rxmesh.h @@ -49,7 +49,7 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh, } template -void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, +void mcf_rxmesh_cg(rxmesh::RXMeshStatic& rxmesh, const std::vector>& ground_truth) { using namespace rxmesh; @@ -106,11 +106,11 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh, rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_init_B, (void*)init_B, - true); + !Arg.use_uniform_laplace); rxmesh.prepare_launch_box({rxmesh::Op::VV}, launch_box_matvec, (void*)rxmesh_matvec, - true); + !Arg.use_uniform_laplace); // init kernel to initialize RHS (B) diff --git a/apps/MCF/mcf_sparse_matrix.cuh b/apps/MCF/mcf_sparse_matrix.cuh index daf1ed4f..4dbae51a 100644 --- a/apps/MCF/mcf_sparse_matrix.cuh +++ b/apps/MCF/mcf_sparse_matrix.cuh @@ -1,47 +1,227 @@ #pragma once +#include "mcf_util.h" #include "rxmesh/attribute.h" #include "rxmesh/matrix/dense_matrix.cuh" #include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/rxmesh_static.h" +template +__global__ static void mcf_B_setup(const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::DenseMatrix B_mat, + const bool use_uniform_laplace) +{ + using namespace rxmesh; + + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { + auto r_ids = p_id.unpack(); + uint32_t r_patch_id = r_ids.first; + uint16_t r_local_id = r_ids.second; + + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; + + if (use_uniform_laplace) { + const T valence = static_cast(iter.size()); + B_mat(row_index, 0) = coords(p_id, 0) * valence; + B_mat(row_index, 1) = coords(p_id, 1) * valence; + B_mat(row_index, 2) = coords(p_id, 2) * valence; + } else { + T v_weight = 0; + + // this is the last vertex in the one-ring (before r_id) + VertexHandle q_id = iter.back(); + + for (uint32_t v = 0; v < iter.size(); ++v) { + // the current one ring vertex + VertexHandle r_id = iter[v]; + + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + + v_weight += (tri_area > 0) ? 
tri_area : 0.0; + + q_id = r_id; + } + v_weight = 0.5 / v_weight; + + B_mat(row_index, 0) = coords(p_id, 0) / v_weight; + B_mat(row_index, 1) = coords(p_id, 1) / v_weight; + B_mat(row_index, 2) = coords(p_id, 2) / v_weight; + } + }; + + // With uniform Laplacian, we just need the valence, thus we + // call query and set oriented to false + auto block = cooperative_groups::this_thread_block(); + + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); +} template -__global__ static void mcf_A_B_setup( - const rxmesh::Context context, - rxmesh::VertexAttribute coords, // for non-uniform - rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - const bool use_uniform_laplace, // for non-uniform - const T time_step) +__global__ static void mcf_A_X_setup( + const rxmesh::Context context, + const rxmesh::VertexAttribute coords, + rxmesh::SparseMatrix A_mat, + rxmesh::DenseMatrix X_mat, + const bool use_uniform_laplace, // for non-uniform + const T time_step) { using namespace rxmesh; - auto init_lambda = [&](VertexHandle& v_id, const VertexIterator& iter) { + auto init_lambda = [&](VertexHandle& p_id, const VertexIterator& iter) { T sum_e_weight(0); + T v_weight(0); - T v_weight = iter.size(); + VertexHandle q_id = iter.back(); // reference value calculation - auto r_ids = v_id.unpack(); + auto r_ids = p_id.unpack(); uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = A_mat.m_d_patch_ptr_v[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = coords(v_id, 0) * v_weight; - B_mat(row_index, 1) = coords(v_id, 1) * v_weight; - B_mat(row_index, 2) = coords(v_id, 2) * v_weight; + // set up initial X matrix + X_mat(row_index, 0) = coords(p_id, 0); + X_mat(row_index, 1) = coords(p_id, 1); + X_mat(row_index, 2) = coords(p_id, 2); - Vector<3, float> vi_coord( - coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); + // set up matrix A for (uint32_t v = 0; v < iter.size(); ++v) { - T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + VertexHandle r_id = iter[v]; + + T e_weight = 0; + if (use_uniform_laplace) { + e_weight = 1; + } else { + VertexHandle s_id = + (v == iter.size() - 1) ? iter[0] : iter[v + 1]; + + e_weight = edge_cotan_weight(p_id, r_id, q_id, s_id, coords); + e_weight = (static_cast(e_weight >= 0.0)) * e_weight; + } + e_weight *= time_step; sum_e_weight += e_weight; + + A_mat(p_id, iter[v]) = -e_weight; + + // compute vertex weight + if (use_uniform_laplace) { + ++v_weight; + } else { + T tri_area = partial_voronoi_area(p_id, q_id, r_id, coords); + v_weight += (tri_area > 0) ? 
tri_area : 0; + q_id = r_id; + } } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + // Diagonal entry + if (use_uniform_laplace) { + v_weight = 1.0 / v_weight; + } else { + v_weight = 0.5 / v_weight; + } + + assert(!isnan(v_weight)); + assert(!isinf(v_weight)); + + A_mat(p_id, p_id) = (1.0 / v_weight) + sum_e_weight; }; - query_block_dispatcher(context, init_lambda); + auto block = cooperative_groups::this_thread_block(); + Query query(context); + ShmemAllocator shrd_alloc; + query.dispatch( + block, + shrd_alloc, + init_lambda, + [](VertexHandle) { return true; }, + !use_uniform_laplace); } + +template +void mcf_rxmesh_cusolver_chol(rxmesh::RXMeshStatic& rxmesh, + const std::vector>& ground_truth) +{ + using namespace rxmesh; + constexpr uint32_t blockThreads = 256; + + uint32_t num_vertices = rxmesh.get_num_vertices(); + auto coords = rxmesh.get_input_vertex_coordinates(); + + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + + RXMESH_INFO("use_uniform_laplace: {}, time_step: {}", + Arg.use_uniform_laplace, + Arg.time_step); + + // B set up + LaunchBox launch_box_B; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_B, + (void*)mcf_B_setup, + !Arg.use_uniform_laplace); + + mcf_B_setup<<>>( + rxmesh.get_context(), *coords, B_mat, Arg.use_uniform_laplace); + + CUDA_ERROR(cudaDeviceSynchronize()); + + // A and X set up + LaunchBox launch_box_A_X; + rxmesh.prepare_launch_box({Op::VV}, + launch_box_A_X, + (void*)mcf_A_X_setup, + !Arg.use_uniform_laplace); + + mcf_A_X_setup + <<>>(rxmesh.get_context(), + *coords, + A_mat, + X_mat, + Arg.use_uniform_laplace, + Arg.time_step); + + // Solving the linear system using chol factorization and no reordering + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NONE); + + X_mat.move(rxmesh::DEVICE, rxmesh::HOST); + + const T tol = 0.001; + T tmp_tol = tol; + bool passed = true; + rxmesh.for_each_vertex(HOST, [&](const VertexHandle vh) { + uint32_t v_id = rxmesh.map_to_global(vh); + uint32_t v_linear_id = rxmesh.linear_id(vh); + + T a = X_mat(v_linear_id, 0); + + for (uint32_t i = 0; i < 3; ++i) { + tmp_tol = std::abs((X_mat(v_linear_id, i) - ground_truth[v_id][i]) / + ground_truth[v_id][i]); + + if (tmp_tol > tol) { + RXMESH_WARN("val: {}, truth: {}, tol: {}\n", + X_mat(v_linear_id, i), + ground_truth[v_id][i], + tmp_tol); + passed = false; + break; + } + } + }); + + EXPECT_TRUE(passed); +} \ No newline at end of file diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h index 481c3d33..6c651000 100644 --- a/include/rxmesh/attribute.h +++ b/include/rxmesh/attribute.h @@ -227,7 +227,10 @@ class Attribute : public AttributeBase Attribute(const Attribute& rhs) = default; - virtual ~Attribute() = default; + virtual ~Attribute() + { + free(m_name); + } /** * @brief Get the name of the attribute diff --git a/include/rxmesh/matrix/dense_matrix.cuh b/include/rxmesh/matrix/dense_matrix.cuh index 41e4b81a..289b69df 100644 --- a/include/rxmesh/matrix/dense_matrix.cuh +++ b/include/rxmesh/matrix/dense_matrix.cuh @@ -1,93 +1,216 @@ #pragma once #include - +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/types.h" namespace rxmesh { -// Currently this is device only -// host/device transormation will be added -template +/** + * @brief dense matrix use for device and host, inside is a array. + * The dense matrix is initialized as col major on device. 
+ * We would only support col major dense matrix for now since that's what + * cusparse and cusolver wants. + */ +template struct DenseMatrix { DenseMatrix(IndexT row_size, IndexT col_size) - : m_row_size(row_size), m_col_size(col_size) + : m_row_size(row_size), + m_col_size(col_size), + m_dendescr(NULL), + m_allocated(LOCATION_NONE), + m_col_pad_bytes(0), + m_col_pad_idx(0) { - cudaMalloc((void**)&m_d_val, bytes()); - m_is_row_major = false; + m_allocated = m_allocated | DEVICE; + + IndexT col_data_bytes = m_row_size * sizeof(T); + if (MemAlignSize != 0 && col_data_bytes % MemAlignSize != 0) { + m_col_pad_bytes = MemAlignSize - (col_data_bytes % MemAlignSize); + m_col_pad_idx = m_col_pad_bytes / sizeof(T); + } + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + CUSPARSE_ERROR(cusparseCreateDnMat(&m_dendescr, + m_row_size, + m_col_size, + m_row_size, // leading dim + m_d_val, + CUDA_R_32F, + CUSPARSE_ORDER_COL)); } - DenseMatrix(IndexT row_size, IndexT col_size, bool is_row_major) - : m_row_size(row_size), - m_col_size(col_size), - m_is_row_major(is_row_major) + IndexT lead_dim() const { - cudaMalloc((void**)&m_d_val, bytes()); + return m_row_size; } - void set_ones() + __host__ __device__ T& operator()(const uint32_t row, const uint32_t col) { - std::vector init_tmp_arr(m_row_size * m_col_size, 1); - CUDA_ERROR(cudaMemcpy(m_d_val, - init_tmp_arr.data(), - bytes() * sizeof(T), - cudaMemcpyHostToDevice)); + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; +#else + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; +#endif } - IndexT lead_dim() const + __host__ __device__ T& operator()(const uint32_t row, + const uint32_t col) const { - if (m_is_row_major) { - return m_col_size; - } else { - return m_row_size; - } + assert(row < m_row_size); + assert(col < m_col_size); + +#ifdef __CUDA_ARCH__ + return m_d_val[col * (m_row_size + m_col_pad_idx) + row]; +#else + return m_h_val[col * (m_row_size + m_col_pad_idx) + row]; +#endif } - __device__ T& operator()(const uint32_t row, const uint32_t col) + /** + * @brief return the raw pointer based on the location specified + */ + T* data(locationT location = DEVICE) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; + if ((location & HOST) == HOST) { + return m_h_val; } + + if ((location & DEVICE) == DEVICE) { + return m_d_val; + } + + assert(1 != 1); + return 0; } - __device__ T& operator()(const uint32_t row, const uint32_t col) const + /** + * @brief return the raw pointer to columns based on column index the + * location specified and + */ + T* col_data(const uint32_t ld_idx, locationT location = DEVICE) const { - if (m_is_row_major) { - return m_d_val[row * m_col_size + col]; - } else { - return m_d_val[col * m_row_size + row]; + if ((location & HOST) == HOST) { + return m_h_val + ld_idx * (m_row_size + m_col_pad_idx); } + + if ((location & DEVICE) == DEVICE) { + return m_d_val + ld_idx * (m_row_size + m_col_pad_idx); + } + + if ((location & m_allocated) == location) { + RXMESH_ERROR("Requested data not allocated on {}", + location_to_string(location)); + } + + assert(1 != 1); + return 0; } - T* data() const + /** + * @brief return the total number bytes used by the array + */ + IndexT bytes() const { - return m_d_val; + return (m_row_size + m_col_pad_idx) * m_col_size * sizeof(T); } - T* ld_data(const uint32_t ld_idx) const + /** + * @brief move the data between host an 
device + */ + void move(locationT source, locationT target, cudaStream_t stream = NULL) { - return m_d_val + ld_idx * lead_dim(); + if (source == target) { + RXMESH_WARN( + "DenseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "DenseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "DenseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync( + m_d_val, m_h_val, bytes(), cudaMemcpyHostToDevice, stream)); + } else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync( + m_h_val, m_d_val, bytes(), cudaMemcpyDeviceToHost, stream)); + } } - IndexT bytes() const + /** + * @brief release the data on host or device + */ + void release(locationT location = LOCATION_ALL) { - return m_row_size * m_col_size * sizeof(T); + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + m_h_val = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + m_allocated = m_allocated & (~DEVICE); + } } - void quick_tanspose_w_ld_trans() + /** + * @brief allocate the data on host or device + */ + void allocate(locationT location) { - std::swap(m_row_size, m_col_size); - m_is_row_major = !(m_is_row_major); + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(bytes())); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, bytes())); + + m_allocated = m_allocated | DEVICE; + } } - bool m_is_row_major; - IndexT m_row_size; - IndexT m_col_size; - T* m_d_val; + // TODO: something like attribute->move() + + cusparseDnMatDescr_t m_dendescr; + locationT m_allocated; + IndexT m_row_size; + IndexT m_col_size; + T* m_d_val; + T* m_h_val; + + IndexT m_col_pad_bytes; + IndexT m_col_pad_idx; }; } // namespace rxmesh diff --git a/include/rxmesh/matrix/matrix_operation.cuh b/include/rxmesh/matrix/matrix_operation.cuh deleted file mode 100644 index b9ce48bc..00000000 --- a/include/rxmesh/matrix/matrix_operation.cuh +++ /dev/null @@ -1,446 +0,0 @@ -#pragma once -#include "cusolverSp.h" -#include "cusparse.h" - -#include "rxmesh/attribute.h" -#include "rxmesh/matrix/dense_matrix.cuh" -#include "rxmesh/matrix/sparse_matrix.cuh" -#include "rxmesh/rxmesh_static.h" - -namespace rxmesh { - -/** - * @brief The enum class for choosing different solver types - */ -enum class Solver -{ - CHOL = 0, - LU = 1, - QR = 2 -}; - -/** - * @brief The enum class for choosing different reorder types - * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee - * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based - * on Quotient Graph, NSTDIS for Nested Dissection - */ -enum class Reorder -{ - NONE = 0, - SYMRCM = 1, - SYMAMD = 2, - NSTDIS = 3 -}; - -static int reorder_to_int(const Reorder& reorder) -{ - switch (reorder) { - case Reorder::NONE: - return 0; - case Reorder::SYMRCM: - return 1; - case Reorder::SYMAMD: - 
return 2; - case Reorder::NSTDIS: - return 3; - default: { - RXMESH_ERROR("reorder_to_int() unknown input reorder"); - return 0; - } - } -} - -/** - * @brief for transpose the dense matrix using cublas - */ -template -void denmat_transpose(rxmesh::DenseMatrix den_mat) -{ - T* d_rt_arr; - cudaMalloc(&d_rt_arr, den_mat.bytes()); - - float const alpha(1.0); - float const beta(0.0); - - cublasHandle_t handle; - cublasCreate(&handle); - cublasSgeam(handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - den_mat.m_row_size, - den_mat.m_col_size, - &alpha, // 0 - den_mat.data(), // A_arr - den_mat.m_col_size, // ld_a - &beta, // 0 - den_mat.data(), // B_arr - den_mat.m_row_size, // ld_b - d_rt_arr, // rt_arr - den_mat.m_row_size); // ld_cm - cublasDestroy(handle); - - den_mat.m_d_val = d_rt_arr; - // TODO cont; -} - -/** - * @brief solve the Ax=b for x where x and b are all array - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - T* B_arr, - T* X_arr, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - - cusparse_linear_solver_wrapper(solver, - reorder, - handle, - cusparseHandle, - descrA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_arr, - X_arr); - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); -} - -/** - * @brief solve the AX=B for X where X and B are all dense matrix and we would - * solve it in a column wise manner - */ -template -void spmat_linear_solve(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix X_mat, - rxmesh::Solver solver, - rxmesh::Reorder reorder) -{ - cusolverSpHandle_t handle = NULL; - cusparseHandle_t cusparseHandle = NULL; - cudaStream_t stream = NULL; - cusparseMatDescr_t descrA = NULL; - - cusolverSpCreate(&handle); - cusparseCreate(&cusparseHandle); - - cudaStreamCreate(&stream); - cusolverSpSetStream(handle, stream); - cusparseSetStream(cusparseHandle, stream); - - cusparseCreateMatDescr(&descrA); - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO); - - for (int i = 0; i < B_mat.m_col_size; ++i) { - cusparse_linear_solver_wrapper(solver, - reorder, - handle, - cusparseHandle, - descrA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - B_mat.ld_data(i), - X_mat.ld_data(i)); - } - - cusolverSpDestroy(handle); - cusparseDestroy(cusparseHandle); - cudaStreamDestroy(stream); - cusparseDestroyMatDescr(descrA); -} - -/** - * @brief wrap up the cusolver api for solving linear systems. 
This is a lower - * level api - */ -template -void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, - const rxmesh::Reorder reorder, - cusolverSpHandle_t handle, - cusparseHandle_t cusparseHandle, - cusparseMatDescr_t descrA, - int rowsA, - int colsA, - int nnzA, - int* d_csrRowPtrA, - int* d_csrColIndA, - T* d_csrValA, - T* d_b, - T* d_x) -{ - if constexpr ((!std::is_same_v)&&(!std::is_same_v)) { - RXMESH_ERROR( - "Unsupported type for cusparse: {}" - "Only float and double are supported", - typeid(T).name()); - } - - double tol = 1.e-12; - int singularity = 0; /* -1 if A is invertible under tol. */ - - /* solve B*z = Q*b */ - if (solver == Solver::CHOL) { - if constexpr (std::is_same_v) { - cusolverSpScsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - if constexpr (std::is_same_v) { - cusolverSpDcsrlsvchol(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - } else if (solver == Solver::QR) { - if constexpr (std::is_same_v) { - cusolverSpScsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - - if constexpr (std::is_same_v) { - cusolverSpDcsrlsvqr(handle, - rowsA, - nnzA, - descrA, - d_csrValA, - d_csrRowPtrA, - d_csrColIndA, - d_b, - tol, - reorder_to_int(reorder), - d_x, - &singularity); - } - } else { - RXMESH_ERROR( - "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x for " - "Solver::LU"); - } - cudaDeviceSynchronize(); - - if (0 <= singularity) { - RXMESH_WARN("WARNING: the matrix is singular at row {} under tol ({})", - singularity, - tol); - } -} - -/** - * @brief wrap up the cusparse api for sparse matrix dense matrix - * multiplication. 
- */ -template -void spmat_denmat_mul(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - float alpha = 1.0f; - float beta = 0.0f; - - cusparseHandle_t handle = NULL; - cusparseSpMatDescr_t matA; - cusparseDnMatDescr_t matB, matC; - void* dBuffer = NULL; - size_t bufferSize = 0; - - cusparseCreate(&handle); - // Create sparse matrix A in CSR format - cusparseCreateCsr(&matA, - A_mat.m_row_size, - A_mat.m_col_size, - A_mat.m_nnz, - A_mat.m_d_row_ptr, - A_mat.m_d_col_idx, - A_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - // Create dense matrix B - cusparseCreateDnMat(&matB, - B_mat.m_row_size, - B_mat.m_col_size, - B_mat.lead_dim(), - B_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // Create dense matrix C - cusparseCreateDnMat(&matC, - C_mat.m_row_size, - C_mat.m_col_size, - C_mat.lead_dim(), - C_mat.data(), - CUDA_R_32F, - CUSPARSE_ORDER_COL); - // allocate an external buffer if needed - cusparseSpMM_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&dBuffer, bufferSize); - - // execute SpMM - cusparseSpMM(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &alpha, - matA, - matB, - &beta, - matC, - CUDA_R_32F, - CUSPARSE_SPMM_ALG_DEFAULT, - dBuffer); - - // destroy matrix/vector descriptors - cusparseDestroySpMat(matA); - cusparseDestroyDnMat(matB); - cusparseDestroyDnMat(matC); - cusparseDestroy(handle); -} - - -/** - * @brief wrap up the cusparse api for sparse matrix array - * multiplication. - */ -// only works for float -template -void spmat_arr_mul(rxmesh::SparseMatrix sp_mat, T* in_arr, T* rt_arr) -{ - const float minus_one = -1.0f; - const float one = 1.0f; - const float zero = 0.0f; - - cusparseHandle_t handle = NULL; - void* buffer = NULL; - size_t bufferSize = 0; - cusparseSpMatDescr_t sp_mat_des = NULL; - cusparseDnVecDescr_t vecx = NULL; - cusparseDnVecDescr_t vecy = NULL; - - cusparseCreate(&handle); - cusparseCreateCsr(&sp_mat_des, - sp_mat.m_row_size, - sp_mat.m_col_size, - sp_mat.m_nnz, - sp_mat.m_d_row_ptr, - sp_mat.m_d_col_idx, - sp_mat.m_d_val, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_32I, - CUSPARSE_INDEX_BASE_ZERO, - CUDA_R_32F); - cusparseCreateDnVec(&vecx, sp_mat.m_col_size, in_arr, CUDA_R_32F); - cusparseCreateDnVec(&vecy, sp_mat.m_row_size, rt_arr, CUDA_R_32F); - - cusparseSpMV_bufferSize(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - &bufferSize); - cudaMalloc(&buffer, bufferSize); - - cusparseSpMV(handle, - CUSPARSE_OPERATION_NON_TRANSPOSE, - &one, - sp_mat_des, - vecx, - &zero, - vecy, - CUDA_R_32F, - CUSPARSE_SPMV_ALG_DEFAULT, - buffer); - - cusparseDestroySpMat(sp_mat_des); - cusparseDestroyDnVec(vecx); - cusparseDestroyDnVec(vecy); - cusparseDestroy(handle); -} - -/** - * @brief do the sparse matrix dense matrix multiplication using sparse matrix - * array multiplication in a column wise way - */ -template -void spmat_denmat_mul_cw(rxmesh::SparseMatrix A_mat, - rxmesh::DenseMatrix B_mat, - rxmesh::DenseMatrix C_mat) -{ - for (int i = 0; i < B_mat.m_col_size; ++i) { - spmat_arr_mul(A_mat, B_mat.ld_data(i), C_mat.ld_data(i)); - } -} - -} // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/matrix/sparse_matrix.cuh b/include/rxmesh/matrix/sparse_matrix.cuh index 
97133fbe..77e1c5fd 100644 --- a/include/rxmesh/matrix/sparse_matrix.cuh +++ b/include/rxmesh/matrix/sparse_matrix.cuh @@ -1,11 +1,63 @@ #pragma once +#include "cusolverSp.h" +#include "cusparse.h" #include "rxmesh/attribute.h" #include "rxmesh/context.h" #include "rxmesh/query.cuh" #include "rxmesh/types.h" +#include "thrust/device_ptr.h" +#include "thrust/execution_policy.h" +#include "thrust/gather.h" +#include "thrust/scatter.h" + +#include "cusolverSp_LOWLEVEL_PREVIEW.h" +#include "rxmesh/matrix/dense_matrix.cuh" + namespace rxmesh { +/** + * @brief The enum class for choosing different solver types + */ +enum class Solver +{ + CHOL = 0, + LU = 1, + QR = 2 +}; + +/** + * @brief The enum class for choosing different reorder types + * NONE for No Reordering Applied, SYMRCM for Symmetric Reverse Cuthill-McKee + * permutation, SYMAMD for Symmetric Approximate Minimum Degree Algorithm based + * on Quotient Graph, NSTDIS for Nested Dissection + */ +enum class Reorder +{ + NONE = 0, + SYMRCM = 1, + SYMAMD = 2, + NSTDIS = 3 +}; + +static int reorder_to_int(const Reorder& reorder) +{ + switch (reorder) { + case Reorder::NONE: + return 0; + case Reorder::SYMRCM: + return 1; + case Reorder::SYMAMD: + return 2; + case Reorder::NSTDIS: + return 3; + default: { + RXMESH_ERROR("reorder_to_int() unknown input reorder"); + return 0; + } + } +} + namespace detail { // this is the function for the CSR calculation template @@ -55,11 +107,37 @@ __global__ static void sparse_mat_col_fill(const rxmesh::Context context, query.dispatch(block, shrd_alloc, col_fillin); } +// d_out[d_p[i]] = d_in[i] +template +void permute_scatter(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::scatter(thrust::device, t_i, t_i + size, t_p, t_o); +} + +// d_out[i] = d_in[d_p[i]] +template +void permute_gather(IndexT* d_p, T* d_in, T* d_out, IndexT size) +{ + thrust::device_ptr t_p(d_p); + thrust::device_ptr t_i(d_in); + thrust::device_ptr t_o(d_out); + + thrust::gather(thrust::device, t_p, t_p + size, t_i, t_o); +} + } // namespace detail -// TODO: add compatibility for EE, FF, VE...... -// TODO: purge operation? +/** + * @brief the sparse matrix implementation and all the functions and soolvers + * related. Right now, only VV is supported and we would add more compatibility + * for EE, FF, VE...... 
+ * This is device/host compatiable + */ template struct SparseMatrix { @@ -70,7 +148,14 @@ struct SparseMatrix m_row_size(0), m_col_size(0), m_nnz(0), - m_context(rx.get_context()) + m_context(rx.get_context()), + m_cusparse_handle(NULL), + m_descr(NULL), + m_spdescr(NULL), + m_spmm_buffer_size(0), + m_spmv_buffer_size(0), + m_use_reorder(false), + m_allocated(LOCATION_NONE) { using namespace rxmesh; constexpr uint32_t blockThreads = 256; @@ -137,7 +222,30 @@ struct SparseMatrix // val pointer allocation, actual value init should be in another // function - CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(IndexT))); + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + + CUSPARSE_ERROR(cusparseCreateMatDescr(&m_descr)); + CUSPARSE_ERROR( + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_ERROR( + cusparseSetMatIndexBase(m_descr, CUSPARSE_INDEX_BASE_ZERO)); + + CUSPARSE_ERROR(cusparseCreateCsr(&m_spdescr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseCreate(&m_cusparse_handle)); + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + m_allocated = m_allocated | DEVICE; } void set_ones() @@ -186,20 +294,887 @@ struct SparseMatrix return m_d_val[get_val_idx(row_v, col_v)]; } - void free() + __device__ T& operator()(const IndexT x, const IndexT y) + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __device__ T& operator()(const IndexT x, const IndexT y) const + { + const IndexT start = m_d_row_ptr[x]; + const IndexT end = m_d_row_ptr[x + 1]; + + for (IndexT i = start; i < end; ++i) { + if (m_d_col_idx[i] == y) { + return m_d_val[i]; + } + } + assert(1 != 1); + } + + __host__ __device__ IndexT& get_nnz() const + { + return m_nnz; + } + + __device__ IndexT& get_row_ptr_at(IndexT idx) const + { + return m_d_row_ptr[idx]; + } + + __device__ IndexT& get_col_idx_at(IndexT idx) const + { + return m_d_col_idx[idx]; + } + + __device__ T& get_val_at(IndexT idx) const + { + return m_d_val[idx]; + } + + void free_mat() { CUDA_ERROR(cudaFree(m_d_row_ptr)); CUDA_ERROR(cudaFree(m_d_col_idx)); CUDA_ERROR(cudaFree(m_d_val)); + CUSPARSE_ERROR(cusparseDestroy(m_cusparse_handle)); + CUSPARSE_ERROR(cusparseDestroyMatDescr(m_descr)); + CUSOLVER_ERROR(cusolverSpDestroy(m_cusolver_sphandle)); + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + } } - const Context m_context; - IndexT* m_d_row_ptr; - IndexT* m_d_col_idx; - T* m_d_val; - IndexT m_row_size; - IndexT m_col_size; - IndexT m_nnz; + /* ----- CUSPARSE SPMM & SPMV ----- */ + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication buffer size calculation. 
+ */ + void denmat_mul_buffer_size(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + // allocate an external buffer if needed + CUSPARSE_ERROR(cusparseSpMM_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + &m_spmm_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix dense matrix + * multiplication. + */ + void denmat_mul(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat, + cudaStream_t stream = 0) + { + float alpha = 1.0f; + float beta = 0.0f; + + // A_mat.create_cusparse_handle(); + cusparseSpMatDescr_t matA = m_spdescr; + cusparseDnMatDescr_t matB = B_mat.m_dendescr; + cusparseDnMatDescr_t matC = C_mat.m_dendescr; + void* dBuffer = NULL; + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + // allocate an external buffer if needed + if (m_spmm_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Dense matrix multiplication buffer size not " + "initialized.", + "Calculate it now."); + denmat_mul_buffer_size(B_mat, C_mat, stream); + } + CUDA_ERROR(cudaMalloc(&dBuffer, m_spmm_buffer_size)); + + // execute SpMM + CUSPARSE_ERROR(cusparseSpMM(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + matA, + matB, + &beta, + matC, + CUDA_R_32F, + CUSPARSE_SPMM_ALG_DEFAULT, + dBuffer)); + + CUDA_ERROR(cudaFree(dBuffer)); + } + + void arr_mul_buffer_size(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseSpMV_bufferSize(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + &m_spmv_buffer_size)); + } + + /** + * @brief wrap up the cusparse api for sparse matrix array + * multiplication. + */ + void arr_mul(T* in_arr, T* rt_arr, cudaStream_t stream = 0) + { + const float alpha = 1.0f; + const float beta = 0.0f; + + void* buffer = NULL; + cusparseDnVecDescr_t vecx = NULL; + cusparseDnVecDescr_t vecy = NULL; + + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecx, m_col_size, in_arr, CUDA_R_32F)); + CUSPARSE_ERROR( + cusparseCreateDnVec(&vecy, m_row_size, rt_arr, CUDA_R_32F)); + + CUSPARSE_ERROR(cusparseSetStream(m_cusparse_handle, stream)); + + if (m_spmv_buffer_size == 0) { + RXMESH_WARN( + "Sparse matrix - Array multiplication buffer size not " + "initialized." 
+ "Calculate it now."); + arr_mul_buffer_size(in_arr, rt_arr, stream); + } + + CUDA_ERROR(cudaMalloc(&buffer, m_spmv_buffer_size)); + + CUSPARSE_ERROR(cusparseSpMV(m_cusparse_handle, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, + m_spdescr, + vecx, + &beta, + vecy, + CUDA_R_32F, + CUSPARSE_SPMV_ALG_DEFAULT, + buffer)); + + CUSPARSE_ERROR(cusparseDestroyDnVec(vecx)); + CUSPARSE_ERROR(cusparseDestroyDnVec(vecy)); + CUDA_ERROR(cudaFree(buffer)); + } + + /** + * @brief do the sparse matrix dense matrix multiplication using sparse + * matrix array multiplication in a column wise way + */ + void spmat_denmat_mul_cw(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix C_mat) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + arr_mul(B_mat.col_data(i), C_mat.col_data(i)); + } + } + + /* ----- SOLVER ----- */ + + /* --- HIGH LEVEL API --- */ + + /** + * @brief solve the Ax=b for x where x and b are all array + */ + void spmat_linear_solve(T* B_arr, + T* X_arr, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_arr, + X_arr); + } + + /** + * @brief solve the AX=B for X where X and B are all dense matrix and we + * would solve it in a column wise manner + */ + void spmat_linear_solve(rxmesh::DenseMatrix B_mat, + rxmesh::DenseMatrix X_mat, + rxmesh::Solver solver, + rxmesh::Reorder reorder) + { + for (int i = 0; i < B_mat.m_col_size; ++i) { + cusparse_linear_solver_wrapper(solver, + reorder, + m_cusolver_sphandle, + m_descr, + m_row_size, + m_col_size, + m_nnz, + m_d_row_ptr, + m_d_col_idx, + m_d_val, + B_mat.col_data(i), + X_mat.col_data(i)); + } + } + + /** + * @brief wrap up the cusolver api for solving linear systems. This is a + * lower level api + */ + void cusparse_linear_solver_wrapper(const rxmesh::Solver solver, + const rxmesh::Reorder reorder, + cusolverSpHandle_t handle, + cusparseMatDescr_t descrA, + int rowsA, + int colsA, + int nnzA, + int* d_csrRowPtrA, + int* d_csrColIndA, + T* d_csrValA, + T* d_b, + T* d_x) + { + if constexpr ((!std::is_same_v)&&( + !std::is_same_v)) { + RXMESH_ERROR( + "Unsupported type for cusparse: {}" + "Only float and double are supported", + typeid(T).name()); + } + + double tol = 1.e-12; + int singularity = 0; /* -1 if A is invertible under tol. 
*/ + + /* solve B*z = Q*b */ + if (solver == Solver::CHOL) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvchol(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + } else if (solver == Solver::QR) { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrlsvqr(handle, + rowsA, + nnzA, + descrA, + d_csrValA, + d_csrRowPtrA, + d_csrColIndA, + d_b, + tol, + reorder_to_int(reorder), + d_x, + &singularity)); + } + } else { + RXMESH_ERROR( + "Only Solver::CHOL and Solver::QR is supported, use CUDA 12.x " + "for " + "Solver::LU"); + } + CUDA_ERROR(cudaDeviceSynchronize()); + + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + /* --- LOW LEVEL API --- */ + + /** + * @brief The lower level api of reordering. Sprcify the reordering type or + * simply NONE for no reordering. This should be called at the begining of + * the solving process. Any other function call order would be undefined. + * @param reorder: the reorder method applied. + */ + void spmat_chol_reorder(rxmesh::Reorder reorder) + { + if (reorder == Reorder::NONE) { + RXMESH_INFO("None reordering is specified", + "Continue without reordering"); + m_use_reorder = false; + + if (m_reorder_allocated) { + GPU_FREE(m_d_solver_val); + GPU_FREE(m_d_solver_row_ptr); + GPU_FREE(m_d_solver_col_idx); + GPU_FREE(m_d_permute); + free(m_h_permute); + m_reorder_allocated = false; + } + + return; + } + + /*check on host*/ + bool on_host = true; + if ((HOST & m_allocated) != HOST) { + move(DEVICE, HOST); + on_host = false; + } + + m_use_reorder = true; + + // allocate the purmutated csr + m_reorder_allocated = true; + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_solver_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_solver_col_idx, m_nnz * sizeof(IndexT))); + + m_h_permute = (IndexT*)malloc(m_row_size * sizeof(IndexT)); + CUDA_ERROR( + cudaMalloc((void**)&m_d_permute, m_row_size * sizeof(IndexT))); + + CUSOLVER_ERROR(cusolverSpCreate(&m_cusolver_sphandle)); + + if (reorder == Reorder::SYMRCM) { + CUSOLVER_ERROR(cusolverSpXcsrsymrcmHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::SYMAMD) { + CUSOLVER_ERROR(cusolverSpXcsrsymamdHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute)); + } else if (reorder == Reorder::NSTDIS) { + CUSOLVER_ERROR(cusolverSpXcsrmetisndHost(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + NULL, + m_h_permute)); + } + + CUDA_ERROR(cudaMemcpyAsync(m_d_permute, + m_h_permute, + m_row_size * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // working space for permutation: B = A*Q*A^T + // the permutation for matrix A which works only for the col and row + // indices, the val will be done on device with the 
d/h_val_permute + IndexT* h_val_permute = + static_cast(malloc(m_nnz * sizeof(IndexT))); + IndexT* d_val_permute; + CUDA_ERROR(cudaMalloc((void**)&d_val_permute, m_nnz * sizeof(IndexT))); + + size_t size_perm = 0; + void* perm_buffer_cpu = NULL; + + CUSOLVER_ERROR(cusolverSpXcsrperm_bufferSizeHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + &size_perm)); + + perm_buffer_cpu = (void*)malloc(sizeof(char) * size_perm); + + for (int j = 0; j < m_nnz; j++) { + h_val_permute[j] = j; + } + + CUSOLVER_ERROR(cusolverSpXcsrpermHost(m_cusolver_sphandle, + m_row_size, + m_col_size, + m_nnz, + m_descr, + m_h_row_ptr, + m_h_col_idx, + m_h_permute, + m_h_permute, + h_val_permute, + perm_buffer_cpu)); + + + // do the permutation for val indices on host + // T* tmp_h_val = static_cast(malloc(m_nnz * sizeof(T))); + + // for (int j = 0; j < m_nnz; j++) { + // tmp_h_val[j] = m_h_val[j]; + // } + // for (int j = 0; j < m_nnz; j++) { + // m_h_val[j] = tmp_h_val[h_val_permute[j]]; + // } + + // copy the purmutated csr from the host + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_row_ptr, + m_h_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyHostToDevice)); + CUDA_ERROR(cudaMemcpyAsync(m_d_solver_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + // do the permutation for val indices on device + CUDA_ERROR(cudaMemcpyAsync(d_val_permute, + h_val_permute, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice)); + + detail::permute_gather(d_val_permute, m_d_val, m_d_solver_val, m_nnz); + + free(h_val_permute); + GPU_FREE(d_val_permute); + + // restore the host data back to the original + if (on_host) { + move(DEVICE, HOST); + } else { + release(HOST); + } + } + + /** + * @brief The lower level api of matrix analysis. Generating a member value + * of type csrcholInfo_t for cucolver. + */ + void spmat_chol_analysis() + { + if (!m_use_reorder) { + m_d_solver_row_ptr = m_d_row_ptr; + m_d_solver_col_idx = m_d_col_idx; + m_d_solver_val = m_d_val; + } + + CUSOLVER_ERROR(cusolverSpCreateCsrcholInfo(&m_chol_info)); + m_internalDataInBytes = 0; + m_workspaceInBytes = 0; + CUSOLVER_ERROR(cusolverSpXcsrcholAnalysis(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info)); + } + + /** + * @brief The lower level api of matrix factorization buffer calculation and + * allocation. The buffer is a member variable. + */ + void spmat_chol_buffer_alloc() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholBufferInfo(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + &m_internalDataInBytes, + &m_workspaceInBytes)); + } + + CUDA_ERROR(cudaMalloc((void**)&m_chol_buffer, m_workspaceInBytes)); + } + + /** + * @brief The lower level api of matrix factorization buffer release. + */ + void spmat_chol_buffer_free() + { + CUDA_ERROR(cudaFree(m_chol_buffer)); + } + + /** + * @brief The lower level api of matrix factorization and save the + * factorization result in to the buffer. 
+ */ + void spmat_chol_factor() + { + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + m_chol_buffer)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholFactor(m_cusolver_sphandle, + m_row_size, + m_nnz, + m_descr, + m_d_solver_val, + m_d_solver_row_ptr, + m_d_solver_col_idx, + m_chol_info, + m_chol_buffer)); + } + + double tol = 1.0e-8; + int singularity; + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholZeroPivot( + m_cusolver_sphandle, m_chol_info, tol, &singularity)); + } + if (0 <= singularity) { + RXMESH_WARN( + "WARNING: the matrix is singular at row {} under tol ({})", + singularity, + tol); + } + } + + /** + * @brief The lower level api of solving the linear system after using + * cholesky factorization. The format follows Ax=b to solve x, where A is + * the sparse matrix, x and b are device array. As long as A doesn't change. + * This function could be called for many different b and x. + * @param d_b: device array of b + * @param d_x: device array of x + */ + void spmat_chol_solve(T* d_b, T* d_x) + { + + T* d_solver_b; + T* d_solver_x; + + if (m_use_reorder) { + /* purmute b and x*/ + CUDA_ERROR(cudaMalloc((void**)&d_solver_b, m_row_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_b, d_solver_b, m_row_size); + + CUDA_ERROR(cudaMalloc((void**)&d_solver_x, m_col_size * sizeof(T))); + detail::permute_gather(m_d_permute, d_x, d_solver_x, m_row_size); + } else { + d_solver_b = d_b; + d_solver_x = d_x; + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpScsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_solver_b, + d_solver_x, + m_chol_info, + m_chol_buffer)); + } + + if constexpr (std::is_same_v) { + CUSOLVER_ERROR(cusolverSpDcsrcholSolve(m_cusolver_sphandle, + m_row_size, + d_solver_b, + d_solver_x, + m_chol_info, + m_chol_buffer)); + } + + if (m_use_reorder) { + detail::permute_scatter(m_d_permute, d_solver_x, d_x, m_row_size); + GPU_FREE(d_solver_b); + GPU_FREE(d_solver_x); + } + } + + /* Host data compatibility */ + /** + * @brief move the data between host an device + */ + void move(locationT source, locationT target, cudaStream_t stream = NULL) + { + if (source == target) { + RXMESH_WARN( + "SparseMatrix::move() source ({}) and target ({}) " + "are the same.", + location_to_string(source), + location_to_string(target)); + return; + } + + if ((source == HOST || source == DEVICE) && + ((source & m_allocated) != source)) { + RXMESH_ERROR( + "SparseMatrix::move() moving source is not valid" + " because it was not allocated on source i.e., {}", + location_to_string(source)); + } + + if (((target & HOST) == HOST || (target & DEVICE) == DEVICE) && + ((target & m_allocated) != target)) { + RXMESH_WARN( + "SparseMatrix::move() allocating target before moving to {}", + location_to_string(target)); + allocate(target); + } + + if (source == HOST && target == DEVICE) { + CUDA_ERROR(cudaMemcpyAsync(m_d_val, + m_h_val, + m_nnz * sizeof(T), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_row_ptr, + m_h_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_d_col_idx, + m_h_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyHostToDevice, + stream)); + 
} else if (source == DEVICE && target == HOST) { + CUDA_ERROR(cudaMemcpyAsync(m_h_val, + m_d_val, + m_nnz * sizeof(T), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_row_ptr, + m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + CUDA_ERROR(cudaMemcpyAsync(m_h_col_idx, + m_d_col_idx, + m_nnz * sizeof(IndexT), + cudaMemcpyDeviceToHost, + stream)); + } + } + + /** + * @brief release the data on host or device + */ + void release(locationT location = LOCATION_ALL) + { + if (((location & HOST) == HOST) && ((m_allocated & HOST) == HOST)) { + free(m_h_val); + free(m_h_row_ptr); + free(m_h_col_idx); + m_h_val = nullptr; + m_h_row_ptr = nullptr; + m_h_col_idx = nullptr; + m_allocated = m_allocated & (~HOST); + } + + if (((location & DEVICE) == DEVICE) && + ((m_allocated & DEVICE) == DEVICE)) { + GPU_FREE(m_d_val); + GPU_FREE(m_d_row_ptr); + GPU_FREE(m_d_col_idx); + m_allocated = m_allocated & (~DEVICE); + } + } + + /** + * @brief allocate the data on host or device + */ + void allocate(locationT location) + { + if ((location & HOST) == HOST) { + release(HOST); + + m_h_val = static_cast(malloc(m_nnz * sizeof(T))); + m_h_row_ptr = + static_cast(malloc((m_row_size + 1) * sizeof(IndexT))); + m_h_col_idx = static_cast(malloc(m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | HOST; + } + + if ((location & DEVICE) == DEVICE) { + release(DEVICE); + + CUDA_ERROR(cudaMalloc((void**)&m_d_val, m_nnz * sizeof(T))); + CUDA_ERROR(cudaMalloc((void**)&m_d_row_ptr, + (m_row_size + 1) * sizeof(IndexT))); + CUDA_ERROR( + cudaMalloc((void**)&m_d_col_idx, m_nnz * sizeof(IndexT))); + + m_allocated = m_allocated | DEVICE; + } + } + + + private: + const Context m_context; + cusparseHandle_t m_cusparse_handle; + cusolverSpHandle_t m_cusolver_sphandle; + cusparseSpMatDescr_t m_spdescr; + cusparseMatDescr_t m_descr; + + IndexT m_row_size; + IndexT m_col_size; + IndexT m_nnz; + + // device csr data + IndexT* m_d_row_ptr; + IndexT* m_d_col_idx; + T* m_d_val; + + // host csr data + IndexT* m_h_row_ptr; + IndexT* m_h_col_idx; + T* m_h_val; + + // susparse buffer + size_t m_spmm_buffer_size; + size_t m_spmv_buffer_size; + + // lower level API parameters + csrcholInfo_t m_chol_info; + size_t m_internalDataInBytes; + size_t m_workspaceInBytes; + void* m_chol_buffer; + + // purmutation array + IndexT* m_h_permute; + IndexT* m_d_permute; + + // CSR matrix for solving only + // equal to the original matrix if not permutated + // only allocated as a new CSR matrix if permutated + bool m_reorder_allocated; + IndexT* m_d_solver_row_ptr; + IndexT* m_d_solver_col_idx; + T* m_d_solver_val; + + // flags + bool m_use_reorder; + locationT m_allocated; }; } // namespace rxmesh \ No newline at end of file diff --git a/include/rxmesh/util/macros.h b/include/rxmesh/util/macros.h index 8cf0e34f..0ad4be4b 100644 --- a/include/rxmesh/util/macros.h +++ b/include/rxmesh/util/macros.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include "rxmesh/util/log.h" @@ -48,6 +50,67 @@ inline void HandleError(cudaError_t err, const char* file, int line) } #define CUDA_ERROR(err) (HandleError(err, __FILE__, __LINE__)) +inline void cusparseHandleError(cusparseStatus_t status, + const char* file, + const int line) +{ + if (status != CUSPARSE_STATUS_SUCCESS) { + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSPARSE ERROR: {}", + cusparseGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + 
return; +} +#define CUSPARSE_ERROR(err) (cusparseHandleError(err, __FILE__, __LINE__)) + + +static inline void cusolverHandleError(cusolverStatus_t status, + const char* file, + const int line) +{ + if (status != CUSOLVER_STATUS_SUCCESS) { + auto cusolverGetErrorString = [](cusolverStatus_t status) { + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "UNKNOWN_ERROR"; + } + }; + + Log::get_logger()->error("Line {} File {}", line, file); + Log::get_logger()->error("CUSOLVER ERROR: {}", + cusolverGetErrorString(status)); +#ifdef _WIN32 + system("pause"); +#else + exit(EXIT_FAILURE); +#endif + } + return; +} +#define CUSOLVER_ERROR(err) (cusolverHandleError(err, __FILE__, __LINE__)) + + // GPU_FREE #define GPU_FREE(ptr) \ if (ptr != nullptr) { \ diff --git a/tests/RXMesh_test/test_sparse_matrix.cuh b/tests/RXMesh_test/test_sparse_matrix.cuh index 1fc4014c..c4bd941a 100644 --- a/tests/RXMesh_test/test_sparse_matrix.cuh +++ b/tests/RXMesh_test/test_sparse_matrix.cuh @@ -1,7 +1,8 @@ #include "gtest/gtest.h" #include "rxmesh/attribute.h" -#include "rxmesh/matrix/matrix_operation.cuh" +#include "rxmesh/matrix/dense_matrix.cuh" +#include "rxmesh/matrix/sparse_matrix.cuh" #include "rxmesh/query.cuh" #include "rxmesh/rxmesh_static.h" @@ -11,7 +12,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, IndexT* vet_degree) { using namespace rxmesh; - auto compute_valence = [&](VertexHandle& v_id, const VertexIterator& iter) { auto ids = v_id.unpack(); uint32_t patch_id = ids.first; @@ -26,26 +26,6 @@ __global__ static void sparse_mat_test(const rxmesh::Context context, query.dispatch(block, shrd_alloc, compute_valence); } -template -__global__ static void sparse_mat_query_test( - const rxmesh::Context context, - rxmesh::SparseMatrix sparse_mat) -{ - using namespace rxmesh; - auto fillin = [&](VertexHandle& v_id, const VertexIterator& iter) { - sparse_mat(v_id, v_id) = 2; - for (uint32_t v = 0; v < iter.size(); ++v) { - sparse_mat(v_id, iter[v]) = 2; - sparse_mat(iter[v], v_id) = 2; - } - }; - - auto block = cooperative_groups::this_thread_block(); - Query query(context); - ShmemAllocator shrd_alloc; - query.dispatch(block, shrd_alloc, fillin); -} - template __global__ static void sparse_mat_edge_len_test( const rxmesh::Context context, @@ -61,8 +41,7 @@ __global__ static void sparse_mat_edge_len_test( uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - sparse_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; arr_ref[row_index] = 0; sparse_mat(v_id, v_id) = 0; @@ -72,9 +51,9 @@ __global__ static void sparse_mat_edge_len_test( Vector<3, T> vi_coord( coords(iter[v], 0), coords(iter[v], 1), coords(iter[v], 2)); - sparse_mat(v_id, iter[v]) = dist(v_coord, vi_coord); + sparse_mat(v_id, iter[v]) = 1; // dist(v_coord, vi_coord); - 
arr_ref[row_index] += dist(v_coord, vi_coord); + arr_ref[row_index] += 1; // dist(v_coord, vi_coord); } }; @@ -93,11 +72,11 @@ __global__ void spmat_multi_hardwired_kernel(T* vec, int tid = threadIdx.x + blockIdx.x * blockDim.x; float sum = 0; if (tid < N) { - uint32_t start = sparse_mat.m_d_row_ptr[tid]; - uint32_t end = sparse_mat.m_d_row_ptr[tid + 1]; + uint32_t start = sparse_mat.get_row_ptr_at(tid); + uint32_t end = sparse_mat.get_row_ptr_at(tid + 1); for (int i = 0; i < end - start; i++) { - sum += vec[sparse_mat.m_d_col_idx[start + i]] * - sparse_mat.m_d_val[start + i]; + sum += vec[sparse_mat.get_col_idx_at(start + i)] * + sparse_mat.get_val_at(start + i); } out[tid] = sum; } @@ -122,12 +101,11 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, uint32_t r_patch_id = r_ids.first; uint16_t r_local_id = r_ids.second; - uint32_t row_index = - A_mat.m_context.m_vertex_prefix[r_patch_id] + r_local_id; + uint32_t row_index = context.m_vertex_prefix[r_patch_id] + r_local_id; - B_mat(row_index, 0) = iter.size() * 7; - B_mat(row_index, 1) = iter.size() * 2; - B_mat(row_index, 2) = iter.size() * 10; + B_mat(row_index, 0) = iter.size() * 7.4f; + B_mat(row_index, 1) = iter.size() * 2.6f; + B_mat(row_index, 2) = iter.size() * 10.3f; X_mat(row_index, 0) = coords(v_id, 0) * v_weight; X_mat(row_index, 1) = coords(v_id, 1) * v_weight; @@ -137,12 +115,13 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, coords(v_id, 0), coords(v_id, 1), coords(v_id, 2)); for (uint32_t v = 0; v < iter.size(); ++v) { T e_weight = 1; - A_mat(v_id, iter[v]) = -time_step * e_weight; + A_mat(v_id, iter[v]) = time_step * e_weight; sum_e_weight += e_weight; } - A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight; + A_mat(v_id, v_id) = v_weight + time_step * sum_e_weight + + iter.size() * iter.size() + 1000000; }; auto block = cooperative_groups::this_thread_block(); @@ -151,6 +130,7 @@ __global__ static void simple_A_X_B_setup(const rxmesh::Context context, query.dispatch(block, shrd_alloc, mat_setup); } +/* Check the access of the sparse matrix in CSR format in device */ TEST(RXMeshStatic, SparseMatrix) { using namespace rxmesh; @@ -197,6 +177,7 @@ TEST(RXMeshStatic, SparseMatrix) rxmesh.prepare_launch_box( {Op::VV}, launch_box, (void*)sparse_mat_test); + // test kernel sparse_mat_test << spmat(rxmesh); - spmat.set_ones(); - - LaunchBox launch_box; - rxmesh.prepare_launch_box( - {Op::VV}, launch_box, (void*)sparse_mat_query_test); - - sparse_mat_query_test - <<>>(rxmesh.get_context(), spmat); - - std::vector h_result(spmat.m_nnz); - CUDA_ERROR(cudaMemcpy(h_result.data(), - spmat.m_d_val, - spmat.m_nnz * sizeof(int), - cudaMemcpyDeviceToHost)); - - std::vector h_ref(spmat.m_nnz, 2); - - for (int i = 0; i < spmat.m_nnz; ++i) { - EXPECT_EQ(h_result[i], h_ref[i]); - } - - spmat.free(); + spmat.free_mat(); } +/* First replace the sparse matrix entry with the edge length and then do spmv + * with an all one array and check the result + */ TEST(RXMeshStatic, SparseMatrixEdgeLen) { using namespace rxmesh; @@ -268,8 +209,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "cube.obj"; - RXMeshStatic rxmesh(obj_path); + RXMeshStatic rxmesh(rxmesh_args.obj_file_name, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -304,11 +244,7 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, spmat, d_arr_ref); - // Spmat matrix multiply - - 
// spmat_multi_hardwired_kernel - // <<>>(d_arr_ones, spmat, d_result, num_vertices); - spmat_arr_mul(spmat, d_arr_ones, d_result); + spmat.arr_mul(d_arr_ones, d_result); // copy the value back to host std::vector h_arr_ref(num_vertices); @@ -330,9 +266,13 @@ TEST(RXMeshStatic, SparseMatrixEdgeLen) CUDA_ERROR(cudaFree(d_arr_ref)); CUDA_ERROR(cudaFree(d_arr_ones)); CUDA_ERROR(cudaFree(d_result)); - spmat.free(); + spmat.free_mat(); } +/* set up a simple AX=B system where A is a sparse matrix, B and C are dense + * matrix. Solve it using the warpped up cusolver API and check the final AX + * with B using warpped up cusparse API. + */ TEST(RXMeshStatic, SparseMatrixSimpleSolve) { using namespace rxmesh; @@ -341,8 +281,8 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) cuda_query(0); // generate rxmesh obj - std::string obj_path = STRINGIFY(INPUT_DIR) "dragon.obj"; - RXMeshStatic rxmesh(obj_path); + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); uint32_t num_vertices = rxmesh.get_num_vertices(); @@ -366,21 +306,99 @@ TEST(RXMeshStatic, SparseMatrixSimpleSolve) launch_box.smem_bytes_dyn>>>( rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); - spmat_linear_solve(A_mat, B_mat, X_mat, Solver::CHOL, Reorder::NONE); + A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + // timing begins for spmm + GPUTimer timer; + timer.start(); - spmat_denmat_mul(A_mat, X_mat, ret_mat); - // spmat_denmat_mul_cw(A_mat, X_mat, ret_mat); + A_mat.denmat_mul(X_mat, ret_mat); - std::vector h_ret_mat(num_vertices * 3); - cudaMemcpy(h_ret_mat.data(), - ret_mat.data(), - num_vertices * 3, - cudaMemcpyDeviceToHost); - std::vector h_B_mat(num_vertices * 3); - cudaMemcpy( - h_B_mat.data(), B_mat.data(), num_vertices * 3, cudaMemcpyDeviceToHost); + timer.stop(); + RXMESH_TRACE("SPMM_rxmesh() took {} (ms) ", timer.elapsed_millis()); - for (uint32_t i = 0; i < num_vertices * 3; ++i) { - EXPECT_NEAR(h_ret_mat[i], h_B_mat[i], 1e-3); + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } + } + A_mat.free_mat(); +} + +TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve) +{ + using namespace rxmesh; + + // Select device + cuda_query(0); + + // generate rxmesh obj + std::string obj_path = rxmesh_args.obj_file_name; + RXMeshStatic rxmesh(obj_path, rxmesh_args.quite); + + uint32_t num_vertices = rxmesh.get_num_vertices(); + + const uint32_t threads = 256; + const uint32_t blocks = DIVIDE_UP(num_vertices, threads); + + auto coords = rxmesh.get_input_vertex_coordinates(); + SparseMatrix A_mat(rxmesh); + DenseMatrix X_mat(num_vertices, 3); + DenseMatrix B_mat(num_vertices, 3); + DenseMatrix ret_mat(num_vertices, 3); + + float time_step = 1.f; + + LaunchBox launch_box; + rxmesh.prepare_launch_box( + {Op::VV}, launch_box, (void*)simple_A_X_B_setup); + + simple_A_X_B_setup<<>>( + rxmesh.get_context(), *coords, A_mat, X_mat, B_mat, time_step); + + // A_mat.spmat_linear_solve(B_mat, X_mat, Solver::CHOL, Reorder::NSTDIS); + + A_mat.spmat_chol_reorder(Reorder::NSTDIS); + A_mat.spmat_chol_analysis(); + A_mat.spmat_chol_buffer_alloc(); + 
A_mat.spmat_chol_factor(); + + for (int i = 0; i < B_mat.m_col_size; ++i) { + A_mat.spmat_chol_solve(B_mat.col_data(i), X_mat.col_data(i)); + } + + A_mat.spmat_chol_buffer_free(); + + A_mat.denmat_mul(X_mat, ret_mat); + + std::vector h_ret_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_ret_mat.data(), + ret_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + std::vector h_B_mat(num_vertices); + CUDA_ERROR(cudaMemcpy(h_B_mat.data(), + B_mat.data(), + num_vertices * 3 * sizeof(float), + cudaMemcpyDeviceToHost)); + + for (uint32_t i = 0; i < num_vertices; ++i) { + for (uint32_t j = 0; j < 3; ++j) { + EXPECT_NEAR(h_ret_mat[i][j], h_B_mat[i][j], 1e-3); + } } + A_mat.free_mat(); } \ No newline at end of file
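
Usage note (reviewer sketch, not part of the patch): the snippet below condenses the end-to-end flow that mcf_rxmesh_cusolver_chol in apps/MCF/mcf_sparse_matrix.cuh follows with the new high-level API — assemble A and B in a VV-query kernel, solve through the cuSOLVER Cholesky wrapper, and move the result to the host. The mesh path and the fill kernel are placeholders; only the SparseMatrix/DenseMatrix calls are taken from this diff.

    using namespace rxmesh;

    RXMeshStatic rx("mesh.obj");               // placeholder input mesh
    uint32_t n = rx.get_num_vertices();

    SparseMatrix<float> A(rx);                 // VV sparsity pattern of the mesh
    DenseMatrix<float>  B(n, 3), X(n, 3);

    // ... launch a VV-query kernel (prepare_launch_box + Query::dispatch) that
    // fills A and B, e.g. the mcf_A_X_setup / mcf_B_setup kernels in this patch ...

    // high-level solve: Cholesky factorization with nested-dissection reordering
    A.spmat_linear_solve(B, X, Solver::CHOL, Reorder::NSTDIS);

    X.move(DEVICE, HOST);                      // X(row, col) is now readable on the host
    A.free_mat();
    X.release();
    B.release();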
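
For repeated solves against the same matrix, the lower-level Cholesky API added to sparse_matrix.cuh factors A once and reuses the factorization for each right-hand side, mirroring TEST(RXMeshStatic, SparseMatrixLowerLevelAPISolve); a minimal sketch assuming the A, B, X objects from the snippet above:

    A.spmat_chol_reorder(Reorder::NSTDIS);   // or Reorder::NONE to solve unpermuted
    A.spmat_chol_analysis();
    A.spmat_chol_buffer_alloc();
    A.spmat_chol_factor();

    // the factorization is reused for every column of B
    for (int i = 0; i < B.m_col_size; ++i) {
        A.spmat_chol_solve(B.col_data(i), X.col_data(i));
    }

    A.spmat_chol_buffer_free();

    // optional check, as in the tests: ret = A * X should reproduce B
    DenseMatrix<float> ret(n, 3);
    A.denmat_mul(X, ret);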