reorg secp
Ahdhn committed Dec 25, 2024
1 parent 3dd2f42 commit e05473c
Showing 4 changed files with 186 additions and 240 deletions.
1 change: 1 addition & 0 deletions apps/SECPriority/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCE_LIST
secp.cu
secp_rxmesh.cuh
secp_kernels.cuh
secp_pair.h
)

target_sources(SECPriority
75 changes: 34 additions & 41 deletions apps/SECPriority/secp_kernels.cuh
@@ -2,14 +2,12 @@
#include "../Remesh/link_condition.cuh"
#include "rxmesh/cavity_manager.cuh"

#include <cooperative_groups.h>
#include <cuda_runtime.h>
#include "secp_pair.h"

template <typename T, uint32_t blockThreads>
__global__ static void secp(rxmesh::Context context,
rxmesh::VertexAttribute<T> coords,
const int reduce_threshold,
rxmesh::EdgeAttribute<bool> e_pop_attr)
rxmesh::EdgeAttribute<bool> to_collapse)
{
using namespace rxmesh;
auto block = cooperative_groups::this_thread_block();
@@ -40,12 +38,12 @@ __global__ static void secp(rxmesh::Context context,
ev_query.prologue<Op::EV>(block, shrd_alloc);
block.sync();

// 1a) mark edges we want to collapse given e_pop_attr
// 1a) mark edges we want to collapse given to_collapse
for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
assert(eh.local_id() < cavity.patch_info().num_edges[0]);

// edge_mask.set(eh.local_id(), e_pop_attr(eh));
if (true == e_pop_attr(eh)) {
// edge_mask.set(eh.local_id(), to_collapse(eh));
if (to_collapse(eh)) {
edge_mask.set(eh.local_id(), true);
}
});
@@ -73,7 +71,7 @@ __global__ static void secp(rxmesh::Context context,
ev_query.epilogue(block, shrd_alloc);

// create the cavity
if (cavity.prologue(block, shrd_alloc, coords)) {
if (cavity.prologue(block, shrd_alloc, coords, to_collapse)) {
edge_mask.reset(block);
block.sync();

@@ -145,7 +143,7 @@ template <typename T, uint32_t blockThreads>
__global__ static void compute_edge_priorities(
rxmesh::Context context,
const rxmesh::VertexAttribute<T> coords,
PQView_t pq_view,
PQViewT pq_view,
size_t pq_num_bytes)
{
using namespace rxmesh;
@@ -154,53 +152,48 @@ __global__ static void compute_edge_priorities(
ShmemAllocator shrd_alloc;

Query<blockThreads> query(context);
auto intermediatePairs =
shrd_alloc.alloc<PriorityPair_t>(query.get_patch_info().num_edges[0]);

PriorityPairT* s_pairs =
shrd_alloc.alloc<PriorityPairT>(query.get_patch_info().num_edges[0]);
__shared__ int pair_counter;
pair_counter = 0;

auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
const VertexHandle v0 = iter[0];
const VertexHandle v1 = iter[1];

const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
const vec3<T> p0 = coords.to_glm<3>(v0);
const vec3<T> p1 = coords.to_glm<3>(v1);

T len2 = glm::distance2(p0, p1);
const T len2 = glm::distance2(p0, p1);

auto p_e = rxmesh::detail::unpack(eh.unique_id());
// printf("p_id:%u\te_id:%hu\n", p_e.first, p_e.second);
// printf("e_id:%llu\t, len:%f\n", eh.unique_id(), len2);
assert(eh.patch_id() < (1 << 16));

// repack the EdgeHandle into a smaller 32-bit id for
// use with the priority queue. Need to check elsewhere
// that there are fewer than 2^16 patches.
auto id32 = unique_id32(p_e.second, (uint16_t)p_e.first);
// auto p_e_32 = unpack32(id32);
// printf("32bit p_id:%hu\te_id:%hu\n", p_e_32.first, p_e_32.second);
const uint32_t id32 =
unique_id32(eh.local_id(), (uint16_t)eh.patch_id());

const PriorityPairT p{len2, id32};

PriorityPair_t p{len2, id32};
// PriorityPair_t p{len2, eh};
int val_counter = atomicAdd(&pair_counter, 1);

auto val_counter = atomicAdd(&pair_counter, 1);
intermediatePairs[val_counter] = p;
s_pairs[val_counter] = p;
};

auto block = cooperative_groups::this_thread_block();
query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
block.sync();

char* pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
pq_view.push(block,
intermediatePairs,
intermediatePairs + pair_counter,
pq_shrd_mem);
pq_view.push(block, s_pairs, s_pairs + pair_counter, pq_shrd_mem);
}

template <uint32_t blockThreads>
__global__ static void pop_and_mark_edges_to_collapse(
PQView_t pq_view,
rxmesh::EdgeAttribute<bool> marked_edges,
PQViewT pq_view,
rxmesh::EdgeAttribute<bool> to_collapse,
uint32_t pop_num_edges)
{
// set up a shared memory array to store the popped pairs
@@ -210,24 +203,24 @@ __global__ static void pop_and_mark_edges_to_collapse(
using namespace rxmesh;
ShmemAllocator shrd_alloc;

auto intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(blockThreads);
char* pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));
PriorityPairT* s_pairs = shrd_alloc.alloc<PriorityPairT>(blockThreads);

char* pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));

cg::thread_block g = cg::this_thread_block();
pq_view.pop(
g, intermediatePairs, intermediatePairs + blockThreads, pq_shrd_mem);

int tid = blockIdx.x * blockDim.x + threadIdx.x;
int local_tid = threadIdx.x;
pq_view.pop(g, s_pairs, s_pairs + blockThreads, pq_shrd_mem);

int tid = blockIdx.x * blockDim.x + threadIdx.x;

// Make sure the index is within bounds
if (tid < pop_num_edges) {
// printf("tid: %d\n", tid);
// unpack the uid to get the patch and edge ids
auto p_e = unpack32(intermediatePairs[local_tid].second);
// printf("32bit p_id:%hu\te_id:%hu\n", p_e.first, p_e.second);
rxmesh::EdgeHandle eh(p_e.first, rxmesh::LocalEdgeT(p_e.second));
auto [patch_id, local_id] = unpack32(s_pairs[threadIdx.x].second);

EdgeHandle eh(patch_id, LocalEdgeT(local_id));

// use the eh to index into the passed-in edge attribute
marked_edges(eh) = true;
to_collapse(eh) = true;
}
}
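
Taken together, the three kernels form one pass of shortest-edge collapse: compute_edge_priorities pushes every edge's squared length, keyed by a packed 32-bit edge id, into a global cuco priority queue; pop_and_mark_edges_to_collapse pops the shortest pop_num_edges entries and flags them in the to_collapse attribute; and secp collapses the flagged edges through the cavity machinery. Below is a minimal host-side sketch of that ordering. It is illustrative only: secp_pass, the launch-box calls, and the shared-memory arithmetic are assumptions made for this sketch, not the app's actual driver (which lives in secp_rxmesh.cuh).

#include "rxmesh/rxmesh_dynamic.h"
#include "secp_pair.h"

template <uint32_t blockThreads>
void secp_pass(rxmesh::RXMeshDynamic&          rx,
               rxmesh::VertexAttribute<float>& coords,
               rxmesh::EdgeAttribute<bool>&    to_collapse,
               const uint32_t                  num_to_pop,
               const int                       reduce_threshold)
{
    using namespace rxmesh;

    // global queue sized to hold one entry per edge (assumed capacity bound)
    PriorityQueueT pq(rx.get_num_edges());
    PQViewT        pq_view = pq.get_mutable_device_view();

    // shared-memory bytes one block needs for a queue push/pop
    // (assumption: the host queue exposes the same helper as the view)
    const size_t pq_bytes = pq.get_shmem_size(blockThreads);

    // 1) every patch pushes a (squared length, packed 32-bit id) pair per edge
    LaunchBox<blockThreads> lb;
    rx.prepare_launch_box(
        {Op::EV}, lb, (void*)compute_edge_priorities<float, blockThreads>);
    compute_edge_priorities<float, blockThreads>
        <<<lb.blocks, blockThreads, lb.smem_bytes_dyn + pq_bytes>>>(
            rx.get_context(), coords, pq_view, pq_bytes);

    // 2) pop the num_to_pop shortest edges and mark them for collapse;
    //    smem covers the s_pairs scratch array plus the queue's scratch space
    const uint32_t pop_blocks = (num_to_pop + blockThreads - 1) / blockThreads;
    const size_t   pop_smem = blockThreads * sizeof(PriorityPairT) + pq_bytes;
    pop_and_mark_edges_to_collapse<blockThreads>
        <<<pop_blocks, blockThreads, pop_smem>>>(
            pq_view, to_collapse, num_to_pop);

    // 3) collapse the marked edges through the cavity machinery
    rx.prepare_launch_box({Op::EV}, lb, (void*)secp<float, blockThreads>);
    secp<float, blockThreads>
        <<<lb.blocks, blockThreads, lb.smem_bytes_dyn>>>(
            rx.get_context(), coords, reduce_threshold, to_collapse);
}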
60 changes: 60 additions & 0 deletions apps/SECPriority/secp_pair.h
@@ -0,0 +1,60 @@
#pragma once

#include <cstdint>  // uint16_t / uint32_t
#include <utility>  // std::pair / std::make_pair

#include <cuco/pair.cuh>
#include <cuco/priority_queue.cuh>


/**
 * @brief Return a unique index for a local mesh element, composed of the
 * patch id and the local index
 *
 * @param local_id the local within-patch mesh element id
 * @param patch_id the patch owning the mesh element
 * @return a 32-bit id with the patch id in the high 16 bits and the
 * local id in the low 16 bits
 */
constexpr __device__ __host__ __forceinline__ uint32_t
unique_id32(const uint16_t local_id, const uint16_t patch_id)
{
uint32_t ret = patch_id;
ret = (ret << 16);
ret |= local_id;
return ret;
}


/**
 * @brief Unpack a 32-bit uint into its high and low 16 bits.
 * This is used to convert the unique id back to its local id
 * (low 16 bits) and patch id (high 16 bits)
 * @param uid unique id
 * @return a std::pair storing the patch id and local id
 */
constexpr __device__ __host__ __forceinline__ std::pair<uint16_t, uint16_t>
unpack32(uint32_t uid)
{
uint16_t local_id = uid & ((1 << 16) - 1);
uint16_t patch_id = uid >> 16;
return std::make_pair(patch_id, local_id);
}
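
// Example (hypothetical ids): unique_id32(7, 3) packs local id 7 in patch 3
// as 0x00030007, and unpack32(0x00030007u) returns {3, 7}, i.e. {patch id,
// local id}. The two functions are exact inverses whenever the patch id
// fits in 16 bits, which the assert in compute_edge_priorities above
// (eh.patch_id() < (1 << 16)) guards.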


/**
 * @brief less-than operator that orders pairs by their first element
 * @tparam T a pair-like type with a 'first' member (e.g., cuco::pair)
 */
template <typename T>
struct pair_less
{
__host__ __device__ __forceinline__ bool operator()(const T& a,
const T& b) const
{
return a.first < b.first;
}
};


// Priority queue setup. Use 'pair_less' to prioritize smaller values.
using PriorityPairT = cuco::pair<float, uint32_t>;
using PriorityCompare = pair_less<PriorityPairT>;
using PriorityQueueT = cuco::priority_queue<PriorityPairT, PriorityCompare>;
using PQViewT = PriorityQueueT::device_mutable_view;
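
Because PriorityCompare orders pairs by their first element, the queue pops the smallest squared edge length, i.e. the shortest edge, first. Below is a standalone sketch of that behavior; it assumes cuco's experimental priority_queue takes its capacity at construction and exposes host-side push/pop over device iterators, mirroring the device-view calls the kernels above rely on.

#include <thrust/device_vector.h>
#include <vector>
#include "secp_pair.h"

int main()
{
    PriorityQueueT pq(8);  // assumed: capacity passed at construction

    // three fake edges with squared lengths 2.0, 0.5, 1.0
    std::vector<PriorityPairT> h_pairs = {{2.0f, unique_id32(0, 0)},
                                          {0.5f, unique_id32(1, 0)},
                                          {1.0f, unique_id32(2, 0)}};
    thrust::device_vector<PriorityPairT> d_pairs(h_pairs);
    pq.push(d_pairs.begin(), d_pairs.end());

    // popping one element yields the shortest edge: {0.5f, unique_id32(1, 0)}
    thrust::device_vector<PriorityPairT> d_top(1);
    pq.pop(d_top.begin(), d_top.end());

    const PriorityPairT top = d_top[0];
    return top.second == unique_id32(1, 0) ? 0 : 1;
}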
