diff --git a/apps/SECPriority/CMakeLists.txt b/apps/SECPriority/CMakeLists.txt
index c3e7d826..7b66fd4c 100644
--- a/apps/SECPriority/CMakeLists.txt
+++ b/apps/SECPriority/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCE_LIST
     secp.cu
     secp_rxmesh.cuh
     secp_kernels.cuh
+    secp_pair.h
 )
 
 target_sources(SECPriority
diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
index 35aca5bd..65f90229 100644
--- a/apps/SECPriority/secp_kernels.cuh
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -2,14 +2,12 @@
 #include "../Remesh/link_condition.cuh"
 #include "rxmesh/cavity_manager.cuh"
 
-#include
-#include
+#include "secp_pair.h"
 
 template <typename T, uint32_t blockThreads>
 __global__ static void secp(rxmesh::Context             context,
                             rxmesh::VertexAttribute<T>  coords,
-                            const int                   reduce_threshold,
-                            rxmesh::EdgeAttribute<bool> e_pop_attr)
+                            rxmesh::EdgeAttribute<bool> to_collapse)
 {
     using namespace rxmesh;
     auto block = cooperative_groups::this_thread_block();
@@ -40,12 +38,12 @@ __global__ static void secp(rxmesh::Context context,
     ev_query.prologue(block, shrd_alloc);
     block.sync();
 
-    // 1a) mark edge we want to collapse given e_pop_attr
+    // 1a) mark edge we want to collapse given to_collapse
     for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
         assert(eh.local_id() < cavity.patch_info().num_edges[0]);
-        // edge_mask.set(eh.local_id(), e_pop_attr(eh));
-        if (true == e_pop_attr(eh)) {
+        // edge_mask.set(eh.local_id(), to_collapse(eh));
+        if (to_collapse(eh)) {
             edge_mask.set(eh.local_id(), true);
         }
     });
@@ -73,7 +71,7 @@ __global__ static void secp(rxmesh::Context context,
     ev_query.epilogue(block, shrd_alloc);
 
     // create the cavity
-    if (cavity.prologue(block, shrd_alloc, coords)) {
+    if (cavity.prologue(block, shrd_alloc, coords, to_collapse)) {
 
         edge_mask.reset(block);
         block.sync();
@@ -145,7 +143,7 @@ template
 __global__ static void compute_edge_priorities(
     rxmesh::Context                  context,
     const rxmesh::VertexAttribute<T> coords,
-    PQView_t                         pq_view,
+    PQViewT                          pq_view,
     size_t                           pq_num_bytes)
 {
     using namespace rxmesh;
@@ -154,8 +152,9 @@ __global__ static void compute_edge_priorities(
     ShmemAllocator shrd_alloc;
     Query<blockThreads> query(context);
 
-    auto intermediatePairs =
-        shrd_alloc.alloc<PriorityPair_t>(query.get_patch_info().num_edges[0]);
+
+    PriorityPairT* s_pairs =
+        shrd_alloc.alloc<PriorityPairT>(query.get_patch_info().num_edges[0]);
 
     __shared__ int pair_counter;
     pair_counter = 0;
@@ -163,27 +162,24 @@ __global__ static void compute_edge_priorities(
         const VertexHandle v0 = iter[0];
         const VertexHandle v1 = iter[1];
 
-        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
-        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
+        const vec3<T> p0 = coords.to_glm<3>(v0);
+        const vec3<T> p1 = coords.to_glm<3>(v1);
 
-        T len2 = glm::distance2(p0, p1);
+        const T len2 = glm::distance2(p0, p1);
 
-        auto p_e = rxmesh::detail::unpack(eh.unique_id());
-        // printf("p_id:%u\te_id:%hu\n", p_e.first, p_e.second);
-        // printf("e_id:%llu\t, len:%f\n", eh.unique_id(), len2);
+        assert(eh.patch_id() < (1 << 16));
 
         // repack the EdgeHandle into smaller 32 bits for
         // use with priority queue. Need to check elsewhere
        // that there are less than 2^16 patches.
-        auto id32 = unique_id32(p_e.second, (uint16_t)p_e.first);
-        // auto p_e_32 = unpack32(id32);
-        // printf("32bit p_id:%hu\te_id:%hu\n", p_e_32.first, p_e_32.second);
+        const uint32_t id32 =
+            unique_id32(eh.local_id(), (uint16_t)eh.patch_id());
+
+        const PriorityPairT p{len2, id32};
 
-        PriorityPair_t p{len2, id32};
-        // PriorityPair_t p{len2, eh};
+        int val_counter = atomicAdd(&pair_counter, 1);
 
-        auto val_counter = atomicAdd(&pair_counter, 1);
-        intermediatePairs[val_counter] = p;
+        s_pairs[val_counter] = p;
     };
 
     auto block = cooperative_groups::this_thread_block();
@@ -191,16 +187,13 @@ __global__ static void compute_edge_priorities(
     block.sync();
 
     char* pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
-    pq_view.push(block,
-                 intermediatePairs,
-                 intermediatePairs + pair_counter,
-                 pq_shrd_mem);
+    pq_view.push(block, s_pairs, s_pairs + pair_counter, pq_shrd_mem);
 }
 
 template <uint32_t blockThreads>
 __global__ static void pop_and_mark_edges_to_collapse(
-    PQView_t                    pq_view,
-    rxmesh::EdgeAttribute<bool> marked_edges,
+    PQViewT                     pq_view,
+    rxmesh::EdgeAttribute<bool> to_collapse,
     uint32_t                    pop_num_edges)
 {
     // setup shared memory array to store the popped pairs
@@ -210,24 +203,24 @@ __global__ static void pop_and_mark_edges_to_collapse(
     using namespace rxmesh;
     ShmemAllocator shrd_alloc;
 
-    auto  intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(blockThreads);
-    char* pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));
+    PriorityPairT* s_pairs = shrd_alloc.alloc<PriorityPairT>(blockThreads);
+
+    char* pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));
+
     cg::thread_block g = cg::this_thread_block();
-    pq_view.pop(
-        g, intermediatePairs, intermediatePairs + blockThreads, pq_shrd_mem);
-    int tid       = blockIdx.x * blockDim.x + threadIdx.x;
-    int local_tid = threadIdx.x;
+    pq_view.pop(g, s_pairs, s_pairs + blockThreads, pq_shrd_mem);
+
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
 
     // Make sure the index is within bounds
    if (tid < pop_num_edges) {
-        // printf("tid: %d\n", tid);
 
         // unpack the uid to get the patch and edge ids
-        auto p_e = unpack32(intermediatePairs[local_tid].second);
-        // printf("32bit p_id:%hu\te_id:%hu\n", p_e.first, p_e.second);
-        rxmesh::EdgeHandle eh(p_e.first, rxmesh::LocalEdgeT(p_e.second));
+        auto [patch_id, local_id] = unpack32(s_pairs[threadIdx.x].second);
+
+        EdgeHandle eh(patch_id, LocalEdgeT(local_id));
 
         // use the eh to index into a passed in edge attribute
-        marked_edges(eh) = true;
+        to_collapse(eh) = true;
     }
 }
diff --git a/apps/SECPriority/secp_pair.h b/apps/SECPriority/secp_pair.h
new file mode 100644
index 00000000..0571e18c
--- /dev/null
+++ b/apps/SECPriority/secp_pair.h
@@ -0,0 +1,60 @@
+#pragma once
+
+#include
+#include
+
+
+/**
+ * @brief Return unique index of the local mesh element composed by the
+ * patch id and the local index
+ *
+ * @param local_id the local within-patch mesh element id
+ * @param patch_id the patch owning the mesh element
+ * @return
+ */
+constexpr __device__ __host__ __forceinline__ uint32_t
+unique_id32(const uint16_t local_id, const uint16_t patch_id)
+{
+    uint32_t ret = patch_id;
+    ret          = (ret << 16);
+    ret |= local_id;
+    return ret;
+}
+
+
+/**
+ * @brief unpack a 32 uint to its high and low 16 bits.
+ * This is used to convert the unique id to its local id (16
+ * low bit) and patch id (high 16 bit)
+ * @param uid unique id
+ * @return a std::pair storing the patch id and local id
+ */
+constexpr __device__ __host__ __forceinline__ std::pair<uint16_t, uint16_t>
+    unpack32(uint32_t uid)
+{
+    uint16_t local_id = uid & ((1 << 16) - 1);
+    uint16_t patch_id = uid >> 16;
+    return std::make_pair(patch_id, local_id);
+}
+
+
+/**
+ * @brief less than operator for std::pair
+ * @tparam T
+ */
+template <typename T>
+struct pair_less
+{
+    __host__ __device__ __forceinline__ bool operator()(const T& a,
+                                                        const T& b) const
+    {
+        return a.first < b.first;
+    }
+};
+
+
+// Priority queue setup. Use 'pair_less' to prioritize smaller values.
+using PriorityPairT   = cuco::pair<float, uint32_t>;
+using PriorityCompare = pair_less<PriorityPairT>;
+using PriorityQueueT  = cuco::priority_queue<PriorityPairT, PriorityCompare>;
+using PQViewT         = PriorityQueueT::device_mutable_view;
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
index bddb4b4f..94bc5471 100644
--- a/apps/SECPriority/secp_rxmesh.cuh
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -1,98 +1,10 @@
 #pragma once
-
-#define GLM_ENABLE_EXPERIMENTAL
-#include
-#include
-
-
 #include "rxmesh/query.cuh"
 #include "rxmesh/rxmesh_dynamic.h"
-
-// Priority Queue related includes
-#include
-#include
-
-#include
-#include
-
-/**
- * @brief Return unique index of the local mesh element composed by the
- * patch id and the local index
- *
- * @param local_id the local within-patch mesh element id
- * @param patch_id the patch owning the mesh element
- * @return
- */
-constexpr __device__ __host__ __forceinline__ uint32_t
-unique_id32(const uint16_t local_id, const uint16_t patch_id)
-{
-    uint32_t ret = patch_id;
-    ret = (ret << 16);
-    ret |= local_id;
-    return ret;
-}
-
-/**
- * @brief unpack a 32 uint to its high and low 16 bits.
- * This is used to convert the unique id to its local id (16
- * low bit) and patch id (high 16 bit)
- * @param uid unique id
- * @return a std::pair storing the patch id and local id
- */
-constexpr __device__ __host__ __forceinline__ std::pair<uint16_t, uint16_t>
-    unpack32(uint32_t uid)
-{
-    uint16_t local_id = uid & ((1 << 16) - 1);
-    uint16_t patch_id = uid >> 16;
-    return std::make_pair(patch_id, local_id);
-}
-
-// Priority queue setup. Use 'pair_less' to prioritize smaller values.
-template <typename T>
-struct pair_less
-{
-    __host__ __device__ bool operator()(const T& a, const T& b) const
-    {
-        return a.first < b.first;
-    }
-};
-
-using PriorityPair_t  = cuco::pair<float, uint32_t>;
-using PriorityCompare = pair_less<PriorityPair_t>;
-using PriorityQueue_t = cuco::priority_queue<PriorityPair_t, PriorityCompare>;
-using PQView_t        = PriorityQueue_t::device_mutable_view;
-
-
-template <typename T>
-using Vec3 = glm::vec<3, T, glm::defaultp>;
-
-#include "secp_kernels.cuh"
-
 #include "rxmesh/util/report.h"
 
-template <typename T>
-void render_edge_attr(
-    rxmesh::RXMeshDynamic&                           rx,
-    const std::shared_ptr<rxmesh::EdgeAttribute<T>>& edge_attr)
-{
-    using namespace rxmesh;
-    // make sure the attribute is on the HOST
-    edge_attr->move(DEVICE, HOST);
-
-    std::vector<float> edgeColors(rx.get_num_edges());
-    rx.for_each_edge(HOST, [&](EdgeHandle eh) {
-        if (true == (*edge_attr)(eh)) {
-            edgeColors[rx.linear_id(eh)] = 200.0f;
-        } else {
-            edgeColors[rx.linear_id(eh)] = eh.patch_id();
-        }
-    });
-
-    auto ps_mesh = rx.get_polyscope_mesh();
-    auto edge_colors =
-        ps_mesh->addEdgeScalarQuantity("Edges to Collapse", edgeColors);
-    edge_colors->setEnabled(true);
-}
+#include "secp_kernels.cuh"
+#include "secp_pair.h"
 
 inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                         const uint32_t         final_num_vertices,
@@ -103,7 +15,7 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     using namespace rxmesh;
     constexpr uint32_t blockThreads = 256;
 
-    rxmesh::Report report("SECP_RXMesh");
+    Report report("SECP_RXMesh");
     report.command_line(Arg.argc, Arg.argv);
     report.device();
     report.system();
@@ -115,15 +27,17 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     LaunchBox<blockThreads> launch_box;
 
-    float total_time      = 0;
-    float app_time        = 0;
-    float slice_time      = 0;
-    float cleanup_time    = 0;
-    float pq_time         = 0;
-    float pop_mark_time   = 0;
-    float e_priority_time = 0;
+    Timers<GPUTimer> timers;
+    timers.add("Total");
 
-    auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", 1);
+    timers.add("App");
+    timers.add("Slice");
+    timers.add("Cleanup");
+    timers.add("PriorityQueue");
+    timers.add("PriorityQueuePop");
+    timers.add("EdgePriority");
+
+    auto to_collapse = rx.add_edge_attribute<bool>("ePop", 1);
 
     RXMESH_INFO("#Vertices {}", rx.get_num_vertices());
     RXMESH_INFO("#Edges {}", rx.get_num_edges());
@@ -147,20 +61,18 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     int num_passes = 0;
 
     CUDA_ERROR(cudaProfilerStart());
-    GPUTimer timer;
-    timer.start();
+
+    timers.start("Total");
     while (rx.get_num_vertices(true) > final_num_vertices) {
         ++num_passes;
 
-        GPUTimer pq_timer;
-        pq_timer.start();
+        timers.start("PriorityQueue");
 
-        // rebuild every round? Not necessarily a great way to use a pq.
-        PriorityQueue_t pq(rx.get_num_edges());
-        e_pop_attr->reset(false, DEVICE);
-
-        // rx.prepare_launch_box(
-        rx.update_launch_box(
+        // rebuild every round? Not necessarily a great way to use a priority
+        // queue.
+        PriorityQueueT priority_queue(rx.get_num_edges());
+        to_collapse->reset(false, DEVICE);
+        rx.prepare_launch_box(
            {Op::EV},
            launch_box,
            (void*)compute_edge_priorities<float, blockThreads>,
@@ -171,70 +83,58 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
            [&](uint32_t v, uint32_t e, uint32_t f) {
                // Allocate enough additional memory
                // for the priority queue and the intermediate
-               // array of PriorityPair_t.
-               return pq.get_shmem_size(blockThreads) +
-                      (e * sizeof(PriorityPair_t));
+               // array of PriorityPairT.
+               return priority_queue.get_shmem_size(blockThreads) +
+                      (e * sizeof(PriorityPairT));
            });
 
-        GPUTimer edge_priorities_timer;
-        edge_priorities_timer.start();
+        timers.start("EdgePriority");
         compute_edge_priorities<float, blockThreads>
-            <<<launch_box.blocks,
-               launch_box.num_threads,
-               launch_box.smem_bytes_dyn>>>(rx.get_context(),
-                                            *coords,
-                                            pq.get_mutable_device_view(),
-                                            pq.get_shmem_size(blockThreads));
-        edge_priorities_timer.stop();
-        e_priority_time += edge_priorities_timer.elapsed_millis();
-        // cudaDeviceSynchronize();
-        // RXMESH_TRACE("launch_box.smem_bytes_dyn = {}",
-        //     launch_box.smem_bytes_dyn); RXMESH_TRACE("pq.get_shmem_size = {}",
-        //     pq.get_shmem_size(blockThreads));
+            <<<launch_box.blocks,
+               launch_box.num_threads,
+               launch_box.smem_bytes_dyn>>>(
+                rx.get_context(),
+                *coords,
+                priority_queue.get_mutable_device_view(),
+                priority_queue.get_shmem_size(blockThreads));
+        timers.stop("EdgePriority");
 
         // Next kernel needs to pop some percentage of the top
         // elements in the priority queue and store popped elements
         // to be used by the next kernel that actually does the collapses
-
-        float reduce_ratio = edge_reduce_ratio;
-        const int num_edges_before = int(rx.get_num_edges());
+        const int num_edges_before = int(rx.get_num_edges(true));
         const int reduce_threshold =
-            std::max(1, int(reduce_ratio * float(num_edges_before)));
+            std::max(1, int(edge_reduce_ratio * float(num_edges_before)));
 
         // Mark the edge attributes to be collapsed
-        uint32_t pop_num_edges =
-            reduce_threshold;  // reduce_ratio * rx.get_num_edges();
-        // RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
+        uint32_t pop_num_edges = reduce_threshold;
 
         constexpr uint32_t threads_per_block = 256;
         uint32_t number_of_blocks =
            (pop_num_edges + threads_per_block - 1) / threads_per_block;
-        int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
-                               (threads_per_block * sizeof(PriorityPair_t));
-        // RXMESH_TRACE("threads_per_block: {}", threads_per_block);
-        // RXMESH_TRACE("number_of_blocks: {}", number_of_blocks);
-        // RXMESH_TRACE("shared_mem_bytes: {}", shared_mem_bytes);
-
-        GPUTimer pop_mark_timer;
-        pop_mark_timer.start();
+        int shared_mem_bytes =
+            priority_queue.get_shmem_size(threads_per_block) +
+            (threads_per_block * sizeof(PriorityPairT));
+
+        timers.start("PriorityQueuePop");
         pop_and_mark_edges_to_collapse<threads_per_block>
            <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>(
-                pq.get_mutable_device_view(), *e_pop_attr, pop_num_edges);
+                priority_queue.get_mutable_device_view(),
+                *to_collapse,
+                pop_num_edges);
 
-        // if(num_passes == 1)
-        // {
-        //     render_edge_attr(rx, e_pop_attr);
-        // }
-        CUDA_ERROR(cudaDeviceSynchronize());
-        CUDA_ERROR(cudaGetLastError());
-        pop_mark_timer.stop();
-        pop_mark_time += pop_mark_timer.elapsed_millis();
+        timers.stop("PriorityQueuePop");
 
-        pq_timer.stop();
+        timers.stop("PriorityQueue");
 
-        pq_time += pq_timer.elapsed_millis();
+        {
+            to_collapse->move(DEVICE, HOST);
+            rx.get_polyscope_mesh()->addEdgeScalarQuantity("ToCollapse",
+                                                           *to_collapse);
+            polyscope::show();
+        }
 
-        // loop over the mesh, and try to collapse
+        // loop over the mesh, and try to collapse
         rx.reset_scheduler();
         while (!rx.is_queue_empty() &&
                rx.get_num_vertices(true) > final_num_vertices) {
@@ -257,67 +157,58 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
                        3 * ShmemAllocator::default_alignment;
                 });
 
-            max_smem_bytes_dyn =
-                std::max(max_smem_bytes_dyn, launch_box.smem_bytes_dyn);
-            max_smem_bytes_static =
-                std::max(max_smem_bytes_static, launch_box.smem_bytes_static);
-            max_num_registers_per_thread =
-                std::max(max_num_registers_per_thread,
-                         launch_box.num_registers_per_thread);
-            max_num_blocks =
-                std::max(max_num_blocks, DIVIDE_UP(launch_box.blocks, 8));
-            GPUTimer app_timer;
-
-            app_timer.start();
+            timers.start("App");
             secp<float, blockThreads>
                 <<<launch_box.blocks,
                    launch_box.num_threads,
                    launch_box.smem_bytes_dyn>>>(
-                    rx.get_context(), *coords, reduce_threshold, *e_pop_attr);
-            // should we cudaDeviceSyn here? stopping timers too soon?
-            // CUDA_ERROR(cudaDeviceSynchronize());
-            // CUDA_ERROR(cudaGetLastError());
+                    rx.get_context(), *coords, *to_collapse);
+            timers.stop("App");
 
-            app_timer.stop();
-
-            GPUTimer cleanup_timer;
-            cleanup_timer.start();
+            timers.start("Cleanup");
             rx.cleanup();
-            cleanup_timer.stop();
+            timers.stop("Cleanup");
 
-            GPUTimer slice_timer;
-            slice_timer.start();
+            timers.start("Slice");
             rx.slice_patches(*coords);
-            slice_timer.stop();
+            timers.stop("Slice");
 
-            GPUTimer cleanup_timer2;
-            cleanup_timer2.start();
+            timers.start("Cleanup");
             rx.cleanup();
-            cleanup_timer2.stop();
+            timers.stop("Cleanup");
+        }
+        {
+            rx.update_polyscope();
 
-            CUDA_ERROR(cudaDeviceSynchronize());
-            CUDA_ERROR(cudaGetLastError());
+            auto ps_mesh = rx.get_polyscope_mesh();
+            ps_mesh->updateVertexPositions(*coords);
+            ps_mesh->setEnabled(false);
 
-            app_time += app_timer.elapsed_millis();
-            slice_time += slice_timer.elapsed_millis();
-            cleanup_time += cleanup_timer.elapsed_millis();
-            cleanup_time += cleanup_timer2.elapsed_millis();
+            rx.render_vertex_patch();
+            rx.render_edge_patch();
+            rx.render_face_patch();
+            polyscope::show();
         }
     }
 
-    timer.stop();
-    total_time += timer.elapsed_millis();
+    timers.stop("Total");
+
+    CUDA_ERROR(cudaProfilerStop());
 
     RXMESH_INFO("secp_rxmesh() RXMesh SEC took {} (ms), num_passes= {}",
-                total_time,
+                timers.elapsed_millis("Total"),
                 num_passes);
-    RXMESH_INFO("secp_rxmesh() PriorityQ time {} (ms)", pq_time);
+    RXMESH_INFO("secp_rxmesh() PriorityQ time {} (ms)",
+                timers.elapsed_millis("PriorityQueue"));
     RXMESH_INFO("secp_rxmesh() |-Edge priorities time {} (ms)",
-                e_priority_time);
-    RXMESH_INFO("secp_rxmesh() |-Pop and Mark time {} (ms)", pop_mark_time);
-    RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
-    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
-    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
+                timers.elapsed_millis("EdgePriority"));
+    RXMESH_INFO("secp_rxmesh() |-Pop and Mark time {} (ms)",
+                timers.elapsed_millis("PriorityQueuePop"));
+    RXMESH_INFO("secp_rxmesh() App time {} (ms)", timers.elapsed_millis("App"));
+    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)",
+                timers.elapsed_millis("Slice"));
+    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)",
+                timers.elapsed_millis("Cleanup"));
 
     RXMESH_INFO("#Vertices {}", rx.get_num_vertices(true));
     RXMESH_INFO("#Edges {}", rx.get_num_edges(true));
 
@@ -326,8 +217,8 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
 
     rx.update_host();
 
-    coords->move(DEVICE, HOST);
+    EXPECT_TRUE(rx.validate());
 
     report.add_member("num_passes", num_passes);
     report.add_member("max_smem_bytes_dyn", max_smem_bytes_dyn);
@@ -335,11 +226,12 @@ inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
     report.add_member("max_num_registers_per_thread",
                       max_num_registers_per_thread);
     report.add_member("max_num_blocks", max_num_blocks);
-    report.add_member("secp_remesh_time", total_time);
-    report.add_member("priority_queue_time", pq_time);
-    report.add_member("app_time", app_time);
-    report.add_member("slice_time", slice_time);
-    report.add_member("cleanup_time", cleanup_time);
+    report.add_member("secp_remesh_time", timers.elapsed_millis("Total"));
+    report.add_member("priority_queue_time",
+                      timers.elapsed_millis("PriorityQueue"));
+    report.add_member("app_time", timers.elapsed_millis("App"));
+    report.add_member("slice_time", timers.elapsed_millis("Slice"));
+    report.add_member("cleanup_time", timers.elapsed_millis("Cleanup"));
 
     report.add_member("attributes_memory_mg", coords->get_memory_mg());
 
     report.model_data(Arg.obj_file_name + "_after", rx, "model_after");
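
Reviewer note: the sketch below is not part of the patch. It is a minimal, hypothetical round-trip check of the helpers introduced in secp_pair.h, assuming nvcc (C++17) with cuCollections and secp_pair.h on the include path; the file name and variable names are illustrative only.

// pack_unpack_sketch.cu -- hypothetical check of the secp_pair.h helpers
#include <cassert>
#include <cstdint>

#include "secp_pair.h"

int main()
{
    // Pack a (local edge id, patch id) pair the same way compute_edge_priorities
    // does: the patch id lands in the high 16 bits, the local id in the low 16,
    // which is why the kernel asserts patch_id < 2^16.
    const uint16_t patch_id = 3;
    const uint16_t local_id = 1337;
    const uint32_t key      = unique_id32(local_id, patch_id);

    // unpack32 returns {patch id, local id}, which is exactly what
    // pop_and_mark_edges_to_collapse uses to rebuild the EdgeHandle.
    auto [p, l] = unpack32(key);
    assert(p == patch_id && l == local_id);

    // pair_less compares only .first (the squared edge length), so the
    // priority queue built on PriorityCompare pops shorter edges first.
    const PriorityPairT shorter{0.25f, key};
    const PriorityPairT longer{4.00f, key};
    assert(PriorityCompare{}(shorter, longer));

    return 0;
}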