From cb282339eb8e4f9bd0120ca76b890c17193c3e1e Mon Sep 17 00:00:00 2001
From: ahmed
Date: Wed, 11 Dec 2024 13:57:52 -0500
Subject: [PATCH] patch stash mapper in cavity

---
 include/rxmesh/cavity_manager.cuh      |  18 ++-
 include/rxmesh/cavity_manager_impl.cuh | 146 ++++++++++++++-----------
 include/rxmesh/patch_lock.h            |  12 +-
 3 files changed, 106 insertions(+), 70 deletions(-)

diff --git a/include/rxmesh/cavity_manager.cuh b/include/rxmesh/cavity_manager.cuh
index 70a4bc8b..35828eeb 100644
--- a/include/rxmesh/cavity_manager.cuh
+++ b/include/rxmesh/cavity_manager.cuh
@@ -779,7 +779,7 @@ struct CavityManager

     /**
      * @brief return the number of elements as stored in shared memory based on
-     * template paramter
+     * template parameter
      */
     template
     __device__ __inline__ uint32_t get_num_elements()
@@ -870,6 +870,15 @@ struct CavityManager
         }
     }

+    /**
+     * @brief build patch stash mapper that maps q's patch stash index to p's
+     * patch stash index
+     */
+    __device__ __inline__ void build_patch_stash_mapper(
+        cooperative_groups::thread_block& block,
+        const PatchInfo&                  q_patch_info);
+
+
     // indicate if this block can write its updates to global memory during
     // epilogue
     bool m_write_to_gmem;
@@ -879,7 +888,7 @@ struct CavityManager

     // the prefix sum of the cavities sizes. the size of the cavity is the
     // number of boundary edges in the cavity
-    // this also could have be uint16_t but we use itn since we do atomicAdd on
+    // this also could have been uint16_t but we use int since we do atomicAdd on
     // it
     int* m_s_cavity_size_prefix;

@@ -981,6 +990,11 @@ struct CavityManager
     // patch stash stored in shared memory
     PatchStash m_s_patch_stash;

+    // indexed by q's patch stash id and returns the corresponding p's patch
+    // stash id. if the patch corresponds to p itself, we store INVALID8 - 1.
+    // if the patch does not exist in p's patch stash, we store INVALID8
+    uint8_t* m_s_patch_stash_mapper;
+
     PatchInfo m_patch_info;

     Context m_context;
diff --git a/include/rxmesh/cavity_manager_impl.cuh b/include/rxmesh/cavity_manager_impl.cuh
index a91aa29d..92a37524 100644
--- a/include/rxmesh/cavity_manager_impl.cuh
+++ b/include/rxmesh/cavity_manager_impl.cuh
@@ -380,6 +380,9 @@ CavityManager::alloc_shared_memory(
         m_s_patch_stash.m_stash[i] = m_patch_info.patch_stash.m_stash[i];
     }

+    __shared__ uint8_t p_st2[PatchStash::stash_size];
+    m_s_patch_stash_mapper = p_st2;
+
     // cavity prefix sum
     // this assertion is because when we allocated dynamic shared memory
     // during kernel launch we assumed the number of cavities is at most
@@ -2703,15 +2706,6 @@ CavityManager::soft_migrate_from_patch(
             if (!inserted) {
                 m_s_should_slice[0] = true;
             }
-            // if (!inserted) {
-            //     printf("\n p= %u, load factor = %f, stash load factor =
-            //     %f",
-            //            patch_id(),
-            //            m_patch_info.lp_v.compute_load_factor(m_s_table_v),
-            //            m_patch_info.lp_v.compute_load_factor(
-            //            m_s_table_stash_v));
-            // }
-            // assert(inserted);
         }
         block.sync();
     }
@@ -3276,7 +3270,7 @@ __device__ __inline__ LPPair CavityManager::migrate_vertex(
             assert(owner_stash_id < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(owner_stash_id, true);
         } else if (o != q && o != m_patch_info.patch_id &&
-                   o_stash != INVALID4) {
+                   o_stash != INVALID8) {
             assert(o_stash != INVALID8);
             assert(o_stash < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(o_stash, true);
@@ -3418,7 +3412,7 @@ __device__ __inline__ LPPair CavityManager::migrate_edge(
             assert(owner_stash_id < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(owner_stash_id, true);
         } else if (o != q && o != m_patch_info.patch_id &&
-                   o_stash != INVALID4) {
+                   o_stash != INVALID8) {
             assert(o_stash != INVALID8);
             assert(o_stash < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(o_stash, true);
@@ -3520,7 +3514,7 @@ __device__ __inline__ LPPair CavityManager::migrate_face(
             assert(owner_stash_id < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(owner_stash_id, true);
         } else if (o != q && o != m_patch_info.patch_id &&
-                   o_stash != INVALID4) {
+                   o_stash != INVALID8) {
             assert(o_stash != INVALID8);
             assert(o_stash < m_s_patches_to_lock_mask.size());
             m_s_patches_to_lock_mask.set(o_stash, true);
@@ -3723,11 +3717,25 @@ CavityManager::populate_correspondence(
     }

+    // build patch stash mapper
+    build_patch_stash_mapper(block, q_patch_info);
+    block.sync();
+
+
     // for other not-owned elements in q, we store the local id in the owner
     // patch and patch stash id (in q patch stash). When we store the local id
     // in the owner patch, we set the high bit to one to mark these elements
     // differently for the next for loop, where we check if these elements
     // exists in p
+
+    // q's not-owned elements could be
+    // 1) elements that don't exist at all in p.
+    // 2) elements where the owner patch is p.
+    // 3) elements where the owner patch is k and these
+    // elements have a copy in p. This will be handled in the next for loop
+    //
+    // To handle 1) and 3), we have to convert q's patch stash id to p's patch
+    // stash id and this is what we do in this for loop as well
     int q_num_elements = q_patch_info.get_num_elements()[0];

     assert(q_num_elements <= s_correspondence_size);
@@ -3740,65 +3748,53 @@ CavityManager::populate_correspondence(
             assert(s_correspondence[b] == INVALID16);

             assert(lp.patch_stash_id() < PatchStash::stash_size);
+            assert(lp.patch_stash_id() != INVALID8);
+
             // set the high bit to one
             uint16_t s = lp.local_id_in_owner_patch();
-            s |= (1 << 15);
-            s_correspondence[b] = s;
-
-            // patch stash id in q's patch stash
-            s_correspondence_stash[b] = lp.patch_stash_id();
-            // TODO assert that this entry is actually owned by the other patch
-        }
-    }
-    block.sync();
-
-
-    // The for loop above could result into q's not-owned element that are
-    // 1) don't exists at all in p.
-    // 2) elements where the owner patch is p. This will
-    // handle in this for loop
-    // 3) elements where the owner patch is k and these
-    // elements have a copy in p. This will be handle in the next for loop
-    //
-    // To handle 1) and 3), we have to convert q's patch stash id to p's patch
-    // stash id and this is what we do in this for loop
+            // it might sound wise to assert that the k_patch (which is the
+            // patch pointed to by lp) is the owning patch of s. But, we are
+            // not even sure if we are going to use s at all. Also, later on,
+            // we check if k_patch is the owner (ensure_ownership) and if it
+            // is dirty in migrate(). However, in there, we do this check on
+            // the patches that we are sure that we are going to change their
+            // ownership.
+            //
+            // const uint32_t k_patch =
+            //     q_patch_info.patch_stash.get_patch(lp.patch_stash_id());
+            // assert(m_context.m_patches_info[k_patch].is_owned(LocalT(s)));

-    for (int b = threadIdx.x; b < q_num_elements; b += blockThreads) {
-        // if the highest bit is set
-        const uint16_t corres = s_correspondence[b];
-        if (corres != INVALID16 && (corres & (1 << 15))) {
-            const uint8_t q_stash_id = s_correspondence_stash[b];
-            const uint32_t k_patch =
-                q_patch_info.patch_stash.get_patch(q_stash_id);
-            assert(k_patch != INVALID32);
+            // p's patch stash id that corresponds to q's patch stash id
+            const uint8_t p_stash_id =
+                m_s_patch_stash_mapper[lp.patch_stash_id()];

             // Case 2
-            if (k_patch == patch_id()) {
-                // clear the high bit
-                s_correspondence[b] = corres & ~(1 << 15);
+            if (p_stash_id == INVALID8 - 1u) {
                 s_correspondence_stash[b] = INVALID8;
-                continue;
-            }
-
-            // Case 1, early exit, i.e., if the k patch is not in p's patch
-            // stash, then the element is for sure not in p at all
+            } else {

-            const uint8_t p_stash_id =
-                m_s_patch_stash.find_patch_index(k_patch);
+                // Case 1, early exit, i.e., if the owner patch is not in p's
+                // patch stash, then the element is for sure not in p at all

-            // Could be either INVALID8 or the actual patch stash ID in p's
-            // patch stash
-            s_correspondence_stash[b] = p_stash_id;
+                // Could be either INVALID8 or the actual patch stash ID in p's
+                // patch stash
+                s_correspondence_stash[b] = p_stash_id;

-            if (p_stash_id == INVALID8) {
-                // if this patch is not in p, then reset this correspondence
-                // entry
-                s_correspondence[b] = INVALID16;
+                if (p_stash_id == INVALID8) {
+                    // if this patch is not in p, then reset this correspondence
+                    // entry
+                    s = INVALID16;
+                } else {
+                    // otherwise, mark this element by setting its highest bit
+                    s |= (1 << 15);
+                }
             }
+
+            s_correspondence[b] = s;
         }
     }
     block.sync();
@@ -3807,14 +3803,15 @@ CavityManager::populate_correspondence(
     // Now, in s_correspondence, there are some elements that are stored where
     // we have the local index in the owner patch and the patch stash in p.
     // some of these elements have a corresponding copy in p and we want to
-    // replace thier entries with the local index in p. Of course these are
+    // replace their entries with the local index in p. Of course these are
     // not-owned elements in p. So, there are different way of doing this.
-    // 1) outer for loop on the correspondance, and inner loop on p's table and
+    // 1) outer for loop on the correspondence, and inner loop on p's table and
     // p's table stash
-    // 2) outer loop on p's not-owned elements and find the LPPair, then inner
-    // for loop in the correspondance
+    // 2) outer loop on p's not-owned elements and find the LPPair, then inner
+    // for loop in the correspondence
     // We go with 2) since we have "probably" have smaller number of
-    // correspondance to search through than number of not-owned elements
+    // correspondence to search through than number of not-owned elements
+
     const int num_elements = get_num_elements();
@@ -3845,7 +3842,7 @@ CavityManager::populate_correspondence(

     block.sync();

-    // finally reset the correspondance element where their high bit is set to
+    // finally reset the correspondence element where their high bit is set to
     // one since these elements are not-owned in q but they don't appear in p
     for (int b = threadIdx.x; b < q_num_elements; b += blockThreads) {
         // if the highest bit is set
@@ -3858,6 +3855,31 @@ CavityManager::populate_correspondence(
 }

+
+template
+__device__ __inline__ void
+CavityManager::build_patch_stash_mapper(
+    cooperative_groups::thread_block& block,
+    const PatchInfo&                  q_patch_info)
+{
+    // build patch stash mapper that maps q's patch stash index to p's patch
+    // stash index
+    for (int q_stash_id = threadIdx.x; q_stash_id < PatchStash::stash_size;
+         q_stash_id += blockThreads) {

+        const uint32_t k_patch = q_patch_info.patch_stash.get_patch(q_stash_id);
+
+        if (k_patch == patch_id()) {
+            m_s_patch_stash_mapper[q_stash_id] = INVALID8 - 1u;
+        } else if (k_patch != INVALID32) {
+            const uint8_t p_stash_id =
+                m_s_patch_stash.find_patch_index(k_patch);
+            m_s_patch_stash_mapper[q_stash_id] = p_stash_id;
+        } else {
+            m_s_patch_stash_mapper[q_stash_id] = INVALID8;
+        }
+    }
+}
+
 template
 template
 __device__ __inline__ bool
 CavityManager::ensure_ownership(
diff --git a/include/rxmesh/patch_lock.h b/include/rxmesh/patch_lock.h
index 9925e789..c5f6882a 100644
--- a/include/rxmesh/patch_lock.h
+++ b/include/rxmesh/patch_lock.h
@@ -10,12 +10,12 @@ namespace rxmesh {
  */
 struct PatchLock
 {
-    __device__ __host__ PatchLock() : lock(nullptr), spin(nullptr){};
-    __device__ __host__ PatchLock(const PatchLock& other) = default;
-    __device__ __host__ PatchLock(PatchLock&&)            = default;
+    __device__ __host__ PatchLock() : lock(nullptr), spin(nullptr) {};
+    __device__ __host__ PatchLock(const PatchLock& other)      = default;
+    __device__ __host__ PatchLock(PatchLock&&)                 = default;
     __device__ __host__ PatchLock& operator=(const PatchLock&) = default;
-    __device__ __host__ PatchLock& operator=(PatchLock&&) = default;
-    __device__ __host__ ~PatchLock() = default;
+    __device__ __host__ PatchLock& operator=(PatchLock&&)      = default;
+    __device__ __host__ ~PatchLock()                           = default;


     /**
@@ -58,7 +58,7 @@ struct PatchLock
     /**
      * @brief check if the patch is locked
      */
-    __device__ bool is_locked()
+    __device__ bool is_locked() const
     {
 #ifdef __CUDA_ARCH__
         return atomic_read(lock) == LOCKED;
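
For readers who want the mapper idea in isolation, the sketch below rebuilds the translation table outside of RXMesh. It uses plain arrays and hypothetical names (build_mapper, find_index, q_stash, p_stash, P_ID) instead of the real PatchStash/CavityManager API, and it assumes the all-ones sentinels for INVALID8/INVALID32. It illustrates the convention the patch introduces (INVALID8 - 1 means "the patch is p itself", INVALID8 means "empty slot or not a neighbor of p"); it is not the library's implementation and runs on the host as ordinary C++.

#include <cstdint>
#include <cstdio>

// Simplified, hypothetical stand-ins for RXMesh's patch stash (illustration only).
constexpr int      STASH_SIZE = 8;
constexpr uint8_t  INVALID8   = 0xFF;        // assumed sentinel
constexpr uint32_t INVALID32  = 0xFFFFFFFF;  // assumed sentinel

// Linear scan over p's stash, mirroring what find_patch_index is assumed to do.
uint8_t find_index(const uint32_t p_stash[STASH_SIZE], uint32_t patch)
{
    for (int i = 0; i < STASH_SIZE; ++i) {
        if (p_stash[i] == patch) {
            return static_cast<uint8_t>(i);
        }
    }
    return INVALID8;
}

// Build the q -> p stash mapper following the convention in the patch:
//   INVALID8 - 1 : this q-stash slot refers to p itself
//   INVALID8     : this q-stash slot is empty, or the patch is not in p's stash
//   otherwise    : index of that patch inside p's stash
void build_mapper(uint32_t       p_id,
                  const uint32_t q_stash[STASH_SIZE],
                  const uint32_t p_stash[STASH_SIZE],
                  uint8_t        mapper[STASH_SIZE])
{
    for (int q_slot = 0; q_slot < STASH_SIZE; ++q_slot) {
        const uint32_t k_patch = q_stash[q_slot];
        if (k_patch == p_id) {
            mapper[q_slot] = INVALID8 - 1u;
        } else if (k_patch != INVALID32) {
            mapper[q_slot] = find_index(p_stash, k_patch);
        } else {
            mapper[q_slot] = INVALID8;
        }
    }
}

int main()
{
    const uint32_t P_ID = 7;  // hypothetical id of patch p

    // q's neighbors: p itself (7), patch 3 (also a neighbor of p), patch 9 (unknown to p).
    const uint32_t q_stash[STASH_SIZE] = {
        7, 3, 9, INVALID32, INVALID32, INVALID32, INVALID32, INVALID32};
    // p's neighbors.
    const uint32_t p_stash[STASH_SIZE] = {
        2, 3, 5, INVALID32, INVALID32, INVALID32, INVALID32, INVALID32};

    uint8_t mapper[STASH_SIZE];
    build_mapper(P_ID, q_stash, p_stash, mapper);

    // Expected: slot 0 -> 254 (p itself), slot 1 -> 1 (patch 3 sits at p_stash[1]),
    // slot 2 -> 255 (patch 9 is not a neighbor of p), remaining slots -> 255.
    for (int i = 0; i < STASH_SIZE; ++i) {
        printf("q stash slot %d -> %d\n", i, static_cast<int>(mapper[i]));
    }
    return 0;
}

Building the table once per block is the point of the patch: the correspondence loops can then translate each not-owned element's owner with a single array lookup instead of calling find_patch_index per element.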