fix patch slicing
Ahdhn committed Jan 3, 2025
1 parent 4712126 commit c0fb5b9
Showing 4 changed files with 230 additions and 267 deletions.
68 changes: 42 additions & 26 deletions include/rxmesh/kernels/rxmesh_queries.cuh
@@ -30,24 +30,36 @@ __device__ __forceinline__ void block_mat_transpose(
fill_n<blockThreads>(temp_local, num_cols, uint16_t(0));
__syncthreads();

const uint32_t half_nnz = DIVIDE_UP(nnz, 2);
const uint32_t* mat_32 = reinterpret_cast<const uint32_t*>(mat);
for (int i = threadIdx.x; i < half_nnz; i += blockThreads) {
const uint32_t c = mat_32[i];

uint16_t c0 = detail::extract_low_bits<16>(c);
uint16_t c1 = detail::extract_high_bits<16>(c);

c0 = c0 >> shift;
// const uint32_t half_nnz = DIVIDE_UP(nnz, 2);
// const uint32_t* mat_32 = reinterpret_cast<const uint32_t*>(mat);
// for (int i = threadIdx.x; i < half_nnz; i += blockThreads) {
// const uint32_t c = mat_32[i];
//
// uint16_t c0 = detail::extract_low_bits<16>(c);
// uint16_t c1 = detail::extract_high_bits<16>(c);
//
// c0 = c0 >> shift;
//
// assert(c0 < num_cols);
//
// atomicAdd(temp_size + c0, 1u);
//
// if (i * 2 + 1 < nnz) {
// c1 = c1 >> shift;
// assert(c1 < num_cols);
// atomicAdd(temp_size + c1, 1u);
// }
// }

assert(c0 < num_cols);
for (int i = threadIdx.x; i < nnz; i += blockThreads) {
const uint32_t r = uint16_t(i) / rowOffset;
if (!is_deleted(r, row_active_mask)) {
uint32_t c = mat[i];

atomicAdd(temp_size + c0, 1u);
c = c >> shift;

if (i * 2 + 1 < nnz) {
c1 = c1 >> shift;
assert(c1 < num_cols);
atomicAdd(temp_size + c1, 1u);
assert(c < num_cols);
atomicAdd(temp_size + c, 1u);
}
}

@@ -57,28 +69,32 @@ __device__ __forceinline__ void block_mat_transpose(


for (int i = threadIdx.x; i < nnz; i += blockThreads) {
uint16_t col_id = mat[i];
const uint16_t row_id = uint16_t(i) / rowOffset;

col_id = col_id >> shift;
if (!is_deleted(row_id, row_active_mask)) {
uint16_t col_id = mat[i];

assert(col_id < num_cols);
col_id = col_id >> shift;

const uint16_t local_id = atomicAdd(temp_local + col_id, 1u);
assert(col_id < num_cols);

const uint16_t prefix = temp_size[col_id];
const uint16_t local_id = atomicAdd(temp_local + col_id, 1u);

assert(local_id < temp_size[col_id + 1] - temp_size[col_id]);
assert(local_id < nnz);
const uint16_t prefix = temp_size[col_id];

const uint16_t row_id = uint16_t(i) / rowOffset;
assert(local_id < temp_size[col_id + 1] - temp_size[col_id]);
assert(local_id < nnz);

assert(row_id < num_rows);
output[local_id + prefix] = row_id;

assert(row_id < num_rows);

output[local_id + prefix] = row_id;
}
}

__syncthreads();

assert(temp_size[num_cols] == nnz);
//assert(temp_size[num_cols] == nnz);
for (int i = threadIdx.x; i < num_cols + 1; i += blockThreads) {
mat[i] = temp_size[i];
}
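
The reworked loops above follow the standard two-pass transpose recipe: build a per-column histogram, prefix-sum the counts into offsets, then scatter each entry's row id into its column bucket, with the new is_deleted(row) test applied in both passes so entries that belong to deleted rows never reach the counters or the output. Below is a minimal standalone sketch of that pattern; it is not RXMesh code, the names are made up, the >> shift step applied to the stored ids is omitted, the counters are uint32_t so the stock atomicAdd overload applies, and the prefix sum runs on a single thread for clarity.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Single-block transpose of a matrix stored as rowOffset column ids per row
// (e.g. 3 vertex ids per face) into row ids grouped by column, skipping
// entries whose owning row is flagged as deleted.
__global__ void transpose_sketch(const uint16_t* mat,       // nnz column ids
                                 const uint8_t*  row_dead,  // 1 if the row is deleted
                                 uint32_t*       col_start, // num_cols + 1, zero-initialized
                                 uint32_t*       col_local, // num_cols, zero-initialized
                                 uint16_t*       output,    // nnz row ids grouped by column
                                 int nnz, int num_cols, int rowOffset)
{
    // Pass 1: histogram of surviving entries per column.
    for (int i = threadIdx.x; i < nnz; i += blockDim.x) {
        const int r = i / rowOffset;
        if (!row_dead[r]) {
            atomicAdd(&col_start[mat[i] + 1], 1u);
        }
    }
    __syncthreads();

    // Exclusive prefix sum over the per-column counts (serial for clarity).
    if (threadIdx.x == 0) {
        for (int c = 0; c < num_cols; ++c) {
            col_start[c + 1] += col_start[c];
        }
    }
    __syncthreads();

    // Pass 2: scatter each surviving entry's row id into its column bucket.
    for (int i = threadIdx.x; i < nnz; i += blockDim.x) {
        const int r = i / rowOffset;
        if (!row_dead[r]) {
            const uint16_t c    = mat[i];
            const uint32_t slot = atomicAdd(&col_local[c], 1u);
            output[col_start[c] + slot] = static_cast<uint16_t>(r);
        }
    }
}

int main()
{
    // Toy input: 3 rows of rowOffset = 2 column ids each; row 1 is deleted.
    const int      nnz = 6, num_cols = 4, rowOffset = 2;
    const uint16_t h_mat[6]  = {0, 1, 2, 3, 1, 3};
    const uint8_t  h_dead[3] = {0, 1, 0};

    uint16_t *d_mat, *d_out;
    uint8_t*  d_dead;
    uint32_t *d_start, *d_local;
    cudaMalloc(&d_mat, sizeof(h_mat));
    cudaMalloc(&d_out, sizeof(h_mat));
    cudaMalloc(&d_dead, sizeof(h_dead));
    cudaMalloc(&d_start, (num_cols + 1) * sizeof(uint32_t));
    cudaMalloc(&d_local, num_cols * sizeof(uint32_t));
    cudaMemcpy(d_mat, h_mat, sizeof(h_mat), cudaMemcpyHostToDevice);
    cudaMemcpy(d_dead, h_dead, sizeof(h_dead), cudaMemcpyHostToDevice);
    cudaMemset(d_start, 0, (num_cols + 1) * sizeof(uint32_t));
    cudaMemset(d_local, 0, num_cols * sizeof(uint32_t));

    transpose_sketch<<<1, 128>>>(d_mat, d_dead, d_start, d_local, d_out,
                                 nnz, num_cols, rowOffset);

    uint32_t h_start[5];
    cudaMemcpy(h_start, d_start, sizeof(h_start), cudaMemcpyDeviceToHost);
    printf("column offsets: %u %u %u %u %u\n",  // expected: 0 1 3 3 4
           h_start[0], h_start[1], h_start[2], h_start[3], h_start[4]);
    return 0;
}

The real kernel reuses its temp_size and temp_local scratch arrays and writes the column offsets back into mat at the end; what this commit changes is the deleted-row filtering in both passes and the retirement of the packed two-ids-per-32-bit read, which now survives only as the commented-out block. That filtering is also consistent with the temp_size[num_cols] == nnz assert being commented out, since deleted entries no longer contribute to the total.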
187 changes: 110 additions & 77 deletions include/rxmesh/rxmesh_dynamic.cu
@@ -146,9 +146,14 @@ __global__ static void hashtable_calibration(const Context context)
return;
}

PatchInfo pi = context.m_patches_info[pid];
if (pi.patch_id == INVALID32) {
return;
}

ShmemMutex patch_stash_mutex;
patch_stash_mutex.alloc();
PatchInfo pi = context.m_patches_info[pid];


hashtable_calibration<blockThreads, VertexHandle>(
context, pi, patch_stash_mutex);
@@ -429,6 +434,10 @@ __global__ static void remove_surplus_elements(Context context)

PatchInfo pi = context.m_patches_info[pid];

if (pi.patch_id == INVALID32) {
return;
}

context.m_patches_info[pid].child_id = INVALID32;

const uint16_t num_vertices = pi.num_vertices[0];
@@ -2131,6 +2140,22 @@ __global__ static void check_ribbon_faces(const Context context,
}
}


__global__ static void reset(uint32_t* v,
uint32_t* e,
uint32_t* f,
uint32_t* max_v,
uint32_t* max_e,
uint32_t* max_f)
{
v[0] = 0;
e[0] = 0;
f[0] = 0;
max_v[0] = 0;
max_e[0] = 0;
max_f[0] = 0;
}

} // namespace detail
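
The new detail::reset kernel above zeroes all six global counters from a single one-thread launch. A small self-contained sketch of that pattern with hypothetical names (not the RXMesh API): one tiny kernel replaces a string of cudaMemset calls, and a dependent kernel queued on the same stream sees the zeroed counters without any host-side synchronization.

#include <cstdint>
#include <cuda_runtime.h>

__global__ void reset_counters(uint32_t* num_v, uint32_t* num_e, uint32_t* num_f)
{
    *num_v = 0;  // one thread is plenty for a handful of scalars
    *num_e = 0;
    *num_f = 0;
}

__global__ void recount(uint32_t* num_v, uint32_t* num_e, uint32_t* num_f)
{
    // Stand-in for a kernel like remove_surplus_elements: each block
    // contributes its surviving element counts via atomics.
    if (threadIdx.x == 0) {
        atomicAdd(num_v, 4u);
        atomicAdd(num_e, 6u);
        atomicAdd(num_f, 3u);
    }
}

int main()
{
    uint32_t* counters = nullptr;  // packed as [num_v, num_e, num_f]
    cudaMalloc(&counters, 3 * sizeof(uint32_t));

    // Both launches queue on the default stream in order; the host never
    // blocks between them.
    reset_counters<<<1, 1>>>(counters, counters + 1, counters + 2);
    recount<<<8, 128>>>(counters, counters + 1, counters + 2);

    cudaDeviceSynchronize();
    cudaFree(counters);
    return 0;
}

Because the two launches are ordered on the stream, no intermediate device-to-host read is needed, which appears to be why the cudaMemcpy/cudaMemset sequence in cleanup() further down is commented out rather than kept.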


@@ -2607,51 +2632,60 @@ bool RXMeshDynamic::validate()

void RXMeshDynamic::cleanup()
{
CUDA_ERROR(cudaMemcpy(&m_num_patches,
m_rxmesh_context.m_num_patches,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));
// CUDA_ERROR(cudaMemcpy(&m_num_patches,
// m_rxmesh_context.m_num_patches,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));

constexpr uint32_t block_size = 256;
const uint32_t grid_size = get_num_patches();

CUDA_ERROR(cudaMemcpy(&this->m_max_vertices_per_patch,
this->m_rxmesh_context.m_max_num_vertices,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));

CUDA_ERROR(cudaMemcpy(&this->m_max_edges_per_patch,
this->m_rxmesh_context.m_max_num_edges,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));

CUDA_ERROR(cudaMemcpy(&this->m_max_faces_per_patch,
this->m_rxmesh_context.m_max_num_faces,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));

CUDA_ERROR(
cudaMemset(m_rxmesh_context.m_num_vertices, 0, sizeof(uint32_t)));
CUDA_ERROR(cudaMemset(m_rxmesh_context.m_num_edges, 0, sizeof(uint32_t)));
CUDA_ERROR(cudaMemset(m_rxmesh_context.m_num_faces, 0, sizeof(uint32_t)));

CUDA_ERROR(
cudaMemset(m_rxmesh_context.m_max_num_vertices, 0, sizeof(uint32_t)));
CUDA_ERROR(
cudaMemset(m_rxmesh_context.m_max_num_edges, 0, sizeof(uint32_t)));
CUDA_ERROR(
cudaMemset(m_rxmesh_context.m_max_num_faces, 0, sizeof(uint32_t)));
const uint32_t grid_size = get_max_num_patches();

// CUDA_ERROR(cudaMemcpy(&this->m_max_vertices_per_patch,
// this->m_rxmesh_context.m_max_num_vertices,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));
//
// CUDA_ERROR(cudaMemcpy(&this->m_max_edges_per_patch,
// this->m_rxmesh_context.m_max_num_edges,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));
//
// CUDA_ERROR(cudaMemcpy(&this->m_max_faces_per_patch,
// this->m_rxmesh_context.m_max_num_faces,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));

// CUDA_ERROR(
// cudaMemset(m_rxmesh_context.m_num_vertices, 0, sizeof(uint32_t)));
// CUDA_ERROR(cudaMemset(m_rxmesh_context.m_num_edges, 0,
// sizeof(uint32_t))); CUDA_ERROR(cudaMemset(m_rxmesh_context.m_num_faces,
// 0, sizeof(uint32_t)));
//
// CUDA_ERROR(
// cudaMemset(m_rxmesh_context.m_max_num_vertices, 0,
// sizeof(uint32_t)));
// CUDA_ERROR(
// cudaMemset(m_rxmesh_context.m_max_num_edges, 0, sizeof(uint32_t)));
// CUDA_ERROR(
// cudaMemset(m_rxmesh_context.m_max_num_faces, 0, sizeof(uint32_t)));

detail::reset<<<1, 1>>>(m_rxmesh_context.m_num_vertices,
m_rxmesh_context.m_num_edges,
m_rxmesh_context.m_num_faces,
m_rxmesh_context.m_max_num_vertices,
m_rxmesh_context.m_max_num_edges,
m_rxmesh_context.m_max_num_faces);

uint32_t dyn_shmem = 0;

dyn_shmem += 3 * detail::mask_num_bytes(this->m_max_vertices_per_patch) +
3 * ShmemAllocator::default_alignment;
dyn_shmem +=
3 * detail::mask_num_bytes(get_per_patch_max_vertex_capacity()) +
3 * ShmemAllocator::default_alignment;

dyn_shmem += 3 * detail::mask_num_bytes(this->m_max_edges_per_patch) +
dyn_shmem += 3 * detail::mask_num_bytes(get_per_patch_max_edge_capacity()) +
3 * ShmemAllocator::default_alignment;

dyn_shmem += 3 * detail::mask_num_bytes(this->m_max_faces_per_patch) +
dyn_shmem += 3 * detail::mask_num_bytes(get_per_patch_max_face_capacity()) +
3 * ShmemAllocator::default_alignment;

uint32_t hash_table_shmem =
@@ -2664,8 +2698,8 @@ void RXMeshDynamic::cleanup()

uint32_t connect_shmem =
2 * ShmemAllocator::default_alignment +
(3 * this->m_max_faces_per_patch) * sizeof(uint16_t) +
(2 * this->m_max_edges_per_patch) * sizeof(uint16_t);
(3 * get_per_patch_max_face_capacity()) * sizeof(uint16_t) +
(2 * get_per_patch_max_edge_capacity()) * sizeof(uint16_t);

dyn_shmem += std::max(hash_table_shmem, connect_shmem);

Expand All @@ -2675,20 +2709,20 @@ void RXMeshDynamic::cleanup()
detail::remove_surplus_elements<block_size>
<<<grid_size, block_size, dyn_shmem>>>(this->m_rxmesh_context);

CUDA_ERROR(cudaMemcpy(&this->m_max_vertices_per_patch,
this->m_rxmesh_context.m_max_num_vertices,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));

CUDA_ERROR(cudaMemcpy(&this->m_max_edges_per_patch,
this->m_rxmesh_context.m_max_num_edges,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));

CUDA_ERROR(cudaMemcpy(&this->m_max_faces_per_patch,
this->m_rxmesh_context.m_max_num_faces,
sizeof(uint32_t),
cudaMemcpyDeviceToHost));
// CUDA_ERROR(cudaMemcpy(&this->m_max_vertices_per_patch,
// this->m_rxmesh_context.m_max_num_vertices,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));
//
// CUDA_ERROR(cudaMemcpy(&this->m_max_edges_per_patch,
// this->m_rxmesh_context.m_max_num_edges,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));
//
// CUDA_ERROR(cudaMemcpy(&this->m_max_faces_per_patch,
// this->m_rxmesh_context.m_max_num_faces,
// sizeof(uint32_t),
// cudaMemcpyDeviceToHost));
}
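
With the per-patch maxima no longer copied back to the host, cleanup() above sizes its dynamic shared memory from the fixed per-patch capacities (get_per_patch_max_vertex/edge/face_capacity()) instead of the cached m_max_*_per_patch values. A rough, hypothetical back-of-the-envelope for that budget, assuming mask_num_bytes(n) rounds an n-bit bitmask up to whole 32-bit words (that helper's definition is not part of this diff) and using made-up capacities:

#include <cstdint>
#include <cstdio>

// Stand-in for detail::mask_num_bytes: bytes for an n-bit bitmask, rounded
// up to whole 32-bit words (assumed semantics, not the RXMesh source).
constexpr uint32_t mask_bytes(uint32_t bits) { return ((bits + 31) / 32) * 4; }

int main()
{
    constexpr uint32_t v_cap = 512, e_cap = 1024, f_cap = 512;  // made-up per-patch capacities
    constexpr uint32_t align = 8;  // stand-in for ShmemAllocator::default_alignment

    uint32_t dyn_shmem = 0;
    dyn_shmem += 3 * mask_bytes(v_cap) + 3 * align;  // three vertex bitmasks
    dyn_shmem += 3 * mask_bytes(e_cap) + 3 * align;  // three edge bitmasks
    dyn_shmem += 3 * mask_bytes(f_cap) + 3 * align;  // three face bitmasks

    // Connectivity scratch: 3 uint16_t per face and 2 per edge, presumably
    // face-edge and edge-vertex incidence.
    const uint32_t connect_shmem = 2 * align +
                                   3 * f_cap * sizeof(uint16_t) +
                                   2 * e_cap * sizeof(uint16_t);

    // cleanup() takes the larger of the hash-table and connectivity scratch;
    // only the connectivity side is sketched here.
    printf("mask bytes = %u, connectivity bytes = %u, total >= %u\n",
           dyn_shmem, connect_shmem, dyn_shmem + connect_shmem);
    return 0;
}

Since the capacities are upper bounds on the live element counts, the launch may request slightly more shared memory than the post-cleanup patches strictly need, in exchange for skipping the blocking device-to-host copies the old code performed.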

void RXMeshDynamic::update_host()
Expand Down Expand Up @@ -2949,30 +2983,29 @@ void RXMeshDynamic::update_polyscope(std::string new_name)
}


template __device__ void detail::slice<256>(
Context&,
cooperative_groups::thread_block&,
PatchInfo&,
const uint32_t,
const uint16_t,
const uint16_t,
const uint16_t,
PatchStash&,
// PatchStash&,
Bitmask&,
Bitmask&,
Bitmask&,
const Bitmask&,
const Bitmask&,
const Bitmask&,
const uint16_t*,
const uint16_t*,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&);
template __device__ void detail::slice<256>(Context&,
cooperative_groups::thread_block&,
PatchInfo&,
const uint32_t,
const uint16_t,
const uint16_t,
const uint16_t,
PatchStash&,
// PatchStash&,
Bitmask&,
Bitmask&,
Bitmask&,
const Bitmask&,
const Bitmask&,
const Bitmask&,
const uint16_t*,
const uint16_t*,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&,
Bitmask&);

template __device__ void detail::bi_assignment<256>(
cooperative_groups::thread_block&,