Skip to content

Commit

Permalink
fix type mismatch and convert real valuetype to device (for __half)
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Feb 8, 2023
1 parent 89b8cec commit 37f50b4
Show file tree
Hide file tree
Showing 14 changed files with 48 additions and 43 deletions.
3 changes: 1 addition & 2 deletions common/cuda_hip/matrix/fbcsr_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -294,8 +294,7 @@ void fallback_transpose(const std::shared_ptr<const DefaultExecutor> exec,
thrust::device_pointer_cast(out_row_idxs.get_data()),
thrust::device_pointer_cast(out_col_idxs),
thrust::device_pointer_cast(permutation.get_data())));
using tuple_type =
thrust::tuple<IndexType, IndexType, device_type<ValueType>>;
using tuple_type = thrust::tuple<IndexType, IndexType, IndexType>;
thrust::sort(thrust::device, zip_it, zip_it + nnzb,
[] __device__(const tuple_type& a, const tuple_type& b) {
return thrust::tie(thrust::get<0>(a), thrust::get<1>(a)) <
Expand Down
11 changes: 7 additions & 4 deletions common/unified/multigrid/pgm_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,9 @@ void find_strongest_neighbor(
exec,
[] GKO_KERNEL(auto row, auto row_ptrs, auto col_idxs, auto weight_vals,
auto diag, auto agg, auto strongest_neighbor) {
auto max_weight_unagg = zero<ValueType>();
auto max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
auto max_weight_unagg = zero<value_type>();
auto max_weight_agg = zero<value_type>();
IndexType strongest_unagg = -1;
IndexType strongest_agg = -1;
if (agg[row] != -1) {
Expand Down Expand Up @@ -267,7 +268,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
if (agg_val[row] != -1) {
return;
}
ValueType max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
value_type max_weight_agg = zero<value_type>();
IndexType strongest_agg = -1;
for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
auto col = col_idxs[idx];
Expand Down Expand Up @@ -304,7 +306,8 @@ void assign_to_exist_agg(std::shared_ptr<const DefaultExecutor> exec,
if (agg_val[row] != -1) {
return;
}
ValueType max_weight_agg = zero<ValueType>();
using value_type = device_type<ValueType>;
value_type max_weight_agg = zero<value_type>();
IndexType strongest_agg = -1;
for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; idx++) {
auto col = col_idxs[idx];
Expand Down
8 changes: 4 additions & 4 deletions cuda/factorization/par_ilut_filter_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
if (num_blocks > 0) {
kernel::threshold_filter_nnz<subwarp_size>
<<<num_blocks, default_block_size>>>(
old_row_ptrs, as_cuda_type(old_vals), num_rows, threshold,
new_row_ptrs, lower);
old_row_ptrs, as_cuda_type(old_vals), num_rows,
as_cuda_type(threshold), new_row_ptrs, lower);
}

// build row pointers
Expand Down Expand Up @@ -126,8 +126,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
kernel::threshold_filter<subwarp_size>
<<<num_blocks, default_block_size>>>(
old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), num_rows,
threshold, new_row_ptrs, new_row_idxs, new_col_idxs,
as_cuda_type(new_vals), lower);
as_cuda_type(threshold), new_row_ptrs, new_row_idxs,
new_col_idxs, as_cuda_type(new_vals), lower);
}
}

Expand Down
6 changes: 3 additions & 3 deletions cuda/factorization/par_ilut_select_common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,12 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size,
tree);
as_cuda_type(tree));
// determine bucket sizes
if (num_blocks > 0) {
kernel::count_buckets<<<num_blocks, default_block_size>>>(
as_cuda_type(values), size, tree, partial_counts, oracles,
items_per_thread);
as_cuda_type(values), size, as_cuda_type(tree), partial_counts,
oracles, items_per_thread);
}
// compute prefix sum and total sum over block-local values
kernel::block_prefix_sum<<<bucket_count, default_block_size>>>(
Expand Down
6 changes: 3 additions & 3 deletions cuda/factorization/par_ilut_select_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ void sampleselect_filter(const ValueType* values, IndexType size,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
if (num_blocks > 0) {
kernel::filter_bucket<<<num_blocks, default_block_size>>>(
as_cuda_type(values), size, bucket, oracles, partial_counts, out,
items_per_thread);
as_cuda_type(values), size, bucket, oracles, partial_counts,
as_cuda_type(out), items_per_thread);
}
}

Expand Down Expand Up @@ -172,7 +172,7 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
// base case
auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
kernel::basecase_select<<<1, kernel::basecase_block_size>>>(
tmp22, bucket.size, rank, out_ptr);
as_cuda_type(tmp22), bucket.size, rank, as_cuda_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);
}

Expand Down
2 changes: 1 addition & 1 deletion cuda/solver/idr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ void compute_omega(
{
const auto grid_dim = ceildiv(nrhs, config::warp_size);
compute_omega_kernel<<<grid_dim, config::warp_size>>>(
nrhs, kappa, as_cuda_type(tht->get_const_values()),
nrhs, as_cuda_type(kappa), as_cuda_type(tht->get_const_values()),
as_cuda_type(residual_norm->get_const_values()),
as_cuda_type(omega->get_values()),
as_cuda_type(stop_status->get_const_data()));
Expand Down
2 changes: 1 addition & 1 deletion cuda/solver/multigrid_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
if (grid > 0) {
kernel::kcycle_check_stop_kernel<<<grid, default_block_size>>>(
nrhs, as_cuda_type(old_norm->get_const_values()),
as_cuda_type(new_norm->get_const_values()), rel_tol,
as_cuda_type(new_norm->get_const_values()), as_cuda_type(rel_tol),
as_cuda_type(dis_stop.get_data()));
}
is_stop = exec->copy_val_to_host(dis_stop.get_const_data());
Expand Down
4 changes: 2 additions & 2 deletions cuda/stop/residual_norm_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ void residual_norm(std::shared_ptr<const CudaExecutor> exec,

if (grid_size > 0) {
residual_norm_kernel<<<grid_size, block_size>>>(
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_cuda_type(rel_residual_goal),
as_cuda_type(tau->get_const_values()),
as_cuda_type(orig_tau->get_const_values()), stoppingId,
setFinalized, as_cuda_type(stop_status->get_data()),
Expand Down Expand Up @@ -185,7 +185,7 @@ void implicit_residual_norm(

if (grid_size > 0) {
implicit_residual_norm_kernel<<<grid_size, block_size>>>(
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_cuda_type(rel_residual_goal),
as_cuda_type(tau->get_const_values()),
as_cuda_type(orig_tau->get_const_values()), stoppingId,
setFinalized, as_cuda_type(stop_status->get_data()),
Expand Down
8 changes: 5 additions & 3 deletions hip/factorization/par_ilut_filter_kernel.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
hipLaunchKernelGGL(
HIP_KERNEL_NAME(kernel::threshold_filter_nnz<subwarp_size>),
num_blocks, default_block_size, 0, 0, old_row_ptrs,
as_hip_type(old_vals), num_rows, threshold, new_row_ptrs, lower);
as_hip_type(old_vals), num_rows, as_hip_type(threshold),
new_row_ptrs, lower);
}

// build row pointers
Expand Down Expand Up @@ -129,8 +130,9 @@ void threshold_filter(syn::value_list<int, subwarp_size>,
hipLaunchKernelGGL(
HIP_KERNEL_NAME(kernel::threshold_filter<subwarp_size>), num_blocks,
default_block_size, 0, 0, old_row_ptrs, old_col_idxs,
as_hip_type(old_vals), num_rows, threshold, new_row_ptrs,
new_row_idxs, new_col_idxs, as_hip_type(new_vals), lower);
as_hip_type(old_vals), num_rows, as_hip_type(threshold),
new_row_ptrs, new_row_idxs, new_col_idxs, as_hip_type(new_vals),
lower);
}
}

Expand Down
6 changes: 4 additions & 2 deletions hip/factorization/par_ilut_select_common.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,14 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::build_searchtree), 1,
bucket_count, 0, 0, as_hip_type(values), size, tree);
bucket_count, 0, 0, as_hip_type(values), size,
as_hip_type(tree));
// determine bucket sizes
if (num_blocks > 0) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::count_buckets), num_blocks,
default_block_size, 0, 0, as_hip_type(values), size,
tree, partial_counts, oracles, items_per_thread);
as_hip_type(tree), partial_counts, oracles,
items_per_thread);
}
// compute prefix sum and total sum over block-local values
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::block_prefix_sum), bucket_count,
Expand Down
6 changes: 3 additions & 3 deletions hip/factorization/par_ilut_select_kernel.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void sampleselect_filter(const ValueType* values, IndexType size,
if (num_blocks > 0) {
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::filter_bucket), num_blocks,
default_block_size, 0, 0, as_hip_type(values), size,
bucket, oracles, partial_counts, out,
bucket, oracles, partial_counts, as_hip_type(out),
items_per_thread);
}
}
Expand Down Expand Up @@ -176,8 +176,8 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,
// base case
auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel::basecase_select), 1,
kernel::basecase_block_size, 0, 0, tmp22, bucket.size,
rank, out_ptr);
kernel::basecase_block_size, 0, 0, as_hip_type(tmp22),
bucket.size, rank, as_hip_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);
}

Expand Down
2 changes: 1 addition & 1 deletion hip/solver/idr_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,7 @@ void compute_omega(
{
const auto grid_dim = ceildiv(nrhs, config::warp_size);
hipLaunchKernelGGL(HIP_KERNEL_NAME(compute_omega_kernel), grid_dim,
config::warp_size, 0, 0, nrhs, kappa,
config::warp_size, 0, 0, nrhs, as_hip_type(kappa),
as_hip_type(tht->get_const_values()),
as_hip_type(residual_norm->get_const_values()),
as_hip_type(omega->get_values()),
Expand Down
10 changes: 5 additions & 5 deletions hip/solver/multigrid_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,11 @@ void kcycle_check_stop(std::shared_ptr<const DefaultExecutor> exec,
const auto nrhs = new_norm->get_size()[1];
const auto grid = ceildiv(nrhs, default_block_size);
if (grid > 0) {
hipLaunchKernelGGL(kernel::kcycle_check_stop_kernel, grid,
default_block_size, 0, 0, nrhs,
as_hip_type(old_norm->get_const_values()),
as_hip_type(new_norm->get_const_values()), rel_tol,
as_hip_type(dis_stop.get_data()));
hipLaunchKernelGGL(
kernel::kcycle_check_stop_kernel, grid, default_block_size, 0, 0,
nrhs, as_hip_type(old_norm->get_const_values()),
as_hip_type(new_norm->get_const_values()), as_hip_type(rel_tol),
as_hip_type(dis_stop.get_data()));
}
is_stop = exec->copy_val_to_host(dis_stop.get_const_data());
}
Expand Down
17 changes: 8 additions & 9 deletions hip/stop/residual_norm_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void residual_norm(std::shared_ptr<const HipExecutor> exec,

if (grid_size > 0) {
hipLaunchKernelGGL((residual_norm_kernel), grid_size, block_size, 0, 0,
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_hip_type(rel_residual_goal),
as_hip_type(tau->get_const_values()),
as_hip_type(orig_tau->get_const_values()),
stoppingId, setFinalized,
Expand Down Expand Up @@ -143,13 +143,12 @@ constexpr int default_block_size = 512;

template <typename ValueType>
__global__
__launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
size_type num_cols, remove_complex<ValueType> rel_residual_goal,
const ValueType* __restrict__ tau,
const remove_complex<ValueType>* __restrict__ orig_tau,
uint8 stoppingId, bool setFinalized,
stopping_status* __restrict__ stop_status,
bool* __restrict__ device_storage)
__launch_bounds__(default_block_size) void implicit_residual_norm_kernel(
size_type num_cols, remove_complex<ValueType> rel_residual_goal,
const ValueType* __restrict__ tau,
const remove_complex<ValueType>* __restrict__ orig_tau, uint8 stoppingId,
bool setFinalized, stopping_status* __restrict__ stop_status,
bool* __restrict__ device_storage)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < num_cols) {
Expand Down Expand Up @@ -192,7 +191,7 @@ void implicit_residual_norm(
if (grid_size > 0) {
hipLaunchKernelGGL(
(implicit_residual_norm_kernel), grid_size, block_size, 0, 0,
tau->get_size()[1], rel_residual_goal,
tau->get_size()[1], as_hip_type(rel_residual_goal),
as_hip_type(tau->get_const_values()),
as_hip_type(orig_tau->get_const_values()), stoppingId, setFinalized,
as_hip_type(stop_status->get_data()),
Expand Down

0 comments on commit 37f50b4

Please sign in to comment.