diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 5cfa55ca687..c16c80d2d6a 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -5,6 +5,7 @@ set(CUDA_HIP_SOURCES
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
     distributed/assembly_kernels.cpp
+    distributed/dd_matrix_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/common/cuda_hip/distributed/dd_matrix_kernels.cpp b/common/cuda_hip/distributed/dd_matrix_kernels.cpp
new file mode 100644
index 00000000000..711a858995a
--- /dev/null
+++ b/common/cuda_hip/distributed/dd_matrix_kernels.cpp
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_dd_matrix {
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void filter_non_owning_idxs(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* row_partition,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* col_partition,
+    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
+    array<GlobalIndexType>& non_local_col_idxs)
+{
+    auto input_vals = input.get_const_values();
+    auto row_part_ids = row_partition->get_part_ids();
+    auto col_part_ids = col_partition->get_part_ids();
+    const auto* row_range_bounds = row_partition->get_range_bounds();
+    const auto* col_range_bounds = col_partition->get_range_bounds();
+    const auto* row_range_starting_indices =
+        row_partition->get_range_starting_indices();
+    const auto* col_range_starting_indices =
+        col_partition->get_range_starting_indices();
+    const auto num_row_ranges = row_partition->get_num_ranges();
+    const auto num_col_ranges = col_partition->get_num_ranges();
+    const auto num_input_elements = input.get_num_stored_elements();
+
+    auto policy = thrust_policy(exec);
+
+    // precompute the row and column range id of each input element
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    array<size_type> row_range_ids{exec, num_input_elements};
+    thrust::upper_bound(policy, row_range_bounds + 1,
+                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
+                        input_row_idxs + num_input_elements,
+                        row_range_ids.get_data());
+    array<size_type> col_range_ids{exec, input.get_num_stored_elements()};
+    thrust::upper_bound(policy, col_range_bounds + 1,
+                        col_range_bounds + num_col_ranges + 1, input_col_idxs,
+                        input_col_idxs + num_input_elements,
+                        col_range_ids.get_data());
+
+    // count the number of non-local row and column indices
+    auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple(
+        row_range_ids.get_const_data(), col_range_ids.get_const_data()));
+    auto num_elements_pair = thrust::transform_reduce(
+        policy, range_ids_it, range_ids_it + num_input_elements,
+        [local_part, row_part_ids, col_part_ids] __host__ __device__(
+            const thrust::tuple<size_type, size_type>& tuple) {
+            auto row_part = row_part_ids[thrust::get<0>(tuple)];
+            auto col_part = col_part_ids[thrust::get<1>(tuple)];
+            bool is_local_row = row_part == local_part;
+            bool is_local_col = col_part == local_part;
+            return thrust::make_tuple(
+                is_local_row ? size_type{0} : size_type{1},
+                is_local_col ? size_type{0} : size_type{1});
+        },
+        thrust::make_tuple(size_type{}, size_type{}),
+        [] __host__ __device__(const thrust::tuple<size_type, size_type>& a,
+                               const thrust::tuple<size_type, size_type>& b) {
+            return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b),
+                                      thrust::get<1>(a) + thrust::get<1>(b));
+        });
+    auto n_non_local_row_idxs = thrust::get<0>(num_elements_pair);
+    auto n_non_local_col_idxs = thrust::get<1>(num_elements_pair);
+
+    // define global-to-local maps for row and column indices
+    auto map_to_local_row =
+        [row_range_bounds, row_range_starting_indices] __host__ __device__(
+            const GlobalIndexType row, const size_type range_id) {
+            return static_cast<LocalIndexType>(row -
+                                               row_range_bounds[range_id]) +
+                   row_range_starting_indices[range_id];
+        };
+    auto map_to_local_col =
+        [col_range_bounds, col_range_starting_indices] __host__ __device__(
+            const GlobalIndexType col, const size_type range_id) {
+            return static_cast<LocalIndexType>(col -
+                                               col_range_bounds[range_id]) +
+                   col_range_starting_indices[range_id];
+        };
+
+    non_local_col_idxs.resize_and_reset(n_non_local_col_idxs);
+    non_local_row_idxs.resize_and_reset(n_non_local_row_idxs);
+    thrust::copy_if(policy, input_col_idxs,
+                    input_col_idxs + num_input_elements, range_ids_it,
+                    non_local_col_idxs.get_data(),
+                    [local_part, col_part_ids] __host__ __device__(
+                        const thrust::tuple<size_type, size_type>& tuple) {
+                        auto col_part = col_part_ids[thrust::get<1>(tuple)];
+                        return col_part != local_part;
+                    });
+    thrust::copy_if(policy, input_row_idxs,
+                    input_row_idxs + num_input_elements, range_ids_it,
+                    non_local_row_idxs.get_data(),
+                    [local_part, row_part_ids] __host__ __device__(
+                        const thrust::tuple<size_type, size_type>& tuple) {
+                        auto row_part = row_part_ids[thrust::get<0>(tuple)];
+                        return row_part != local_part;
+                    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 7901edf5341..133e92af639 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -142,6 +142,7 @@ if(GINKGO_BUILD_MPI)
         distributed/vector_cache.cpp
         mpi/exception.cpp
         distributed/assembly.cpp
+        distributed/dd_matrix.cpp
         distributed/matrix.cpp
        distributed/partition_helpers.cpp
        distributed/vector.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 480bec0b278..a6b3edc1ec8 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -17,6 +17,7 @@
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/components/reduce_array_kernels.hpp"
 #include "core/distributed/assembly_kernels.hpp"
+#include "core/distributed/dd_matrix_kernels.hpp"
 #include "core/distributed/index_map_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
@@ -359,6 +360,15 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
 }  // namespace distributed_matrix
 
 
+namespace distributed_dd_matrix {
+
+
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+
+
 namespace batch_multi_vector {
diff --git a/core/distributed/dd_matrix.cpp b/core/distributed/dd_matrix.cpp
new file mode 100644
index 00000000000..8a28d8c1c93
--- /dev/null
+++ b/core/distributed/dd_matrix.cpp
@@ -0,0 +1,508 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "ginkgo/core/distributed/dd_matrix.hpp"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+namespace dd_matrix {
+namespace {
+
+
+GKO_REGISTER_OPERATION(filter_non_owning_idxs,
+                       distributed_dd_matrix::filter_non_owning_idxs);
+GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
+GKO_REGISTER_OPERATION(prefix_sum_nonnegative,
+                       components::prefix_sum_nonnegative);
+
+
+}  // namespace
+}  // namespace dd_matrix
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm)
+    : DdMatrix(exec, comm,
+               gko::matrix::Csr<ValueType, LocalIndexType>::create(exec))
+{}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm,
+    ptr_param<const LinOp> matrix_template)
+    : EnableLinOp<
+          DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>{exec},
+      DistributedBase{comm},
+      send_offsets_(comm.size() + 1),
+      send_sizes_(comm.size()),
+      recv_offsets_(comm.size() + 1),
+      recv_sizes_(comm.size()),
+      gather_idxs_{exec},
+      non_local_to_global_{exec},
+      one_scalar_{},
+      local_mtx_{matrix_template->clone(exec)}
+{
+    GKO_ASSERT(
+        (dynamic_cast<ReadableFromMatrixData<ValueType, LocalIndexType>*>(
+            local_mtx_.get())));
+    one_scalar_.init(exec, dim<2>{1, 1});
+    one_scalar_->fill(one<ValueType>());
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm, dim<2> size,
+    std::shared_ptr<LinOp> local_linop)
+    : EnableLinOp<
+          DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>{exec},
+      DistributedBase{comm},
+      send_offsets_(comm.size() + 1),
+      send_sizes_(comm.size()),
+      recv_offsets_(comm.size() + 1),
+      recv_sizes_(comm.size()),
+      gather_idxs_{exec},
+      non_local_to_global_{exec},
+      one_scalar_{}
+{
+    this->set_size(size);
+    one_scalar_.init(exec, dim<2>{1, 1});
+    one_scalar_->fill(one<ValueType>());
+    local_mtx_ = std::move(local_linop);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+std::unique_ptr<DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::create(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm)
+{
+    return std::unique_ptr<DdMatrix>{new DdMatrix{exec, comm}};
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+std::unique_ptr<DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::create(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm,
+    ptr_param<const LinOp> matrix_template)
+{
+    return std::unique_ptr<DdMatrix>{
+        new DdMatrix{exec, comm, matrix_template}};
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::convert_to(
+    DdMatrix<next_precision<value_type>, local_index_type,
+             global_index_type>* result) const
+{
+    GKO_ASSERT(this->get_communicator().size() ==
+               result->get_communicator().size());
+    result->local_mtx_->copy_from(this->local_mtx_);
+    result->gather_idxs_ = this->gather_idxs_;
+    result->send_offsets_ = this->send_offsets_;
+    result->recv_offsets_ = this->recv_offsets_;
+    result->recv_sizes_ = this->recv_sizes_;
+    result->send_sizes_ = this->send_sizes_;
+    result->non_local_to_global_ = this->non_local_to_global_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::move_to(
+    DdMatrix<next_precision<value_type>, local_index_type,
+             global_index_type>* result)
+{
+    GKO_ASSERT(this->get_communicator().size() ==
+               result->get_communicator().size());
+    result->local_mtx_->move_from(this->local_mtx_);
+    result->gather_idxs_ = std::move(this->gather_idxs_);
+    result->send_offsets_ = std::move(this->send_offsets_);
+    result->recv_offsets_ = std::move(this->recv_offsets_);
+    result->recv_sizes_ = std::move(this->recv_sizes_);
+    result->send_sizes_ = std::move(this->send_sizes_);
+    result->non_local_to_global_ = std::move(this->non_local_to_global_);
+    result->set_size(this->get_size());
+    this->set_size({});
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition)
+{
+    const auto comm = this->get_communicator();
+    GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
+    GKO_ASSERT_EQ(data.get_size()[1], col_partition->get_size());
+    GKO_ASSERT_EQ(comm.size(), row_partition->get_num_parts());
+    GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts());
+    auto exec = this->get_executor();
+    auto local_part = comm.rank();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto tmp_row_partition = make_temporary_clone(exec, row_partition);
+    auto tmp_col_partition = make_temporary_clone(exec, col_partition);
+
+    // set up LinOp sizes
+    auto global_num_rows = row_partition->get_size();
+    auto global_num_cols = col_partition->get_size();
+    dim<2> global_dim{global_num_rows, global_num_cols};
+    this->set_size(global_dim);
+
+    size_type num_parts = comm.size();
+    array<global_index_type> non_owning_row_idxs{exec};
+    array<global_index_type> non_owning_col_idxs{exec};
+    device_matrix_data<value_type, global_index_type> data_copy{exec, data};
+    auto arrays = data_copy.empty_out();
+
+    exec->run(dd_matrix::make_filter_non_owning_idxs(
+        data, make_temporary_clone(exec, row_partition).get(),
+        make_temporary_clone(exec, col_partition).get(), local_part,
+        non_owning_row_idxs, non_owning_col_idxs));
+
+    auto col_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, col_partition, local_part, non_owning_col_idxs);
+    auto row_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, row_partition, local_part, non_owning_row_idxs);
+
+    GlobalIndexType local_num_cols =
+        col_map.get_local_size() + col_map.get_non_local_size();
+    GlobalIndexType local_num_rows =
+        row_map.get_local_size() + row_map.get_non_local_size();
+    auto local_col_idxs = col_map.map_to_local(
+        arrays.col_idxs,
+        gko::experimental::distributed::index_space::combined);
+    auto local_row_idxs = row_map.map_to_local(
+        arrays.row_idxs,
+        gko::experimental::distributed::index_space::combined);
+
+    // Construct the local diagonal block.
+    device_matrix_data<value_type, local_index_type> local_data{
+        exec,
+        dim<2>{static_cast<size_type>(local_num_rows),
+               static_cast<size_type>(local_num_cols)},
+        local_row_idxs, local_col_idxs, arrays.values};
+    local_data.sort_row_major();
+    as<ReadableFromMatrixData<value_type, local_index_type>>(this->local_mtx_)
+        ->read(std::move(local_data));
+
+    // Gather local sizes from all ranks and build the partition in the
+    // enriched space.
+    array<global_index_type> range_bounds{
+        use_host_buffer ? exec->get_master() : exec, num_parts + 1};
+    comm.all_gather(exec, &local_num_rows, 1, range_bounds.get_data(), 1);
+    range_bounds.set_executor(exec);
+    exec->run(dd_matrix::make_prefix_sum_nonnegative(range_bounds.get_data(),
+                                                     num_parts + 1));
+    auto large_partition =
+        share(Partition<local_index_type, global_index_type>::
+                  build_from_contiguous(exec, range_bounds));
+
+    // Build the restriction and prolongation operators.
+    array<global_index_type> remote_idxs{exec, 0};
+    auto enriched_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, large_partition, local_part, remote_idxs);
+    array<local_index_type> local_idxs{
+        exec, static_cast<size_type>(local_num_rows)};
+    exec->run(dd_matrix::make_fill_seq_array(
+        local_idxs.get_data(), static_cast<size_type>(local_num_rows)));
+    auto restrict_col_idxs =
+        col_map.map_to_global(local_idxs, index_space::combined);
+    auto restrict_row_idxs =
+        enriched_map.map_to_global(local_idxs, index_space::combined);
+    array<value_type> restrict_values{
+        exec, static_cast<size_type>(local_num_rows)};
+    restrict_values.fill(one<value_type>());
+    device_matrix_data<value_type, global_index_type> restrict_data{
+        exec, dim<2>{large_partition->get_size(), col_partition->get_size()},
+        std::move(restrict_row_idxs), std::move(restrict_col_idxs),
+        std::move(restrict_values)};
+    restriction_ =
+        Matrix<value_type, local_index_type, global_index_type>::create(exec,
+                                                                        comm);
+    restriction_->read_distributed(restrict_data, large_partition,
+                                   col_partition);
+    auto prolongate_col_idxs =
+        enriched_map.map_to_global(local_idxs, index_space::combined);
+    auto prolongate_row_idxs =
+        row_map.map_to_global(local_idxs, index_space::combined);
+    array<value_type> prolongate_values{
+        exec, static_cast<size_type>(local_num_rows)};
+    prolongate_values.fill(one<value_type>());
+    device_matrix_data<value_type, global_index_type> prolongate_data{
+        exec, dim<2>{row_partition->get_size(), large_partition->get_size()},
+        std::move(prolongate_row_idxs), std::move(prolongate_col_idxs),
+        std::move(prolongate_values)};
+    prolongation_ =
+        Matrix<value_type, local_index_type, global_index_type>::create(exec,
+                                                                        comm);
+    prolongation_->read_distributed(prolongate_data, row_partition,
+                                    large_partition,
+                                    assembly_mode::communicate);
+
+    // Create buffers for SpMV
+    dim<2> global_buffer_size{large_partition->get_size(), 1u};
+    dim<2> local_buffer_size{static_cast<size_type>(local_num_rows), 1u};
+    lhs_buffer_ = Vector<value_type>::create(exec, comm, global_buffer_size,
+                                             local_buffer_size);
+    rhs_buffer_ = Vector<value_type>::create(exec, comm, global_buffer_size,
+                                             local_buffer_size);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition)
+{
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        row_partition, col_partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        partition, partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(data, partition, partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
+    const LinOp* b, LinOp* x) const
+{
+    auto exec = this->get_executor();
+    const auto nrhs = x->get_size()[1];
+    check_and_adjust_buffer_size(nrhs);
+    distributed::precision_dispatch_real_complex<ValueType>(
+        [this](const auto dense_b, auto dense_x) {
+            auto exec = this->get_executor();
+            restriction_->apply(dense_b, lhs_buffer_);
+
+            auto local_b = gko::matrix::Dense<ValueType>::create(
+                exec, lhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    lhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    lhs_buffer_->get_local_values()),
+                lhs_buffer_->get_local_vector()->get_stride());
+            auto local_x = gko::matrix::Dense<ValueType>::create(
+                exec, rhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    rhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    rhs_buffer_->get_local_values()),
+                rhs_buffer_->get_local_vector()->get_stride());
+
+            local_mtx_->apply(local_b, local_x);
+
+            prolongation_->apply(rhs_buffer_, dense_x);
+        },
+        b, x);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
+    const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const
+{
+    auto exec = this->get_executor();
+    const auto nrhs = x->get_size()[1];
+    check_and_adjust_buffer_size(nrhs);
+    distributed::precision_dispatch_real_complex<ValueType>(
+        [this](const auto local_alpha, const auto dense_b,
+               const auto local_beta, auto dense_x) {
+            auto exec = this->get_executor();
+            restriction_->apply(dense_b, lhs_buffer_);
+
+            auto local_b = gko::matrix::Dense<ValueType>::create(
+                exec, lhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    lhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    lhs_buffer_->get_local_values()),
+                lhs_buffer_->get_local_vector()->get_stride());
+            auto local_x = gko::matrix::Dense<ValueType>::create(
+                exec, rhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    rhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    rhs_buffer_->get_local_values()),
+                rhs_buffer_->get_local_vector()->get_stride());
+
+            local_mtx_->apply(local_b, local_x);
+
+            prolongation_->apply(local_alpha, rhs_buffer_, local_beta,
+                                 dense_x);
+        },
+        alpha, b, beta, x);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::
+    check_and_adjust_buffer_size(const size_type nrhs) const
+{
+    auto exec = this->get_executor();
+    auto comm = this->get_communicator();
+    if (nrhs != rhs_buffer_->get_size()[1]) {
+        dim<2> local_buffer_size{
+            rhs_buffer_->get_local_vector()->get_size()[0], nrhs};
+        dim<2> global_buffer_size{rhs_buffer_->get_size()[0], nrhs};
+        lhs_buffer_ = Vector<value_type>::create(
+            exec, comm, global_buffer_size, local_buffer_size);
+        rhs_buffer_ = Vector<value_type>::create(
+            exec, comm, global_buffer_size, local_buffer_size);
+    }
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::col_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    auto exec = this->get_executor();
+    check_and_adjust_buffer_size(1u);
+    size_type n_local_cols = local_mtx_->get_size()[1];
+    restriction_->apply(scaling_factors, lhs_buffer_);
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_cols,
+        make_const_array_view(exec, n_local_cols,
+                              lhs_buffer_->get_const_local_values()));
+    scale_diag->rapply(local_mtx_, local_mtx_);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::row_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    auto exec = this->get_executor();
+    check_and_adjust_buffer_size(1u);
+    size_type n_local_cols = local_mtx_->get_size()[1];
+    restriction_->apply(scaling_factors, lhs_buffer_);
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_cols,
+        make_const_array_view(exec, n_local_cols,
+                              lhs_buffer_->get_const_local_values()));
+    scale_diag->apply(local_mtx_, local_mtx_);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    const DdMatrix& other)
+    : EnableLinOp<DdMatrix<ValueType, LocalIndexType,
+                           GlobalIndexType>>{other.get_executor()},
+      DistributedBase{other.get_communicator()}
+{
+    *this = other;
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    DdMatrix&& other) noexcept
+    : EnableLinOp<DdMatrix<ValueType, LocalIndexType,
+                           GlobalIndexType>>{other.get_executor()},
+      DistributedBase{other.get_communicator()}
+{
+    *this = std::move(other);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>&
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(
+    const DdMatrix& other)
+{
+    if (this != &other) {
+        GKO_ASSERT_EQ(other.get_communicator().size(),
+                      this->get_communicator().size());
+        this->set_size(other.get_size());
+        local_mtx_->copy_from(other.local_mtx_);
+        gather_idxs_ = other.gather_idxs_;
+        send_offsets_ = other.send_offsets_;
+        recv_offsets_ = other.recv_offsets_;
+        send_sizes_ = other.send_sizes_;
+        recv_sizes_ = other.recv_sizes_;
+        non_local_to_global_ = other.non_local_to_global_;
+        one_scalar_.init(this->get_executor(), dim<2>{1, 1});
+        one_scalar_->fill(one<ValueType>());
+    }
+    return *this;
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>&
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(
+    DdMatrix&& other)
+{
+    if (this != &other) {
+        GKO_ASSERT_EQ(other.get_communicator().size(),
+                      this->get_communicator().size());
+        this->set_size(other.get_size());
+        other.set_size({});
+        local_mtx_->move_from(other.local_mtx_);
+        gather_idxs_ = std::move(other.gather_idxs_);
+        send_offsets_ = std::move(other.send_offsets_);
+        recv_offsets_ = std::move(other.recv_offsets_);
+        send_sizes_ = std::move(other.send_sizes_);
+        recv_sizes_ = std::move(other.recv_sizes_);
+        non_local_to_global_ = std::move(other.non_local_to_global_);
+        one_scalar_.init(this->get_executor(), dim<2>{1, 1});
+        one_scalar_->fill(one<ValueType>());
+    }
+    return *this;
+}
+
+
+#define GKO_DECLARE_DISTRIBUTED_DD_MATRIX(ValueType, LocalIndexType, \
+                                          GlobalIndexType)           \
+    class DdMatrix<ValueType, LocalIndexType, GlobalIndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_DISTRIBUTED_DD_MATRIX);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
diff --git a/core/distributed/dd_matrix_kernels.hpp b/core/distributed/dd_matrix_kernels.hpp
new file mode 100644
index 00000000000..57380065f1c
--- /dev/null
+++ b/core/distributed/dd_matrix_kernels.hpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
+#define GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
+
+
+#include
+#include
+#include
+#include
+#include
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_FILTER_NON_OWNING_IDXS(ValueType, LocalIndexType,   \
+                                           GlobalIndexType)             \
+    void filter_non_owning_idxs(                                        \
+        std::shared_ptr<const DefaultExecutor> exec,                    \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* row_partition,            \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* col_partition,            \
+        comm_index_type local_part,                                     \
+        array<GlobalIndexType>& non_local_row_idxs,                     \
+        array<GlobalIndexType>& non_local_col_idxs)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                     \
+    using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS(ValueType, LocalIndexType,       \
+                                       GlobalIndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_dd_matrix,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 81a2a6034ea..f564eb00529 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -21,6 +21,7 @@ target_sources(ginkgo_dpcpp
     base/version.dp.cpp
     components/prefix_sum_kernels.dp.cpp
     distributed/assembly_kernels.dp.cpp
+    distributed/dd_matrix_kernels.dp.cpp
     distributed/index_map_kernels.dp.cpp
     distributed/matrix_kernels.dp.cpp
     distributed/partition_helpers_kernels.dp.cpp
diff --git a/dpcpp/distributed/dd_matrix_kernels.dp.cpp b/dpcpp/distributed/dd_matrix_kernels.dp.cpp
new file mode 100644
index 00000000000..a5b4dda5fd5
--- /dev/null
+++ b/dpcpp/distributed/dd_matrix_kernels.dp.cpp
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+#include
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace distributed_dd_matrix {
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void filter_non_owning_idxs(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* row_partition,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* col_partition,
+    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
+    array<GlobalIndexType>& non_local_col_idxs) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/include/ginkgo/core/distributed/dd_matrix.hpp b/include/ginkgo/core/distributed/dd_matrix.hpp
new file mode 100644
index 00000000000..0ed7a431985
--- /dev/null
+++ b/include/ginkgo/core/distributed/dd_matrix.hpp
@@ -0,0 +1,467 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_DD_MATRIX_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_DD_MATRIX_HPP_
+
+
+#include
+
+
+#if GINKGO_BUILD_MPI
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace gko {
+namespace matrix {
+
+
+template <typename ValueType, typename IndexType>
+class Csr;
+
+
+}
+
+
+namespace detail {
+
+
+/**
+ * Helper struct to test if the Builder type has a function
+ * create<ValueType, IndexType>(std::shared_ptr<const Executor>).
+ */
+template <typename Builder, typename ValueType, typename IndexType,
+          typename = void>
+struct is_matrix_type_builder;
+
+
+template