diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 5cfa55ca687..c16c80d2d6a 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -5,6 +5,7 @@ set(CUDA_HIP_SOURCES
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
     distributed/assembly_kernels.cpp
+    distributed/dd_matrix_kernels.cpp
     distributed/index_map_kernels.cpp
     distributed/matrix_kernels.cpp
     distributed/partition_helpers_kernels.cpp
diff --git a/common/cuda_hip/distributed/dd_matrix_kernels.cpp b/common/cuda_hip/distributed/dd_matrix_kernels.cpp
new file mode 100644
index 00000000000..711a858995a
--- /dev/null
+++ b/common/cuda_hip/distributed/dd_matrix_kernels.cpp
@@ -0,0 +1,136 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/components/atomic.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace distributed_dd_matrix {
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void filter_non_owning_idxs(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* row_partition,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* col_partition,
+    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
+    array<GlobalIndexType>& non_local_col_idxs)
+{
+    auto input_vals = input.get_const_values();
+    auto row_part_ids = row_partition->get_part_ids();
+    auto col_part_ids = col_partition->get_part_ids();
+    const auto* row_range_bounds = row_partition->get_range_bounds();
+    const auto* col_range_bounds = col_partition->get_range_bounds();
+    const auto* row_range_starting_indices =
+        row_partition->get_range_starting_indices();
+    const auto* col_range_starting_indices =
+        col_partition->get_range_starting_indices();
+    const auto num_row_ranges = row_partition->get_num_ranges();
+    const auto num_col_ranges = col_partition->get_num_ranges();
+    const auto num_input_elements = input.get_num_stored_elements();
+
+    auto policy = thrust_policy(exec);
+
+    // precompute the row and column range id of each input element
+    auto input_row_idxs = input.get_const_row_idxs();
+    auto input_col_idxs = input.get_const_col_idxs();
+    array<size_type> row_range_ids{exec, num_input_elements};
+    thrust::upper_bound(policy, row_range_bounds + 1,
+                        row_range_bounds + num_row_ranges + 1, input_row_idxs,
+                        input_row_idxs + num_input_elements,
+                        row_range_ids.get_data());
+    array<size_type> col_range_ids{exec, input.get_num_stored_elements()};
+    thrust::upper_bound(policy, col_range_bounds + 1,
+                        col_range_bounds + num_col_ranges + 1, input_col_idxs,
+                        input_col_idxs + num_input_elements,
+                        col_range_ids.get_data());
+
+    // count the number of non-local row and column indices
+    auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple(
+        row_range_ids.get_const_data(), col_range_ids.get_const_data()));
+    auto num_elements_pair = thrust::transform_reduce(
+        policy, range_ids_it, range_ids_it + num_input_elements,
+        [local_part, row_part_ids, col_part_ids] __host__ __device__(
+            const thrust::tuple<size_type, size_type>& tuple) {
+            auto row_part = row_part_ids[thrust::get<0>(tuple)];
+            auto col_part = col_part_ids[thrust::get<1>(tuple)];
+            bool is_local_row = row_part == local_part;
+            bool is_local_col = col_part == local_part;
+            return thrust::make_tuple(
+                is_local_row ? size_type{0} : size_type{1},
+                is_local_col ? size_type{0} : size_type{1});
+        },
+        thrust::make_tuple(size_type{}, size_type{}),
+        [] __host__ __device__(const thrust::tuple<size_type, size_type>& a,
+                               const thrust::tuple<size_type, size_type>& b) {
+            return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b),
+                                      thrust::get<1>(a) + thrust::get<1>(b));
+        });
+    auto n_non_local_row_idxs = thrust::get<0>(num_elements_pair);
+    auto n_non_local_col_idxs = thrust::get<1>(num_elements_pair);
+
+    // define global-to-local maps for row and column indices
+    auto map_to_local_row =
+        [row_range_bounds, row_range_starting_indices] __host__ __device__(
+            const GlobalIndexType row, const size_type range_id) {
+            return static_cast<LocalIndexType>(row -
+                                               row_range_bounds[range_id]) +
+                   row_range_starting_indices[range_id];
+        };
+    auto map_to_local_col =
+        [col_range_bounds, col_range_starting_indices] __host__ __device__(
+            const GlobalIndexType col, const size_type range_id) {
+            return static_cast<LocalIndexType>(col -
+                                               col_range_bounds[range_id]) +
+                   col_range_starting_indices[range_id];
+        };
+
+    non_local_col_idxs.resize_and_reset(n_non_local_col_idxs);
+    non_local_row_idxs.resize_and_reset(n_non_local_row_idxs);
+    thrust::copy_if(policy, input_col_idxs,
+                    input_col_idxs + num_input_elements, range_ids_it,
+                    non_local_col_idxs.get_data(),
+                    [local_part, col_part_ids] __host__ __device__(
+                        const thrust::tuple<size_type, size_type>& tuple) {
+                        auto col_part = col_part_ids[thrust::get<1>(tuple)];
+                        return col_part != local_part;
+                    });
+    thrust::copy_if(policy, input_row_idxs,
+                    input_row_idxs + num_input_elements, range_ids_it,
+                    non_local_row_idxs.get_data(),
+                    [local_part, row_part_ids] __host__ __device__(
+                        const thrust::tuple<size_type, size_type>& tuple) {
+                        auto row_part = row_part_ids[thrust::get<0>(tuple)];
+                        return row_part != local_part;
+                    });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 7901edf5341..133e92af639 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -142,6 +142,7 @@ if(GINKGO_BUILD_MPI)
         distributed/vector_cache.cpp
         mpi/exception.cpp
         distributed/assembly.cpp
+        distributed/dd_matrix.cpp
         distributed/matrix.cpp
        distributed/partition_helpers.cpp
        distributed/vector.cpp
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 480bec0b278..a6b3edc1ec8 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -17,6 +17,7 @@
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/components/reduce_array_kernels.hpp"
 #include "core/distributed/assembly_kernels.hpp"
+#include "core/distributed/dd_matrix_kernels.hpp"
 #include "core/distributed/index_map_kernels.hpp"
 #include "core/distributed/matrix_kernels.hpp"
 #include "core/distributed/partition_helpers_kernels.hpp"
@@ -359,6 +360,15 @@ GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
 }  // namespace distributed_matrix
 
 
+namespace distributed_dd_matrix {
+
+
+GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+
+
 namespace batch_multi_vector {
diff --git a/core/distributed/dd_matrix.cpp b/core/distributed/dd_matrix.cpp
new file mode 100644
index 00000000000..8a28d8c1c93
--- /dev/null
+++ b/core/distributed/dd_matrix.cpp
@@ -0,0 +1,508 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "ginkgo/core/distributed/dd_matrix.hpp"
+
+#include
+#include
+#include
+#include
+#include
+
+#include "core/components/fill_array_kernels.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+namespace gko {
+namespace experimental {
+namespace distributed {
+namespace dd_matrix {
+namespace {
+
+
+GKO_REGISTER_OPERATION(filter_non_owning_idxs,
+                       distributed_dd_matrix::filter_non_owning_idxs);
+GKO_REGISTER_OPERATION(fill_seq_array, components::fill_seq_array);
+GKO_REGISTER_OPERATION(prefix_sum_nonnegative,
+                       components::prefix_sum_nonnegative);
+
+
+}  // namespace
+}  // namespace dd_matrix
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm)
+    : DdMatrix(exec, comm,
+               gko::matrix::Csr<ValueType, LocalIndexType>::create(exec))
+{}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm,
+    ptr_param<const LinOp> matrix_template)
+    : EnableLinOp<
+          DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>{exec},
+      DistributedBase{comm},
+      send_offsets_(comm.size() + 1),
+      send_sizes_(comm.size()),
+      recv_offsets_(comm.size() + 1),
+      recv_sizes_(comm.size()),
+      gather_idxs_{exec},
+      non_local_to_global_{exec},
+      one_scalar_{},
+      local_mtx_{matrix_template->clone(exec)}
+{
+    GKO_ASSERT(
+        (dynamic_cast<ReadableFromMatrixData<ValueType, LocalIndexType>*>(
+            local_mtx_.get())));
+    one_scalar_.init(exec, dim<2>{1, 1});
+    one_scalar_->fill(one<ValueType>());
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm, dim<2> size,
+    std::shared_ptr<LinOp> local_linop)
+    : EnableLinOp<
+          DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>{exec},
+      DistributedBase{comm},
+      send_offsets_(comm.size() + 1),
+      send_sizes_(comm.size()),
+      recv_offsets_(comm.size() + 1),
+      recv_sizes_(comm.size()),
+      gather_idxs_{exec},
+      non_local_to_global_{exec},
+      one_scalar_{}
+{
+    this->set_size(size);
+    one_scalar_.init(exec, dim<2>{1, 1});
+    one_scalar_->fill(one<ValueType>());
+    local_mtx_ = std::move(local_linop);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+std::unique_ptr<DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::create(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm)
+{
+    return std::unique_ptr<DdMatrix>{new DdMatrix{exec, comm}};
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+std::unique_ptr<DdMatrix<ValueType, LocalIndexType, GlobalIndexType>>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::create(
+    std::shared_ptr<const Executor> exec, mpi::communicator comm,
+    ptr_param<const LinOp> matrix_template)
+{
+    return std::unique_ptr<DdMatrix>{
+        new DdMatrix{exec, comm, matrix_template}};
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::convert_to(
+    DdMatrix<next_precision<value_type>, local_index_type,
+             global_index_type>* result) const
+{
+    GKO_ASSERT(this->get_communicator().size() ==
+               result->get_communicator().size());
+    result->local_mtx_->copy_from(this->local_mtx_);
+    result->gather_idxs_ = this->gather_idxs_;
+    result->send_offsets_ = this->send_offsets_;
+    result->recv_offsets_ = this->recv_offsets_;
+    result->recv_sizes_ = this->recv_sizes_;
+    result->send_sizes_ = this->send_sizes_;
+    result->non_local_to_global_ = this->non_local_to_global_;
+    result->set_size(this->get_size());
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::move_to(
+    DdMatrix<next_precision<value_type>, local_index_type,
+             global_index_type>* result)
+{
+    GKO_ASSERT(this->get_communicator().size() ==
+               result->get_communicator().size());
+    result->local_mtx_->move_from(this->local_mtx_);
+    result->gather_idxs_ = std::move(this->gather_idxs_);
+    result->send_offsets_ = std::move(this->send_offsets_);
+    result->recv_offsets_ = std::move(this->recv_offsets_);
+    result->recv_sizes_ = std::move(this->recv_sizes_);
+    result->send_sizes_ = std::move(this->send_sizes_);
+    result->non_local_to_global_ = std::move(this->non_local_to_global_);
+    result->set_size(this->get_size());
+    this->set_size({});
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition)
+{
+    const auto comm = this->get_communicator();
+    GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size());
+    GKO_ASSERT_EQ(data.get_size()[1], col_partition->get_size());
+    GKO_ASSERT_EQ(comm.size(), row_partition->get_num_parts());
+    GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts());
+    auto exec = this->get_executor();
+    auto local_part = comm.rank();
+    auto use_host_buffer = mpi::requires_host_buffer(exec, comm);
+    auto tmp_row_partition = make_temporary_clone(exec, row_partition);
+    auto tmp_col_partition = make_temporary_clone(exec, col_partition);
+
+    // set up LinOp sizes
+    auto global_num_rows = row_partition->get_size();
+    auto global_num_cols = col_partition->get_size();
+    dim<2> global_dim{global_num_rows, global_num_cols};
+    this->set_size(global_dim);
+
+    size_type num_parts = comm.size();
+    array<global_index_type> non_owning_row_idxs{exec};
+    array<global_index_type> non_owning_col_idxs{exec};
+    device_matrix_data<value_type, global_index_type> data_copy{exec, data};
+    auto arrays = data_copy.empty_out();
+
+    exec->run(dd_matrix::make_filter_non_owning_idxs(
+        data, make_temporary_clone(exec, row_partition).get(),
+        make_temporary_clone(exec, col_partition).get(), local_part,
+        non_owning_row_idxs, non_owning_col_idxs));
+
+    auto col_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, col_partition, local_part, non_owning_col_idxs);
+    auto row_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, row_partition, local_part, non_owning_row_idxs);
+
+    GlobalIndexType local_num_cols =
+        col_map.get_local_size() + col_map.get_non_local_size();
+    GlobalIndexType local_num_rows =
+        row_map.get_local_size() + row_map.get_non_local_size();
+    auto local_col_idxs = col_map.map_to_local(
+        arrays.col_idxs,
+        gko::experimental::distributed::index_space::combined);
+    auto local_row_idxs = row_map.map_to_local(
+        arrays.row_idxs,
+        gko::experimental::distributed::index_space::combined);
+
+    // Construct the local diagonal block.
+    device_matrix_data<value_type, local_index_type> local_data{
+        exec,
+        dim<2>{static_cast<size_type>(local_num_rows),
+               static_cast<size_type>(local_num_cols)},
+        local_row_idxs, local_col_idxs, arrays.values};
+    local_data.sort_row_major();
+    as<ReadableFromMatrixData<value_type, local_index_type>>(this->local_mtx_)
+        ->read(std::move(local_data));
+
+    // Gather local sizes from all ranks and build the partition in the
+    // enriched space.
+    array<global_index_type> range_bounds{
+        use_host_buffer ? exec->get_master() : exec, num_parts + 1};
+    comm.all_gather(exec, &local_num_rows, 1, range_bounds.get_data(), 1);
+    range_bounds.set_executor(exec);
+    exec->run(dd_matrix::make_prefix_sum_nonnegative(range_bounds.get_data(),
+                                                     num_parts + 1));
+    auto large_partition =
+        share(Partition<local_index_type, global_index_type>::
+                  build_from_contiguous(exec, range_bounds));
+
+    // Build the restriction and prolongation operators.
+    array<global_index_type> remote_idxs{exec, 0};
+    auto enriched_map =
+        gko::experimental::distributed::index_map<local_index_type,
+                                                  global_index_type>(
+            exec, large_partition, local_part, remote_idxs);
+    array<local_index_type> local_idxs{
+        exec, static_cast<size_type>(local_num_rows)};
+    exec->run(dd_matrix::make_fill_seq_array(
+        local_idxs.get_data(), static_cast<size_type>(local_num_rows)));
+    auto restrict_col_idxs =
+        col_map.map_to_global(local_idxs, index_space::combined);
+    auto restrict_row_idxs =
+        enriched_map.map_to_global(local_idxs, index_space::combined);
+    array<value_type> restrict_values{
+        exec, static_cast<size_type>(local_num_rows)};
+    restrict_values.fill(one<value_type>());
+    device_matrix_data<value_type, global_index_type> restrict_data{
+        exec, dim<2>{large_partition->get_size(), col_partition->get_size()},
+        std::move(restrict_row_idxs), std::move(restrict_col_idxs),
+        std::move(restrict_values)};
+    restriction_ =
+        Matrix<value_type, local_index_type, global_index_type>::create(exec,
+                                                                        comm);
+    restriction_->read_distributed(restrict_data, large_partition,
+                                   col_partition);
+    auto prolongate_col_idxs =
+        enriched_map.map_to_global(local_idxs, index_space::combined);
+    auto prolongate_row_idxs =
+        row_map.map_to_global(local_idxs, index_space::combined);
+    array<value_type> prolongate_values{
+        exec, static_cast<size_type>(local_num_rows)};
+    prolongate_values.fill(one<value_type>());
+    device_matrix_data<value_type, global_index_type> prolongate_data{
+        exec, dim<2>{row_partition->get_size(), large_partition->get_size()},
+        std::move(prolongate_row_idxs), std::move(prolongate_col_idxs),
+        std::move(prolongate_values)};
+    prolongation_ =
+        Matrix<value_type, local_index_type, global_index_type>::create(exec,
+                                                                        comm);
+    prolongation_->read_distributed(prolongate_data, row_partition,
+                                    large_partition,
+                                    assembly_mode::communicate);
+
+    // Create buffers for SpMV
+    dim<2> global_buffer_size{large_partition->get_size(), 1u};
+    dim<2> local_buffer_size{static_cast<size_type>(local_num_rows), 1u};
+    lhs_buffer_ = Vector<value_type>::create(exec, comm, global_buffer_size,
+                                             local_buffer_size);
+    rhs_buffer_ = Vector<value_type>::create(exec, comm, global_buffer_size,
+                                             local_buffer_size);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        row_partition,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        col_partition)
+{
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        row_partition, col_partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(
+        device_matrix_data<value_type, global_index_type>::create_from_host(
+            this->get_executor(), data),
+        partition, partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::read_distributed(
+    const device_matrix_data<value_type, global_index_type>& data,
+    std::shared_ptr<const Partition<local_index_type, global_index_type>>
+        partition)
+{
+    return this->read_distributed(data, partition, partition);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
+    const LinOp* b, LinOp* x) const
+{
+    auto exec = this->get_executor();
+    const auto nrhs = x->get_size()[1];
+    check_and_adjust_buffer_size(nrhs);
+    distributed::precision_dispatch_real_complex<ValueType>(
+        [this](const auto dense_b, auto dense_x) {
+            auto exec = this->get_executor();
+            restriction_->apply(dense_b, lhs_buffer_);
+
+            auto local_b = gko::matrix::Dense<ValueType>::create(
+                exec, lhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    lhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    lhs_buffer_->get_local_values()),
+                lhs_buffer_->get_local_vector()->get_stride());
+            auto local_x = gko::matrix::Dense<ValueType>::create(
+                exec, rhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    rhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    rhs_buffer_->get_local_values()),
+                rhs_buffer_->get_local_vector()->get_stride());
+
+            local_mtx_->apply(local_b, local_x);
+
+            prolongation_->apply(rhs_buffer_, dense_x);
+        },
+        b, x);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::apply_impl(
+    const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const
+{
+    auto exec = this->get_executor();
+    const auto nrhs = x->get_size()[1];
+    check_and_adjust_buffer_size(nrhs);
+    distributed::precision_dispatch_real_complex<ValueType>(
+        [this](const auto local_alpha, const auto dense_b,
+               const auto local_beta, auto dense_x) {
+            auto exec = this->get_executor();
+            restriction_->apply(dense_b, lhs_buffer_);
+
+            auto local_b = gko::matrix::Dense<ValueType>::create(
+                exec, lhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    lhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    lhs_buffer_->get_local_values()),
+                lhs_buffer_->get_local_vector()->get_stride());
+            auto local_x = gko::matrix::Dense<ValueType>::create(
+                exec, rhs_buffer_->get_local_vector()->get_size(),
+                gko::make_array_view(
+                    exec,
+                    rhs_buffer_->get_local_vector()->get_num_stored_elements(),
+                    rhs_buffer_->get_local_values()),
+                rhs_buffer_->get_local_vector()->get_stride());
+
+            local_mtx_->apply(local_b, local_x);
+
+            prolongation_->apply(local_alpha, rhs_buffer_, local_beta,
+                                 dense_x);
+        },
+        alpha, b, beta, x);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::
+    check_and_adjust_buffer_size(const size_type nrhs) const
+{
+    auto exec = this->get_executor();
+    auto comm = this->get_communicator();
+    if (nrhs != rhs_buffer_->get_size()[1]) {
+        dim<2> local_buffer_size{
+            rhs_buffer_->get_local_vector()->get_size()[0], nrhs};
+        dim<2> global_buffer_size{rhs_buffer_->get_size()[0], nrhs};
+        lhs_buffer_ = Vector<value_type>::create(
+            exec, comm, global_buffer_size, local_buffer_size);
+        rhs_buffer_ = Vector<value_type>::create(
+            exec, comm, global_buffer_size, local_buffer_size);
+    }
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::col_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    auto exec = this->get_executor();
+    check_and_adjust_buffer_size(1u);
+    size_type n_local_cols = local_mtx_->get_size()[1];
+    restriction_->apply(scaling_factors, lhs_buffer_);
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_cols,
+        make_const_array_view(exec, n_local_cols,
+                              lhs_buffer_->get_const_local_values()));
+    scale_diag->rapply(local_mtx_, local_mtx_);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::row_scale(
+    ptr_param<const global_vector_type> scaling_factors)
+{
+    auto exec = this->get_executor();
+    check_and_adjust_buffer_size(1u);
+    size_type n_local_cols = local_mtx_->get_size()[1];
+    restriction_->apply(scaling_factors, lhs_buffer_);
+    const auto scale_diag = gko::matrix::Diagonal<ValueType>::create_const(
+        exec, n_local_cols,
+        make_const_array_view(exec, n_local_cols,
+                              lhs_buffer_->get_const_local_values()));
+    scale_diag->apply(local_mtx_, local_mtx_);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    const DdMatrix& other)
+    : EnableLinOp<DdMatrix<ValueType, LocalIndexType,
+                           GlobalIndexType>>{other.get_executor()},
+      DistributedBase{other.get_communicator()}
+{
+    *this = other;
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::DdMatrix(
+    DdMatrix&& other) noexcept
+    : EnableLinOp<DdMatrix<ValueType, LocalIndexType,
+                           GlobalIndexType>>{other.get_executor()},
+      DistributedBase{other.get_communicator()}
+{
+    *this = std::move(other);
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>&
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(
+    const DdMatrix& other)
+{
+    if (this != &other) {
+        GKO_ASSERT_EQ(other.get_communicator().size(),
+                      this->get_communicator().size());
+        this->set_size(other.get_size());
+        local_mtx_->copy_from(other.local_mtx_);
+        gather_idxs_ = other.gather_idxs_;
+        send_offsets_ = other.send_offsets_;
+        recv_offsets_ = other.recv_offsets_;
+        send_sizes_ = other.send_sizes_;
+        recv_sizes_ = other.recv_sizes_;
+        non_local_to_global_ = other.non_local_to_global_;
+        one_scalar_.init(this->get_executor(), dim<2>{1, 1});
+        one_scalar_->fill(one<ValueType>());
+    }
+    return *this;
+}
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>&
+DdMatrix<ValueType, LocalIndexType, GlobalIndexType>::operator=(
+    DdMatrix&& other)
+{
+    if (this != &other) {
+        GKO_ASSERT_EQ(other.get_communicator().size(),
+                      this->get_communicator().size());
+        this->set_size(other.get_size());
+        other.set_size({});
+        local_mtx_->move_from(other.local_mtx_);
+        gather_idxs_ = std::move(other.gather_idxs_);
+        send_offsets_ = std::move(other.send_offsets_);
+        recv_offsets_ = std::move(other.recv_offsets_);
+        send_sizes_ = std::move(other.send_sizes_);
+        recv_sizes_ = std::move(other.recv_sizes_);
+        non_local_to_global_ = std::move(other.non_local_to_global_);
+        one_scalar_.init(this->get_executor(), dim<2>{1, 1});
+        one_scalar_->fill(one<ValueType>());
+    }
+    return *this;
+}
+
+
+#define GKO_DECLARE_DISTRIBUTED_DD_MATRIX(ValueType, LocalIndexType, \
+                                          GlobalIndexType)           \
+    class DdMatrix<ValueType, LocalIndexType, GlobalIndexType>
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE_BASE(
+    GKO_DECLARE_DISTRIBUTED_DD_MATRIX);
+
+
+}  // namespace distributed
+}  // namespace experimental
+}  // namespace gko
diff --git a/core/distributed/dd_matrix_kernels.hpp b/core/distributed/dd_matrix_kernels.hpp
new file mode 100644
index 00000000000..57380065f1c
--- /dev/null
+++ b/core/distributed/dd_matrix_kernels.hpp
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
+#define GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
+
+
+#include
+#include
+#include
+#include
+#include
+
+#include "core/base/kernel_declaration.hpp"
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_FILTER_NON_OWNING_IDXS(ValueType, LocalIndexType,   \
+                                           GlobalIndexType)             \
+    void filter_non_owning_idxs(                                        \
+        std::shared_ptr<const DefaultExecutor> exec,                    \
+        const device_matrix_data<ValueType, GlobalIndexType>& input,    \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* row_partition,            \
+        const experimental::distributed::Partition<                     \
+            LocalIndexType, GlobalIndexType>* col_partition,            \
+        comm_index_type local_part,                                     \
+        array<GlobalIndexType>& non_local_row_idxs,                     \
+        array<GlobalIndexType>& non_local_col_idxs)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES                                     \
+    using comm_index_type = experimental::distributed::comm_index_type; \
+    template <typename ValueType, typename LocalIndexType,              \
+              typename GlobalIndexType>                                 \
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS(ValueType, LocalIndexType,       \
+                                       GlobalIndexType)
+
+
+GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_dd_matrix,
+                                        GKO_DECLARE_ALL_AS_TEMPLATES);
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+
+#endif  // GKO_CORE_DISTRIBUTED_DD_MATRIX_KERNELS_HPP_
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 81a2a6034ea..f564eb00529 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -21,6 +21,7 @@ target_sources(ginkgo_dpcpp
     base/version.dp.cpp
     components/prefix_sum_kernels.dp.cpp
     distributed/assembly_kernels.dp.cpp
+    distributed/dd_matrix_kernels.dp.cpp
     distributed/index_map_kernels.dp.cpp
     distributed/matrix_kernels.dp.cpp
     distributed/partition_helpers_kernels.dp.cpp
diff --git a/dpcpp/distributed/dd_matrix_kernels.dp.cpp b/dpcpp/distributed/dd_matrix_kernels.dp.cpp
new file mode 100644
index 00000000000..a5b4dda5fd5
--- /dev/null
+++ b/dpcpp/distributed/dd_matrix_kernels.dp.cpp
@@ -0,0 +1,34 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/distributed/dd_matrix_kernels.hpp"
+
+#include
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace distributed_dd_matrix {
+
+
+template <typename ValueType, typename LocalIndexType,
+          typename GlobalIndexType>
+void filter_non_owning_idxs(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const device_matrix_data<ValueType, GlobalIndexType>& input,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* row_partition,
+    const experimental::distributed::Partition<LocalIndexType,
+                                               GlobalIndexType>* col_partition,
+    comm_index_type local_part, array<GlobalIndexType>& non_local_row_idxs,
+    array<GlobalIndexType>& non_local_col_idxs) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(
+    GKO_DECLARE_FILTER_NON_OWNING_IDXS);
+
+
+}  // namespace distributed_dd_matrix
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/include/ginkgo/core/distributed/dd_matrix.hpp b/include/ginkgo/core/distributed/dd_matrix.hpp
new file mode 100644
index 00000000000..0ed7a431985
--- /dev/null
+++ b/include/ginkgo/core/distributed/dd_matrix.hpp
@@ -0,0 +1,467 @@
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_DD_MATRIX_HPP_
+#define GKO_PUBLIC_CORE_DISTRIBUTED_DD_MATRIX_HPP_
+
+
+#include
+
+
+#if GINKGO_BUILD_MPI
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace gko {
+namespace matrix {
+
+
+template <typename ValueType, typename IndexType>
+class Csr;
+
+
+}
+
+
+namespace detail {
+
+
+/**
+ * Helper struct to test if the Builder type has a function
+ * create<ValueType, IndexType>(std::shared_ptr<const Executor>).
+ */
+template <typename Builder, typename ValueType, typename IndexType,
+          typename = void>
+struct is_matrix_type_builder;
+
+
+template