From d49e5e6a3575d4ccc5b629be85c0141e8a3c60d7 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 25 Mar 2024 06:01:31 +0100
Subject: [PATCH 1/4] debug bellman_fod

---
 cpp/CMakeLists.txt                      |   2 +
 cpp/src/traversal/bellman_ford_impl.cuh | 408 ++++++++++++++++++++++++
 cpp/src/traversal/bellman_ford_mg.cu    |  68 ++++
 cpp/src/traversal/bellman_ford_sg.cu    |  68 ++++
 4 files changed, 546 insertions(+)
 create mode 100644 cpp/src/traversal/bellman_ford_impl.cuh
 create mode 100644 cpp/src/traversal/bellman_ford_mg.cu
 create mode 100644 cpp/src/traversal/bellman_ford_sg.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6070621134d..540a784b129 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -280,6 +280,8 @@ set(CUGRAPH_SOURCES
     src/traversal/bfs_sg.cu
     src/traversal/bfs_mg.cu
     src/traversal/sssp_sg.cu
+    src/traversal/bellman_ford_sg.cu
+    src/traversal/bellman_ford_mg.cu
     src/traversal/od_shortest_distances_sg.cu
     src/traversal/sssp_mg.cu
     src/link_analysis/hits_sg.cu
diff --git a/cpp/src/traversal/bellman_ford_impl.cuh b/cpp/src/traversal/bellman_ford_impl.cuh
new file mode 100644
index 00000000000..c7bf56b208f
--- /dev/null
+++ b/cpp/src/traversal/bellman_ford_impl.cuh
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "prims/fill_edge_property.cuh"
+#include "prims/reduce_op.cuh"
+#include "prims/transform_e.cuh"
+#include "prims/transform_reduce_e_by_src_dst_key.cuh"
+#include "prims/update_edge_src_dst_property.cuh"
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/detail/shuffle_wrappers.hpp>
+#include <cugraph/detail/utility_wrappers.hpp>
+
+#include <raft/core/handle.hpp>
+
+#include <thrust/fill.h>
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+void bellman_ford(raft::handle_t const& handle,
+                  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+                  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                  vertex_t source,
+                  vertex_t* predecessors,
+                  weight_t* distances)
+{
+  using graph_view_t = cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>;
+  graph_view_t current_graph_view(graph_view);
+
+  // edge mask
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_even(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+
+  cugraph::edge_property_t<graph_view_t, bool> edge_masks_odd(handle, current_graph_view);
+  cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+
+  cugraph::transform_e(
+    handle,
+    current_graph_view,
+    edge_src_dummy_property_t{}.view(),
+    edge_dst_dummy_property_t{}.view(),
+    cugraph::edge_dummy_property_t{}.view(),
+    [] __device__(auto src, auto dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) {
+      return !(src == dst);  // mask out self-loop
+    },
+    edge_masks_even.mutable_view());
+
+  current_graph_view.attach_edge_mask(edge_masks_even.view());
+
+  // 2. initialize distances and predecessors
+
+  auto constexpr invalid_distance = std::numeric_limits<weight_t>::max();
+  auto constexpr invalid_vertex   = invalid_vertex_id<vertex_t>::value;
+
+  auto val_first = thrust::make_zip_iterator(thrust::make_tuple(distances, predecessors));
+  thrust::transform(
+    handle.get_thrust_policy(),
+    thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_first()),
+    thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_last()),
+    val_first,
+    [source] __device__(auto v) {
+      auto distance = invalid_distance;
+      if (v == source) { distance = weight_t{0.0}; }
+      return thrust::make_tuple(distance, invalid_vertex);
+    });
+
+  edge_src_property_t<graph_view_t, vertex_t> src_predecessor_cache(handle);
+  edge_src_property_t<graph_view_t, weight_t> src_distance_cache(handle);
+
+  edge_dst_property_t<graph_view_t, vertex_t> dst_predecessor_cache(handle);
+  edge_dst_property_t<graph_view_t, weight_t> dst_distance_cache(handle);
+
+  edge_dst_property_t<graph_view_t, vertex_t> dst_key_cache(handle);
+
+  rmm::device_uvector<vertex_t> local_vertices(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+  detail::sequence_fill(handle.get_stream(),
+                        local_vertices.begin(),
+                        local_vertices.size(),
+                        current_graph_view.local_vertex_partition_range_first());
+
+  // auto vertex_begin =
+  //   thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_first());
+  // auto vertex_end =
+  //   thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_last());
+
+  // thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, local_vertices.begin());
+
+  vertex_t itr_cnt = 0;
+  for (itr_cnt = 0; itr_cnt <= current_graph_view.number_of_vertices(); itr_cnt++) {
+    if constexpr (graph_view_t::is_multi_gpu) {
+      src_predecessor_cache =
+        edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
+      src_distance_cache = edge_src_property_t<graph_view_t, weight_t>(handle, current_graph_view);
+
+      dst_predecessor_cache =
+        edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
+      dst_distance_cache = edge_dst_property_t<graph_view_t, weight_t>(handle, current_graph_view);
+
+      dst_key_cache = edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
+
+      update_edge_src_property(handle, current_graph_view, predecessors, src_predecessor_cache);
+      update_edge_src_property(handle, current_graph_view, distances, src_distance_cache);
+
+      update_edge_dst_property(handle, current_graph_view, predecessors, dst_predecessor_cache);
+      update_edge_dst_property(handle, current_graph_view, distances, dst_distance_cache);
+
+      update_edge_dst_property(handle, current_graph_view, local_vertices.begin(), dst_key_cache);
+    }
+
+    auto src_input_property_values =
+      graph_view_t::is_multi_gpu
+        ? view_concat(src_predecessor_cache.view(), src_distance_cache.view())
+        : view_concat(detail::edge_major_property_view_t<vertex_t, vertex_t const*>(predecessors),
+                      detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances));
+
+    auto dst_input_property_values =
+      graph_view_t::is_multi_gpu
+        ? view_concat(dst_predecessor_cache.view(), dst_distance_cache.view())
+        : view_concat(
+            detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(predecessors,
+                                                                          vertex_t{0}),
+            detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances, weight_t{0}));
+
+    if (graph_view_t::is_multi_gpu) {
+      auto const comm_rank = handle.get_comms().get_rank();
+      auto const comm_size = handle.get_comms().get_size();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto local_vertices_title = std::string("local_vertices_").append(std::to_string(comm_rank));
+      raft::print_device_vector(
+        local_vertices_title.c_str(), local_vertices.begin(), local_vertices.size(), std::cout);
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto distances_title = std::string("distances_").append(std::to_string(comm_rank));
+      raft::print_device_vector(distances_title.c_str(),
+                                distances,
+                                current_graph_view.local_vertex_partition_range_size(),
+                                std::cout);
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto predecessors_title = std::string("predecessors_").append(std::to_string(comm_rank));
+      raft::print_device_vector(predecessors_title.c_str(),
+                                predecessors,
+                                current_graph_view.local_vertex_partition_range_size(),
+                                std::cout);
+    }
+
+    rmm::device_uvector<vertex_t> edge_reduced_dst_keys(0, handle.get_stream());
+    rmm::device_uvector<weight_t> minimum_weights(0, handle.get_stream());
+    rmm::device_uvector<vertex_t> closest_preds(0, handle.get_stream());
+
+    std::forward_as_tuple(edge_reduced_dst_keys, std::tie(minimum_weights, closest_preds)) =
+      cugraph::transform_reduce_e_by_dst_key(
+        handle,
+        current_graph_view,
+        src_input_property_values,
+        dst_input_property_values,
+        *edge_weight_view,
+        graph_view_t::is_multi_gpu ? dst_key_cache.view()
+                                   : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
+                                       local_vertices.begin(), vertex_t{0}),
+        [] __device__(auto src,
+                      auto dst,
+                      thrust::tuple<vertex_t, weight_t> src_pred_dist,
+                      thrust::tuple<vertex_t, weight_t> dst_pred_dist,
+                      auto wt) {
+          auto src_pred = thrust::get<0>(src_pred_dist);
+          auto src_dist = thrust::get<1>(src_pred_dist);
+
+          auto dst_pred = thrust::get<0>(dst_pred_dist);
+          auto dst_dist = thrust::get<1>(dst_pred_dist);
+
+          printf(
+            "src = %d dst = %d wt = %f src_pred = %d dst_pred = %d, src_dist = %f dst_dist = %f\n",
+            static_cast<int>(src),
+            static_cast<int>(dst),
+            static_cast<float>(wt),
+            static_cast<int>(src_pred),
+            static_cast<int>(dst_pred),
+            static_cast<float>(src_dist),
+            static_cast<float>(dst_dist));
+
+          auto relax = (src_dist < invalid_distance) &&
+                       ((dst_dist == invalid_distance) || (dst_dist > (src_dist + wt)));
+
+          return relax ? thrust::make_tuple(src_dist + wt, src)
+                       : thrust::make_tuple(invalid_distance, invalid_vertex);
+        },
+        thrust::make_tuple(invalid_distance, invalid_vertex),
+        reduce_op::minimum<thrust::tuple<weight_t, vertex_t>>{},
+        true);
+
+    if constexpr (graph_view_t::is_multi_gpu) {
+      auto vertex_partition_range_lasts = current_graph_view.vertex_partition_range_lasts();
+
+      rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(
+        vertex_partition_range_lasts.size(), handle.get_stream());
+
+      raft::update_device(d_vertex_partition_range_lasts.data(),
+                          vertex_partition_range_lasts.data(),
+                          vertex_partition_range_lasts.size(),
+                          handle.get_stream());
+
+      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
+      auto const major_comm_size = major_comm.get_size();
+      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
+      auto const minor_comm_size = minor_comm.get_size();
+
+      auto func = cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
+        raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
+                                          d_vertex_partition_range_lasts.size()),
+        major_comm_size,
+        minor_comm_size};
+
+      rmm::device_uvector<size_t> d_tx_value_counts(0, handle.get_stream());
+
+      auto triplet_first = thrust::make_zip_iterator(
+        edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin());
+
+      d_tx_value_counts = cugraph::groupby_and_count(
+        triplet_first,
+        triplet_first + edge_reduced_dst_keys.size(),
+        [func] __device__(auto val) { return func(thrust::get<0>(val)); },
+        handle.get_comms().get_size(),
+        std::numeric_limits<vertex_t>::max(),
+        handle.get_stream());
+
+      std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
+      raft::update_host(h_tx_value_counts.data(),
+                        d_tx_value_counts.data(),
+                        d_tx_value_counts.size(),
+                        handle.get_stream());
+      handle.sync_stream();
+
+      std::forward_as_tuple(std::tie(edge_reduced_dst_keys, minimum_weights, closest_preds),
+                            std::ignore) =
+        shuffle_values(
+          handle.get_comms(),
+          thrust::make_zip_iterator(
+            edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin()),
+          h_tx_value_counts,
+          handle.get_stream());
+    }
+
+    using flag_t                        = uint8_t;
+    rmm::device_uvector<flag_t> updated = rmm::device_uvector<flag_t>(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+
+    thrust::fill(handle.get_thrust_policy(), updated.begin(), updated.end(), flag_t{false});
+
+    thrust::for_each(
+      handle.get_thrust_policy(),
+      thrust::make_zip_iterator(thrust::make_tuple(
+        edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin())),
+      thrust::make_zip_iterator(thrust::make_tuple(
+        edge_reduced_dst_keys.end(), minimum_weights.end(), closest_preds.end())),
+      [distances,
+       predecessors,
+       updated = updated.begin(),
+       v_first = current_graph_view.local_vertex_partition_range_first(),
+       v_last =
+         current_graph_view.local_vertex_partition_range_last()] __device__(auto v_dist_pred) {
+        auto v = thrust::get<0>(v_dist_pred);
+        if ((v < v_first) || (v >= v_last)) {
+          printf("%d out of range [%d %d)\n",
+                 static_cast<int>(v),
+                 static_cast<int>(v_first),
+                 static_cast<int>(v_last));
+        }
+        auto dist     = thrust::get<1>(v_dist_pred);
+        auto pred     = thrust::get<2>(v_dist_pred);
+        auto v_offset = v - v_first;
+        if (distances[v_offset] < dist) {
+          updated[v_offset]      = flag_t{true};
+          distances[v_offset]    = dist;
+          predecessors[v_offset] = pred;
+        }
+      });
+
+    int nr_of_updated_vertices =
+      thrust::count(handle.get_thrust_policy(), updated.begin(), updated.end(), flag_t{true});
+
+    if constexpr (graph_view_t::is_multi_gpu) {
+      nr_of_updated_vertices = host_scalar_allreduce(
+        handle.get_comms(), nr_of_updated_vertices, raft::comms::op_t::SUM, handle.get_stream());
+    }
+
+    if (nr_of_updated_vertices == 0) { break; }
+
+    if (graph_view_t::is_multi_gpu) {
+      auto const comm_rank = handle.get_comms().get_rank();
+      auto const comm_size = handle.get_comms().get_size();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto edge_reduced_dst_keys_title =
+        std::string("edge_reduced_dst_keys_").append(std::to_string(comm_rank));
+      raft::print_device_vector(edge_reduced_dst_keys_title.c_str(),
+                                edge_reduced_dst_keys.begin(),
+                                edge_reduced_dst_keys.size(),
+                                std::cout);
+
+      auto minimum_weights_title =
+        std::string("minimum_weights_").append(std::to_string(comm_rank));
+      raft::print_device_vector(
+        minimum_weights_title.c_str(), minimum_weights.begin(), minimum_weights.size(), std::cout);
+
+      auto closest_preds_title = std::string("closest_preds_").append(std::to_string(comm_rank));
+      raft::print_device_vector(
+        closest_preds_title.c_str(), closest_preds.begin(), closest_preds.size(), std::cout);
+    }
+  }
+
+  if (itr_cnt == current_graph_view.local_vertex_partition_range_size()) {
+    std::cout << "Detected -ve cycle.\n";
+  }
+
+  ///
+
+  /*
+  vertex_t color_id = 0;
+  while (true) {
+    using flag_t                                 = uint8_t;
+    rmm::device_uvector<flag_t> is_vertex_in_mis = rmm::device_uvector<flag_t>(
+      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::fill(handle.get_thrust_policy(), is_vertex_in_mis.begin(), is_vertex_in_mis.end(), 0);
+
+    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
+
+    cugraph::edge_src_property_t<graph_view_t, flag_t> src_mis_flags(handle, current_graph_view);
+    cugraph::edge_dst_property_t<graph_view_t, flag_t> dst_mis_flags(handle, current_graph_view);
+
+    cugraph::update_edge_src_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags);
+
+    cugraph::update_edge_dst_property(
+      handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags);
+
+    if (color_id % 2 == 0) {
+      cugraph::transform_e(
+        handle,
+        current_graph_view,
+        src_mis_flags.view(),
+        dst_mis_flags.view(),
+        cugraph::edge_dummy_property_t{}.view(),
+        [color_id] __device__(
+          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
+          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
+        },
+        edge_masks_odd.mutable_view());
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
+      current_graph_view.attach_edge_mask(edge_masks_odd.view());
+    } else {
+      cugraph::transform_e(
+        handle,
+        current_graph_view,
+        src_mis_flags.view(),
+        dst_mis_flags.view(),
+        cugraph::edge_dummy_property_t{}.view(),
+        [color_id] __device__(
+          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
+          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
+        },
+        edge_masks_even.mutable_view());
+
+      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
+      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
+      current_graph_view.attach_edge_mask(edge_masks_even.view());
+    }
+
+    color_id++;
+  }
+  */
+}
+}  // namespace detail
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+void bellman_ford(raft::handle_t const& handle,
+                  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+                  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                  vertex_t source,
+                  vertex_t* predecessors,
+                  weight_t* distances)
+{
+  detail::bellman_ford(handle, graph_view, edge_weight_view, source, predecessors, distances);
+}
+
+}  // namespace cugraph
diff --git a/cpp/src/traversal/bellman_ford_mg.cu b/cpp/src/traversal/bellman_ford_mg.cu
new file mode 100644
index 00000000000..328591728f4
--- /dev/null
+++ b/cpp/src/traversal/bellman_ford_mg.cu
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "bellman_ford_impl.cuh"
+
+namespace cugraph {
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  double* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  int64_t source,
+  int64_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  double* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  int64_t source,
+  int64_t* predecessors,
+  double* distances);
+
+}  // namespace cugraph
diff --git a/cpp/src/traversal/bellman_ford_sg.cu b/cpp/src/traversal/bellman_ford_sg.cu
new file mode 100644
index 00000000000..6a438913fd8
--- /dev/null
+++ b/cpp/src/traversal/bellman_ford_sg.cu
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "bellman_ford_impl.cuh"
+
+namespace cugraph {
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  double* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
+  int64_t source,
+  int64_t* predecessors,
+  float* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  int32_t source,
+  int32_t* predecessors,
+  double* distances);
+
+template void bellman_ford(
+  raft::handle_t const& handle,
+  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
+  int64_t source,
+  int64_t* predecessors,
+  double* distances);
+
+}  // namespace cugraph

From ae361779d5b351e60c5ba301d0ffd236a9ada0f9 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Tue, 26 Mar 2024 00:32:07 +0100
Subject: [PATCH 2/4] bellman_ford to detect negative cycle

---
 cpp/src/traversal/bellman_ford_impl.cuh | 73 +++++++++++++++----------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/cpp/src/traversal/bellman_ford_impl.cuh b/cpp/src/traversal/bellman_ford_impl.cuh
index c7bf56b208f..ec144aa5011 100644
--- a/cpp/src/traversal/bellman_ford_impl.cuh
+++ b/cpp/src/traversal/bellman_ford_impl.cuh
@@ -104,8 +104,9 @@ void bellman_ford(raft::handle_t const& handle,
 
   // thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, local_vertices.begin());
 
-  vertex_t itr_cnt = 0;
-  for (itr_cnt = 0; itr_cnt <= current_graph_view.number_of_vertices(); itr_cnt++) {
+  vertex_t itr_cnt           = 0;
+  int nr_of_updated_vertices = 0;
+  while (itr_cnt < current_graph_view.number_of_vertices()) {
     if constexpr (graph_view_t::is_multi_gpu) {
       src_predecessor_cache =
         edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
@@ -199,8 +200,7 @@ void bellman_ford(raft::handle_t const& handle,
             static_cast<float>(src_dist),
             static_cast<float>(dst_dist));
 
-          auto relax = (src_dist < invalid_distance) &&
-                       ((dst_dist == invalid_distance) || (dst_dist > (src_dist + wt)));
+          auto relax = (src_dist < invalid_distance) && (dst_dist > (src_dist + wt));
 
           return relax ? thrust::make_tuple(src_dist + wt, src)
                        : thrust::make_tuple(invalid_distance, invalid_vertex);
@@ -261,6 +261,28 @@ void bellman_ford(raft::handle_t const& handle,
           handle.get_stream());
     }
 
+    if (graph_view_t::is_multi_gpu) {
+      auto const comm_rank = handle.get_comms().get_rank();
+      auto const comm_size = handle.get_comms().get_size();
+
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto edge_reduced_dst_keys_title =
+        std::string("edge_reduced_dst_keys_").append(std::to_string(comm_rank));
+      raft::print_device_vector(edge_reduced_dst_keys_title.c_str(),
+                                edge_reduced_dst_keys.begin(),
+                                edge_reduced_dst_keys.size(),
+                                std::cout);
+
+      auto minimum_weights_title =
+        std::string("minimum_weights_").append(std::to_string(comm_rank));
+      raft::print_device_vector(
+        minimum_weights_title.c_str(), minimum_weights.begin(), minimum_weights.size(), std::cout);
+
+      auto closest_preds_title = std::string("closest_preds_").append(std::to_string(comm_rank));
+      raft::print_device_vector(
+        closest_preds_title.c_str(), closest_preds.begin(), closest_preds.size(), std::cout);
+    }
+
     using flag_t                        = uint8_t;
     rmm::device_uvector<flag_t> updated = rmm::device_uvector<flag_t>(
       current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
@@ -281,22 +303,29 @@ void bellman_ford(raft::handle_t const& handle,
          current_graph_view.local_vertex_partition_range_last()] __device__(auto v_dist_pred) {
         auto v = thrust::get<0>(v_dist_pred);
         if ((v < v_first) || (v >= v_last)) {
-          printf("%d out of range [%d %d)\n",
+          printf("%d > > > > > out of range [%d %d)\n",
                  static_cast<int>(v),
                  static_cast<int>(v_first),
                  static_cast<int>(v_last));
         }
+
         auto dist     = thrust::get<1>(v_dist_pred);
         auto pred     = thrust::get<2>(v_dist_pred);
         auto v_offset = v - v_first;
-        if (distances[v_offset] < dist) {
+
+        printf(" vertex %d : [pred=%d dist = %f)\n",
+               static_cast<int>(v),
+               static_cast<int>(pred),
+               static_cast<float>(dist));
+
+        if (pred != invalid_vertex) {
           updated[v_offset]      = flag_t{true};
           distances[v_offset]    = dist;
           predecessors[v_offset] = pred;
         }
       });
 
-    int nr_of_updated_vertices =
+    nr_of_updated_vertices =
       thrust::count(handle.get_thrust_policy(), updated.begin(), updated.end(), flag_t{true});
 
     if constexpr (graph_view_t::is_multi_gpu) {
@@ -304,32 +333,18 @@ void bellman_ford(raft::handle_t const& handle,
         handle.get_comms(), nr_of_updated_vertices, raft::comms::op_t::SUM, handle.get_stream());
     }
 
-    if (nr_of_updated_vertices == 0) { break; }
-
-    if (graph_view_t::is_multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
+    itr_cnt++;
+    std::cout << "itr_cnt: " << itr_cnt << std::endl;
 
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto edge_reduced_dst_keys_title =
-        std::string("edge_reduced_dst_keys_").append(std::to_string(comm_rank));
-      raft::print_device_vector(edge_reduced_dst_keys_title.c_str(),
-                                edge_reduced_dst_keys.begin(),
-                                edge_reduced_dst_keys.size(),
-                                std::cout);
-
-      auto minimum_weights_title =
-        std::string("minimum_weights_").append(std::to_string(comm_rank));
-      raft::print_device_vector(
-        minimum_weights_title.c_str(), minimum_weights.begin(), minimum_weights.size(), std::cout);
-
-      auto closest_preds_title = std::string("closest_preds_").append(std::to_string(comm_rank));
-      raft::print_device_vector(
-        closest_preds_title.c_str(), closest_preds.begin(), closest_preds.size(), std::cout);
+    if (nr_of_updated_vertices == 0) {
+      std::cout << "No more updates\n";
+      break;
     }
   }
 
-  if (itr_cnt == current_graph_view.local_vertex_partition_range_size()) {
+  std::cout << "itr_cnt (out of loop) : " << itr_cnt << std::endl;
+
+  if ((itr_cnt == current_graph_view.number_of_vertices()) && (nr_of_updated_vertices > 0)) {
     std::cout << "Detected -ve cycle.\n";
   }
 

From 5691949a9332115761e0a42c6034c479034905e6 Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Sun, 31 Mar 2024 00:17:06 +0100
Subject: [PATCH 3/4] bellman_ford to detect negative cycle with queue

---
 cpp/src/traversal/bellman_ford_impl.cuh | 540 ++++++++++++------------
 1 file changed, 275 insertions(+), 265 deletions(-)

diff --git a/cpp/src/traversal/bellman_ford_impl.cuh b/cpp/src/traversal/bellman_ford_impl.cuh
index ec144aa5011..7edb29b27e9 100644
--- a/cpp/src/traversal/bellman_ford_impl.cuh
+++ b/cpp/src/traversal/bellman_ford_impl.cuh
@@ -19,7 +19,10 @@
 #include "prims/reduce_op.cuh"
 #include "prims/transform_e.cuh"
 #include "prims/transform_reduce_e_by_src_dst_key.cuh"
+#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh"
 #include "prims/update_edge_src_dst_property.cuh"
+#include "prims/update_v_frontier.cuh"
+#include "prims/vertex_frontier.cuh"
 
 #include <cugraph/algorithms.hpp>
 #include <cugraph/detail/shuffle_wrappers.hpp>
@@ -64,6 +67,8 @@ void bellman_ford(raft::handle_t const& handle,
 
   current_graph_view.attach_edge_mask(edge_masks_even.view());
 
+  bool debug_flag = current_graph_view.number_of_vertices() <= 7;
+
   // 2. initialize distances and predecessors
 
   auto constexpr invalid_distance = std::numeric_limits<weight_t>::max();
@@ -81,13 +86,30 @@ void bellman_ford(raft::handle_t const& handle,
       return thrust::make_tuple(distance, invalid_vertex);
     });
 
-  edge_src_property_t<graph_view_t, vertex_t> src_predecessor_cache(handle);
-  edge_src_property_t<graph_view_t, weight_t> src_distance_cache(handle);
+  // auto src_predecessor_cache =
+  //   graph_view_t::is_multi_gpu
+  //     ? edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view)
+  //     : edge_src_property_t<graph_view_t, vertex_t>(handle);
+
+  auto src_distance_cache =
+    graph_view_t::is_multi_gpu
+      ? edge_src_property_t<graph_view_t, weight_t>(handle, current_graph_view)
+      : edge_src_property_t<graph_view_t, weight_t>(handle);
+
+  // auto dst_predecessor_cache =
+  //   graph_view_t::is_multi_gpu
+  //     ? edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view)
+  //     : edge_dst_property_t<graph_view_t, vertex_t>(handle);
 
-  edge_dst_property_t<graph_view_t, vertex_t> dst_predecessor_cache(handle);
-  edge_dst_property_t<graph_view_t, weight_t> dst_distance_cache(handle);
+  // auto dst_distance_cache =
+  //   graph_view_t::is_multi_gpu
+  //     ? edge_dst_property_t<graph_view_t, weight_t>(handle, current_graph_view)
+  //     : edge_dst_property_t<graph_view_t, weight_t>(handle);
 
-  edge_dst_property_t<graph_view_t, vertex_t> dst_key_cache(handle);
+  // auto dst_key_cache = graph_view_t::is_multi_gpu
+  //                        ? edge_dst_property_t<graph_view_t, vertex_t>(handle,
+  //                        current_graph_view) : edge_dst_property_t<graph_view_t,
+  //                        vertex_t>(handle);
 
   rmm::device_uvector<vertex_t> local_vertices(
     current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
@@ -97,168 +119,181 @@ void bellman_ford(raft::handle_t const& handle,
                         local_vertices.size(),
                         current_graph_view.local_vertex_partition_range_first());
 
-  // auto vertex_begin =
-  //   thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_first());
-  // auto vertex_end =
-  //   thrust::make_counting_iterator(current_graph_view.local_vertex_partition_range_last());
+  constexpr size_t bucket_idx_curr = 0;
+  constexpr size_t bucket_idx_next = 1;
+  constexpr size_t num_buckets     = 2;
 
-  // thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, local_vertices.begin());
+  vertex_frontier_t<vertex_t, void, graph_view_t::is_multi_gpu, true> vertex_frontier(handle,
+                                                                                      num_buckets);
 
-  vertex_t itr_cnt           = 0;
-  int nr_of_updated_vertices = 0;
-  while (itr_cnt < current_graph_view.number_of_vertices()) {
-    if constexpr (graph_view_t::is_multi_gpu) {
-      src_predecessor_cache =
-        edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
-      src_distance_cache = edge_src_property_t<graph_view_t, weight_t>(handle, current_graph_view);
-
-      dst_predecessor_cache =
-        edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
-      dst_distance_cache = edge_dst_property_t<graph_view_t, weight_t>(handle, current_graph_view);
+  if (current_graph_view.in_local_vertex_partition_range_nocheck(source)) {
+    vertex_frontier.bucket(bucket_idx_curr).insert(source);
+  }
 
-      dst_key_cache = edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view);
+  rmm::device_uvector<vertex_t> enqueue_counter(
+    current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
 
-      update_edge_src_property(handle, current_graph_view, predecessors, src_predecessor_cache);
-      update_edge_src_property(handle, current_graph_view, distances, src_distance_cache);
+  thrust::fill(
+    handle.get_thrust_policy(), enqueue_counter.begin(), enqueue_counter.end(), vertex_t{0});
 
-      update_edge_dst_property(handle, current_graph_view, predecessors, dst_predecessor_cache);
-      update_edge_dst_property(handle, current_graph_view, distances, dst_distance_cache);
+  vertex_t itr_cnt           = 0;
+  int nr_of_updated_vertices = 0;
 
-      update_edge_dst_property(handle, current_graph_view, local_vertices.begin(), dst_key_cache);
+  vertex_t nr_vertices_n_plus_times = 0;
+  while (true) {
+    if constexpr (graph_view_t::is_multi_gpu) {
+      // update_edge_src_property(handle,
+      //                          current_graph_view,
+      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
+      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
+      //                          predecessors,
+      //                          src_predecessor_cache);
+      update_edge_src_property(handle,
+                               current_graph_view,
+                               vertex_frontier.bucket(bucket_idx_curr).begin(),
+                               vertex_frontier.bucket(bucket_idx_curr).end(),
+                               distances,
+                               src_distance_cache);
+
+      // update_edge_dst_property(handle,
+      //                          current_graph_view,
+      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
+      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
+      //                          predecessors,
+      //                          dst_predecessor_cache);
+      // update_edge_dst_property(handle,
+      //                          current_graph_view,
+      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
+      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
+      //                          distances,
+      //                          dst_distance_cache);
+
+      // update_edge_dst_property(handle,
+      //                          current_graph_view,
+      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
+      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
+      //                          local_vertices.begin(),
+      //                          dst_key_cache);
     }
 
+    /*
     auto src_input_property_values =
       graph_view_t::is_multi_gpu
-        ? view_concat(src_predecessor_cache.view(), src_distance_cache.view())
-        : view_concat(detail::edge_major_property_view_t<vertex_t, vertex_t const*>(predecessors),
-                      detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances));
-
-    auto dst_input_property_values =
-      graph_view_t::is_multi_gpu
-        ? view_concat(dst_predecessor_cache.view(), dst_distance_cache.view())
-        : view_concat(
-            detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(predecessors,
-                                                                          vertex_t{0}),
-            detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances, weight_t{0}));
+        // ? view_concat(src_predecessor_cache.view(), src_distance_cache.view())
+        ? src_distance_cache.view()
+        // : view_concat(detail::edge_major_property_view_t<vertex_t, vertex_t
+        // const*>(predecessors),
+        //               detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances));
+        : detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances);
+
+
+    auto dst_input_property_values = graph_view_t::is_multi_gpu
+      // ? view_concat(dst_predecessor_cache.view(), dst_distance_cache.view())
+      ? dst_distance_cache.view()
+        // : view_concat(
+        //     detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(predecessors,
+        //                                                                   vertex_t{0}),
+        //     detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances,
+        //     weight_t{0}));
+
+        //     : view_concat(
+        detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances, weight_t{0});
+      */
+    auto [new_frontier_vertex_buffer, distance_predecessor_buffer] =
+      transform_reduce_v_frontier_outgoing_e_by_dst(
+        handle,
+        current_graph_view,
+        vertex_frontier.bucket(bucket_idx_curr),
+        graph_view_t::is_multi_gpu
+          ? src_distance_cache.view()
+          : detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances),
+        edge_dst_dummy_property_t{}.view(),
+        *edge_weight_view,
+        [debug_flag,
+         distances,
+         v_first = current_graph_view.local_vertex_partition_range_first(),
+         v_last  = current_graph_view
+                    .local_vertex_partition_range_last()] __device__(auto src,
+                                                                     auto dst,
+                                                                     auto src_dist,
+                                                                     thrust::nullopt_t,
+                                                                     // auto dst_dist,
+                                                                     // thrust::tuple<vertex_t,
+                                                                     // weight_t> src_pred_dist,
+                                                                     // thrust::tuple<vertex_t,
+                                                                     // weight_t> dst_pred_dist,
+                                                                     auto wt) {
+          if (dst < v_first || dst >= v_last) {
+            printf("\n ****** dst = %d is not in this VP \n", static_cast<int>(dst));
+          }
+          // auto src_pred = thrust::get<0>(src_pred_dist);
+          // auto src_dist = thrust::get<1>(src_pred_dist);
+
+          // auto dst_pred = thrust::get<0>(dst_pred_dist);
+          // auto dst_dist = thrust::get<1>(dst_pred_dist);
+
+          auto dst_dist = distances[dst - v_first];
+
+          /*src_pred = %d dst_pred = %d,*/
+          if (debug_flag)
+            printf("src = %d dst = %d wt = %f  src_dist = %f dst_dist = %f\n",
+                   static_cast<int>(src),
+                   static_cast<int>(dst),
+                   static_cast<float>(wt),
+                   // static_cast<int>(src_pred),
+                   // static_cast<int>(dst_pred),
+                   static_cast<float>(src_dist),
+                   static_cast<float>(dst_dist));
+
+          auto relax = (dst_dist > (src_dist + wt));
+
+          return relax ? thrust::optional<thrust::tuple<weight_t, vertex_t>>{thrust::make_tuple(
+                           src_dist + wt, src)}
+                       : thrust::nullopt;
+        },
+        reduce_op::minimum<thrust::tuple<weight_t, vertex_t>>(),
+        true);
 
     if (graph_view_t::is_multi_gpu) {
       auto const comm_rank = handle.get_comms().get_rank();
       auto const comm_size = handle.get_comms().get_size();
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto local_vertices_title = std::string("local_vertices_").append(std::to_string(comm_rank));
-      raft::print_device_vector(
-        local_vertices_title.c_str(), local_vertices.begin(), local_vertices.size(), std::cout);
+      auto new_frontier_vertex_buffer_title = std::string("nvb_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(new_frontier_vertex_buffer_title.c_str(),
+                                  new_frontier_vertex_buffer.begin(),
+                                  new_frontier_vertex_buffer.size(),
+                                  std::cout);
+
+      auto key_buffer = thrust::get<0>(
+        cugraph::get_dataframe_buffer_cbegin(distance_predecessor_buffer).get_iterator_tuple());
+      auto value_buffer = thrust::get<1>(
+        cugraph::get_dataframe_buffer_cbegin(distance_predecessor_buffer).get_iterator_tuple());
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto distances_title = std::string("distances_").append(std::to_string(comm_rank));
-      raft::print_device_vector(distances_title.c_str(),
-                                distances,
-                                current_graph_view.local_vertex_partition_range_size(),
-                                std::cout);
+      auto key_buffer_title = std::string("key_buffer_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(
+          key_buffer_title.c_str(), key_buffer, new_frontier_vertex_buffer.size(), std::cout);
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto predecessors_title = std::string("predecessors_").append(std::to_string(comm_rank));
-      raft::print_device_vector(predecessors_title.c_str(),
-                                predecessors,
-                                current_graph_view.local_vertex_partition_range_size(),
-                                std::cout);
+      auto value_buffer_title = std::string("value_buffer_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(
+          value_buffer_title.c_str(), value_buffer, new_frontier_vertex_buffer.size(), std::cout);
     }
 
-    rmm::device_uvector<vertex_t> edge_reduced_dst_keys(0, handle.get_stream());
-    rmm::device_uvector<weight_t> minimum_weights(0, handle.get_stream());
-    rmm::device_uvector<vertex_t> closest_preds(0, handle.get_stream());
+    nr_of_updated_vertices = new_frontier_vertex_buffer.size();
 
-    std::forward_as_tuple(edge_reduced_dst_keys, std::tie(minimum_weights, closest_preds)) =
-      cugraph::transform_reduce_e_by_dst_key(
-        handle,
-        current_graph_view,
-        src_input_property_values,
-        dst_input_property_values,
-        *edge_weight_view,
-        graph_view_t::is_multi_gpu ? dst_key_cache.view()
-                                   : detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(
-                                       local_vertices.begin(), vertex_t{0}),
-        [] __device__(auto src,
-                      auto dst,
-                      thrust::tuple<vertex_t, weight_t> src_pred_dist,
-                      thrust::tuple<vertex_t, weight_t> dst_pred_dist,
-                      auto wt) {
-          auto src_pred = thrust::get<0>(src_pred_dist);
-          auto src_dist = thrust::get<1>(src_pred_dist);
-
-          auto dst_pred = thrust::get<0>(dst_pred_dist);
-          auto dst_dist = thrust::get<1>(dst_pred_dist);
-
-          printf(
-            "src = %d dst = %d wt = %f src_pred = %d dst_pred = %d, src_dist = %f dst_dist = %f\n",
-            static_cast<int>(src),
-            static_cast<int>(dst),
-            static_cast<float>(wt),
-            static_cast<int>(src_pred),
-            static_cast<int>(dst_pred),
-            static_cast<float>(src_dist),
-            static_cast<float>(dst_dist));
-
-          auto relax = (src_dist < invalid_distance) && (dst_dist > (src_dist + wt));
-
-          return relax ? thrust::make_tuple(src_dist + wt, src)
-                       : thrust::make_tuple(invalid_distance, invalid_vertex);
-        },
-        thrust::make_tuple(invalid_distance, invalid_vertex),
-        reduce_op::minimum<thrust::tuple<weight_t, vertex_t>>{},
-        true);
+    if (graph_view_t::is_multi_gpu) {
+      nr_of_updated_vertices = host_scalar_allreduce(
+        handle.get_comms(), nr_of_updated_vertices, raft::comms::op_t::SUM, handle.get_stream());
+    }
 
-    if constexpr (graph_view_t::is_multi_gpu) {
-      auto vertex_partition_range_lasts = current_graph_view.vertex_partition_range_lasts();
-
-      rmm::device_uvector<vertex_t> d_vertex_partition_range_lasts(
-        vertex_partition_range_lasts.size(), handle.get_stream());
-
-      raft::update_device(d_vertex_partition_range_lasts.data(),
-                          vertex_partition_range_lasts.data(),
-                          vertex_partition_range_lasts.size(),
-                          handle.get_stream());
-
-      auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
-      auto const major_comm_size = major_comm.get_size();
-      auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
-      auto const minor_comm_size = minor_comm.get_size();
-
-      auto func = cugraph::detail::compute_gpu_id_from_int_vertex_t<vertex_t>{
-        raft::device_span<vertex_t const>(d_vertex_partition_range_lasts.data(),
-                                          d_vertex_partition_range_lasts.size()),
-        major_comm_size,
-        minor_comm_size};
-
-      rmm::device_uvector<size_t> d_tx_value_counts(0, handle.get_stream());
-
-      auto triplet_first = thrust::make_zip_iterator(
-        edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin());
-
-      d_tx_value_counts = cugraph::groupby_and_count(
-        triplet_first,
-        triplet_first + edge_reduced_dst_keys.size(),
-        [func] __device__(auto val) { return func(thrust::get<0>(val)); },
-        handle.get_comms().get_size(),
-        std::numeric_limits<vertex_t>::max(),
-        handle.get_stream());
-
-      std::vector<size_t> h_tx_value_counts(d_tx_value_counts.size());
-      raft::update_host(h_tx_value_counts.data(),
-                        d_tx_value_counts.data(),
-                        d_tx_value_counts.size(),
-                        handle.get_stream());
-      handle.sync_stream();
-
-      std::forward_as_tuple(std::tie(edge_reduced_dst_keys, minimum_weights, closest_preds),
-                            std::ignore) =
-        shuffle_values(
-          handle.get_comms(),
-          thrust::make_zip_iterator(
-            edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin()),
-          h_tx_value_counts,
-          handle.get_stream());
+    if (nr_of_updated_vertices == 0) {
+      std::cout << "no update break\n";
+      break;
     }
 
     if (graph_view_t::is_multi_gpu) {
@@ -266,146 +301,120 @@ void bellman_ford(raft::handle_t const& handle,
       auto const comm_size = handle.get_comms().get_size();
 
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto edge_reduced_dst_keys_title =
-        std::string("edge_reduced_dst_keys_").append(std::to_string(comm_rank));
-      raft::print_device_vector(edge_reduced_dst_keys_title.c_str(),
-                                edge_reduced_dst_keys.begin(),
-                                edge_reduced_dst_keys.size(),
-                                std::cout);
-
-      auto minimum_weights_title =
-        std::string("minimum_weights_").append(std::to_string(comm_rank));
-      raft::print_device_vector(
-        minimum_weights_title.c_str(), minimum_weights.begin(), minimum_weights.size(), std::cout);
-
-      auto closest_preds_title = std::string("closest_preds_").append(std::to_string(comm_rank));
-      raft::print_device_vector(
-        closest_preds_title.c_str(), closest_preds.begin(), closest_preds.size(), std::cout);
+      auto enqueue_counter_title = std::string("qc_b_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(enqueue_counter_title.c_str(),
+                                  enqueue_counter.begin(),
+                                  enqueue_counter.size(),
+                                  std::cout);
     }
 
-    using flag_t                        = uint8_t;
-    rmm::device_uvector<flag_t> updated = rmm::device_uvector<flag_t>(
-      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
+    thrust::for_each(handle.get_thrust_policy(),
+                     new_frontier_vertex_buffer.begin(),
+                     new_frontier_vertex_buffer.end(),
+                     [v_first         = current_graph_view.local_vertex_partition_range_first(),
+                      v_last          = current_graph_view.local_vertex_partition_range_last(),
+                      enqueue_counter = enqueue_counter.begin()] __device__(vertex_t v) {
+                       if (v < v_first || v >= v_last) {
+                         printf("\n enque conter: *** v = %d is not in this VP \n",
+                                static_cast<int>(v));
+                       }
+                       enqueue_counter[v - v_first] += 1;
+                     });
+
+    if (graph_view_t::is_multi_gpu) {
+      auto const comm_rank = handle.get_comms().get_rank();
+      auto const comm_size = handle.get_comms().get_size();
 
-    thrust::fill(handle.get_thrust_policy(), updated.begin(), updated.end(), flag_t{false});
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto enqueue_counter_title = std::string("qc_a_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(enqueue_counter_title.c_str(),
+                                  enqueue_counter.begin(),
+                                  enqueue_counter.size(),
+                                  std::cout);
+    }
 
-    thrust::for_each(
+    nr_vertices_n_plus_times = thrust::count_if(
       handle.get_thrust_policy(),
-      thrust::make_zip_iterator(thrust::make_tuple(
-        edge_reduced_dst_keys.begin(), minimum_weights.begin(), closest_preds.begin())),
-      thrust::make_zip_iterator(thrust::make_tuple(
-        edge_reduced_dst_keys.end(), minimum_weights.end(), closest_preds.end())),
-      [distances,
-       predecessors,
-       updated = updated.begin(),
-       v_first = current_graph_view.local_vertex_partition_range_first(),
-       v_last =
-         current_graph_view.local_vertex_partition_range_last()] __device__(auto v_dist_pred) {
-        auto v = thrust::get<0>(v_dist_pred);
-        if ((v < v_first) || (v >= v_last)) {
-          printf("%d > > > > > out of range [%d %d)\n",
-                 static_cast<int>(v),
-                 static_cast<int>(v_first),
-                 static_cast<int>(v_last));
-        }
-
-        auto dist     = thrust::get<1>(v_dist_pred);
-        auto pred     = thrust::get<2>(v_dist_pred);
-        auto v_offset = v - v_first;
-
-        printf(" vertex %d : [pred=%d dist = %f)\n",
-               static_cast<int>(v),
-               static_cast<int>(pred),
-               static_cast<float>(dist));
-
-        if (pred != invalid_vertex) {
-          updated[v_offset]      = flag_t{true};
-          distances[v_offset]    = dist;
-          predecessors[v_offset] = pred;
-        }
-      });
-
-    nr_of_updated_vertices =
-      thrust::count(handle.get_thrust_policy(), updated.begin(), updated.end(), flag_t{true});
+      enqueue_counter.begin(),
+      enqueue_counter.end(),
+      [n = current_graph_view.number_of_vertices()] __device__(auto flag) { return flag >= n; });
 
-    if constexpr (graph_view_t::is_multi_gpu) {
-      nr_of_updated_vertices = host_scalar_allreduce(
-        handle.get_comms(), nr_of_updated_vertices, raft::comms::op_t::SUM, handle.get_stream());
+    if (graph_view_t::is_multi_gpu) {
+      nr_vertices_n_plus_times = host_scalar_allreduce(
+        handle.get_comms(), nr_vertices_n_plus_times, raft::comms::op_t::SUM, handle.get_stream());
     }
 
-    itr_cnt++;
-    std::cout << "itr_cnt: " << itr_cnt << std::endl;
-
-    if (nr_of_updated_vertices == 0) {
-      std::cout << "No more updates\n";
+    if (nr_vertices_n_plus_times > 0) {
+      std::cout << "enque n+ break\n";
       break;
     }
-  }
 
-  std::cout << "itr_cnt (out of loop) : " << itr_cnt << std::endl;
-
-  if ((itr_cnt == current_graph_view.number_of_vertices()) && (nr_of_updated_vertices > 0)) {
-    std::cout << "Detected -ve cycle.\n";
-  }
-
-  ///
-
-  /*
-  vertex_t color_id = 0;
-  while (true) {
-    using flag_t                                 = uint8_t;
-    rmm::device_uvector<flag_t> is_vertex_in_mis = rmm::device_uvector<flag_t>(
-      current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(), is_vertex_in_mis.begin(), is_vertex_in_mis.end(), 0);
-
-    if (current_graph_view.compute_number_of_edges(handle) == 0) { break; }
-
-    cugraph::edge_src_property_t<graph_view_t, flag_t> src_mis_flags(handle, current_graph_view);
-    cugraph::edge_dst_property_t<graph_view_t, flag_t> dst_mis_flags(handle, current_graph_view);
-
-    cugraph::update_edge_src_property(
-      handle, current_graph_view, is_vertex_in_mis.begin(), src_mis_flags);
+    update_v_frontier(handle,
+                      current_graph_view,
+                      std::move(new_frontier_vertex_buffer),
+                      std::move(distance_predecessor_buffer),
+                      vertex_frontier,
+                      std::vector<size_t>{bucket_idx_next},
+                      distances,
+                      thrust::make_zip_iterator(thrust::make_tuple(distances, predecessors)),
+                      [] __device__(auto v, auto v_val, auto pushed_val) {
+                        auto new_dist = thrust::get<0>(pushed_val);
+                        auto update   = (new_dist < v_val);
+                        return thrust::make_tuple(
+                          update ? thrust::optional<size_t>{bucket_idx_next} : thrust::nullopt,
+                          update ? thrust::optional<thrust::tuple<weight_t, vertex_t>>{pushed_val}
+                                 : thrust::nullopt);
+                      });
+
+    vertex_frontier.bucket(bucket_idx_curr).clear();
+    vertex_frontier.bucket(bucket_idx_curr).shrink_to_fit();
+
+    if (vertex_frontier.bucket(bucket_idx_next).aggregate_size() > 0) {
+      vertex_frontier.swap_buckets(bucket_idx_curr, bucket_idx_next);
+    } else {
+      std::cout << "swap break\n";
+      break;
+    }
 
-    cugraph::update_edge_dst_property(
-      handle, current_graph_view, is_vertex_in_mis.begin(), dst_mis_flags);
+    if (graph_view_t::is_multi_gpu) {
+      auto const comm_rank = handle.get_comms().get_rank();
+      auto const comm_size = handle.get_comms().get_size();
 
-    if (color_id % 2 == 0) {
-      cugraph::transform_e(
-        handle,
-        current_graph_view,
-        src_mis_flags.view(),
-        dst_mis_flags.view(),
-        cugraph::edge_dummy_property_t{}.view(),
-        [color_id] __device__(
-          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
-          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
-        },
-        edge_masks_odd.mutable_view());
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto local_vertices_title = std::string("local_vertices_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(
+          local_vertices_title.c_str(), local_vertices.begin(), local_vertices.size(), std::cout);
 
-      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
-      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_even);
-      current_graph_view.attach_edge_mask(edge_masks_odd.view());
-    } else {
-      cugraph::transform_e(
-        handle,
-        current_graph_view,
-        src_mis_flags.view(),
-        dst_mis_flags.view(),
-        cugraph::edge_dummy_property_t{}.view(),
-        [color_id] __device__(
-          auto src, auto dst, auto is_src_in_mis, auto is_dst_in_mis, thrust::nullopt_t) {
-          return !((is_src_in_mis == uint8_t{true}) || (is_dst_in_mis == uint8_t{true}));
-        },
-        edge_masks_even.mutable_view());
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto distances_title = std::string("distances_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(distances_title.c_str(),
+                                  distances,
+                                  current_graph_view.local_vertex_partition_range_size(),
+                                  std::cout);
 
-      if (current_graph_view.has_edge_mask()) current_graph_view.clear_edge_mask();
-      cugraph::fill_edge_property(handle, current_graph_view, bool{false}, edge_masks_odd);
-      current_graph_view.attach_edge_mask(edge_masks_even.view());
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());
+      auto predecessors_title = std::string("predecessors_").append(std::to_string(comm_rank));
+      if (debug_flag)
+        raft::print_device_vector(predecessors_title.c_str(),
+                                  predecessors,
+                                  current_graph_view.local_vertex_partition_range_size(),
+                                  std::cout);
     }
 
-    color_id++;
+    itr_cnt++;
+    std::cout << "itr_cnt: " << itr_cnt << std::endl;
   }
-  */
+
+  std::cout << "itr_cnt (out of loop) : " << itr_cnt << std::endl;
+
+  if (nr_vertices_n_plus_times > 0) { std::cout << "Found -ve cycle " << std::endl; }
+  // if ((itr_cnt == current_graph_view.number_of_vertices()) && (nr_of_updated_vertices > 0)) {
+  //   std::cout << "Detected -ve cycle.\n";
+  // }
 }
 }  // namespace detail
 
@@ -418,6 +427,7 @@ void bellman_ford(raft::handle_t const& handle,
                   weight_t* distances)
 {
   detail::bellman_ford(handle, graph_view, edge_weight_view, source, predecessors, distances);
+  std::cout << " returning from cugraph::bellman\n";
 }
 
 }  // namespace cugraph

From 9182c872c193094622a894e89900d69b0f895c2d Mon Sep 17 00:00:00 2001
From: Naim <naim@uib.no>
Date: Mon, 1 Apr 2024 01:04:21 +0200
Subject: [PATCH 4/4] Compute shortest paths from a source vertex using
 Bellman-Ford algorithm

---
 cpp/include/cugraph/algorithms.hpp      |  36 ++++
 cpp/src/traversal/bellman_ford_impl.cuh | 275 ++++--------------------
 cpp/src/traversal/bellman_ford_mg.cu    |  88 ++++----
 cpp/src/traversal/bellman_ford_sg.cu    |  88 ++++----
 4 files changed, 156 insertions(+), 331 deletions(-)

diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp
index 1471d340cec..655c05d46ad 100644
--- a/cpp/include/cugraph/algorithms.hpp
+++ b/cpp/include/cugraph/algorithms.hpp
@@ -1205,6 +1205,42 @@ void sssp(raft::handle_t const& handle,
           weight_t cutoff         = std::numeric_limits<weight_t>::max(),
           bool do_expensive_check = false);
 
+/**
+ * @brief Run Bellman-Ford algorithm to compute the minimum distances (and predecessors) from
+ * the source vertex.
+ *
+ * This function computes the distances (minimum edge weight sums) from the source vertex. If @p
+ * predecessors is not `nullptr`, this function calculates the predecessor of each vertex in the
+ * shortest-path as well. Bellman-Ford algorithm works for negative edge weights as well. If the
+ * input graph has negative cycle(s), the algorithm return false.
+ *
+ * @throws cugraph::logic_error on erroneous input arguments.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weights. Needs to be a floating point type.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in] graph_view Graph view object.
+ * @param[in] edge_weight_view View object holding edge weights for @p graph_view.
+ * @param[in] source_vertex Source vertex to start single-source shortest-path.
+ * In a multi-gpu context the source vertex should be local to this GPU.
+ * @param[out] distances Pointer to the output distance array.
+ * @param[out] predecessors Pointer to the output predecessor array or `nullptr`.
+ * @return True if there is no negative cycle in input graph pointed by @p graph_view, and in
+ * that case @p distances and @p predecessors contain valid results.
+ * False otherwise, and in that case @p distances and @p predecessors contain invalid values.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+bool bellman_ford(raft::handle_t const& handle,
+                  graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
+                  edge_property_view_t<edge_t, weight_t const*> edge_weight_view,
+                  vertex_t source_vertex,
+                  vertex_t* predecessors,
+                  weight_t* distances);
+
 /*
  * @brief Compute the shortest distances from the given origins to all the given destinations.
  *
diff --git a/cpp/src/traversal/bellman_ford_impl.cuh b/cpp/src/traversal/bellman_ford_impl.cuh
index 7edb29b27e9..ac86e115d09 100644
--- a/cpp/src/traversal/bellman_ford_impl.cuh
+++ b/cpp/src/traversal/bellman_ford_impl.cuh
@@ -37,9 +37,9 @@ namespace cugraph {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-void bellman_ford(raft::handle_t const& handle,
+bool bellman_ford(raft::handle_t const& handle,
                   cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-                  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                  edge_property_view_t<edge_t, weight_t const*> edge_weight_view,
                   vertex_t source,
                   vertex_t* predecessors,
                   weight_t* distances)
@@ -67,8 +67,6 @@ void bellman_ford(raft::handle_t const& handle,
 
   current_graph_view.attach_edge_mask(edge_masks_even.view());
 
-  bool debug_flag = current_graph_view.number_of_vertices() <= 7;
-
   // 2. initialize distances and predecessors
 
   auto constexpr invalid_distance = std::numeric_limits<weight_t>::max();
@@ -86,31 +84,11 @@ void bellman_ford(raft::handle_t const& handle,
       return thrust::make_tuple(distance, invalid_vertex);
     });
 
-  // auto src_predecessor_cache =
-  //   graph_view_t::is_multi_gpu
-  //     ? edge_src_property_t<graph_view_t, vertex_t>(handle, current_graph_view)
-  //     : edge_src_property_t<graph_view_t, vertex_t>(handle);
-
   auto src_distance_cache =
     graph_view_t::is_multi_gpu
       ? edge_src_property_t<graph_view_t, weight_t>(handle, current_graph_view)
       : edge_src_property_t<graph_view_t, weight_t>(handle);
 
-  // auto dst_predecessor_cache =
-  //   graph_view_t::is_multi_gpu
-  //     ? edge_dst_property_t<graph_view_t, vertex_t>(handle, current_graph_view)
-  //     : edge_dst_property_t<graph_view_t, vertex_t>(handle);
-
-  // auto dst_distance_cache =
-  //   graph_view_t::is_multi_gpu
-  //     ? edge_dst_property_t<graph_view_t, weight_t>(handle, current_graph_view)
-  //     : edge_dst_property_t<graph_view_t, weight_t>(handle);
-
-  // auto dst_key_cache = graph_view_t::is_multi_gpu
-  //                        ? edge_dst_property_t<graph_view_t, vertex_t>(handle,
-  //                        current_graph_view) : edge_dst_property_t<graph_view_t,
-  //                        vertex_t>(handle);
-
   rmm::device_uvector<vertex_t> local_vertices(
     current_graph_view.local_vertex_partition_range_size(), handle.get_stream());
 
@@ -136,71 +114,19 @@ void bellman_ford(raft::handle_t const& handle,
   thrust::fill(
     handle.get_thrust_policy(), enqueue_counter.begin(), enqueue_counter.end(), vertex_t{0});
 
-  vertex_t itr_cnt           = 0;
-  int nr_of_updated_vertices = 0;
-
-  vertex_t nr_vertices_n_plus_times = 0;
+  vertex_t nr_times_in_queue = 0;
   while (true) {
     if constexpr (graph_view_t::is_multi_gpu) {
-      // update_edge_src_property(handle,
-      //                          current_graph_view,
-      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
-      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
-      //                          predecessors,
-      //                          src_predecessor_cache);
-      update_edge_src_property(handle,
-                               current_graph_view,
-                               vertex_frontier.bucket(bucket_idx_curr).begin(),
-                               vertex_frontier.bucket(bucket_idx_curr).end(),
-                               distances,
-                               src_distance_cache);
-
-      // update_edge_dst_property(handle,
-      //                          current_graph_view,
-      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
-      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
-      //                          predecessors,
-      //                          dst_predecessor_cache);
-      // update_edge_dst_property(handle,
-      //                          current_graph_view,
-      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
-      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
-      //                          distances,
-      //                          dst_distance_cache);
-
-      // update_edge_dst_property(handle,
-      //                          current_graph_view,
-      //                          vertex_frontier.bucket(bucket_idx_curr).begin(),
-      //                          vertex_frontier.bucket(bucket_idx_curr).end(),
-      //                          local_vertices.begin(),
-      //                          dst_key_cache);
+      cugraph::update_edge_src_property(handle,
+                                        current_graph_view,
+                                        vertex_frontier.bucket(bucket_idx_curr).begin(),
+                                        vertex_frontier.bucket(bucket_idx_curr).end(),
+                                        distances,
+                                        src_distance_cache);
     }
 
-    /*
-    auto src_input_property_values =
-      graph_view_t::is_multi_gpu
-        // ? view_concat(src_predecessor_cache.view(), src_distance_cache.view())
-        ? src_distance_cache.view()
-        // : view_concat(detail::edge_major_property_view_t<vertex_t, vertex_t
-        // const*>(predecessors),
-        //               detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances));
-        : detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances);
-
-
-    auto dst_input_property_values = graph_view_t::is_multi_gpu
-      // ? view_concat(dst_predecessor_cache.view(), dst_distance_cache.view())
-      ? dst_distance_cache.view()
-        // : view_concat(
-        //     detail::edge_minor_property_view_t<vertex_t, vertex_t const*>(predecessors,
-        //                                                                   vertex_t{0}),
-        //     detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances,
-        //     weight_t{0}));
-
-        //     : view_concat(
-        detail::edge_minor_property_view_t<vertex_t, weight_t const*>(distances, weight_t{0});
-      */
     auto [new_frontier_vertex_buffer, distance_predecessor_buffer] =
-      transform_reduce_v_frontier_outgoing_e_by_dst(
+      cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(
         handle,
         current_graph_view,
         vertex_frontier.bucket(bucket_idx_curr),
@@ -208,44 +134,19 @@ void bellman_ford(raft::handle_t const& handle,
           ? src_distance_cache.view()
           : detail::edge_major_property_view_t<vertex_t, weight_t const*>(distances),
         edge_dst_dummy_property_t{}.view(),
-        *edge_weight_view,
-        [debug_flag,
-         distances,
+        edge_weight_view,
+        [distances,
          v_first = current_graph_view.local_vertex_partition_range_first(),
-         v_last  = current_graph_view
-                    .local_vertex_partition_range_last()] __device__(auto src,
-                                                                     auto dst,
-                                                                     auto src_dist,
-                                                                     thrust::nullopt_t,
-                                                                     // auto dst_dist,
-                                                                     // thrust::tuple<vertex_t,
-                                                                     // weight_t> src_pred_dist,
-                                                                     // thrust::tuple<vertex_t,
-                                                                     // weight_t> dst_pred_dist,
-                                                                     auto wt) {
-          if (dst < v_first || dst >= v_last) {
-            printf("\n ****** dst = %d is not in this VP \n", static_cast<int>(dst));
-          }
-          // auto src_pred = thrust::get<0>(src_pred_dist);
-          // auto src_dist = thrust::get<1>(src_pred_dist);
-
-          // auto dst_pred = thrust::get<0>(dst_pred_dist);
-          // auto dst_dist = thrust::get<1>(dst_pred_dist);
+         v_last =
+           current_graph_view.local_vertex_partition_range_last()] __device__(auto src,
+                                                                              auto dst,
+                                                                              auto src_dist,
+                                                                              thrust::nullopt_t,
+                                                                              auto wt) {
+          assert(dst < v_first || dst >= v_last);
 
           auto dst_dist = distances[dst - v_first];
-
-          /*src_pred = %d dst_pred = %d,*/
-          if (debug_flag)
-            printf("src = %d dst = %d wt = %f  src_dist = %f dst_dist = %f\n",
-                   static_cast<int>(src),
-                   static_cast<int>(dst),
-                   static_cast<float>(wt),
-                   // static_cast<int>(src_pred),
-                   // static_cast<int>(dst_pred),
-                   static_cast<float>(src_dist),
-                   static_cast<float>(dst_dist));
-
-          auto relax = (dst_dist > (src_dist + wt));
+          auto relax    = (dst_dist > (src_dist + wt));
 
           return relax ? thrust::optional<thrust::tuple<weight_t, vertex_t>>{thrust::make_tuple(
                            src_dist + wt, src)}
@@ -253,61 +154,14 @@ void bellman_ford(raft::handle_t const& handle,
         },
         reduce_op::minimum<thrust::tuple<weight_t, vertex_t>>(),
         true);
-
-    if (graph_view_t::is_multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto new_frontier_vertex_buffer_title = std::string("nvb_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(new_frontier_vertex_buffer_title.c_str(),
-                                  new_frontier_vertex_buffer.begin(),
-                                  new_frontier_vertex_buffer.size(),
-                                  std::cout);
-
-      auto key_buffer = thrust::get<0>(
-        cugraph::get_dataframe_buffer_cbegin(distance_predecessor_buffer).get_iterator_tuple());
-      auto value_buffer = thrust::get<1>(
-        cugraph::get_dataframe_buffer_cbegin(distance_predecessor_buffer).get_iterator_tuple());
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto key_buffer_title = std::string("key_buffer_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(
-          key_buffer_title.c_str(), key_buffer, new_frontier_vertex_buffer.size(), std::cout);
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto value_buffer_title = std::string("value_buffer_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(
-          value_buffer_title.c_str(), value_buffer, new_frontier_vertex_buffer.size(), std::cout);
-    }
-
-    nr_of_updated_vertices = new_frontier_vertex_buffer.size();
+    size_t nr_of_updated_vertices = new_frontier_vertex_buffer.size();
 
     if (graph_view_t::is_multi_gpu) {
       nr_of_updated_vertices = host_scalar_allreduce(
         handle.get_comms(), nr_of_updated_vertices, raft::comms::op_t::SUM, handle.get_stream());
     }
 
-    if (nr_of_updated_vertices == 0) {
-      std::cout << "no update break\n";
-      break;
-    }
-
-    if (graph_view_t::is_multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto enqueue_counter_title = std::string("qc_b_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(enqueue_counter_title.c_str(),
-                                  enqueue_counter.begin(),
-                                  enqueue_counter.size(),
-                                  std::cout);
-    }
+    if (nr_of_updated_vertices == 0) { break; }
 
     thrust::for_each(handle.get_thrust_policy(),
                      new_frontier_vertex_buffer.begin(),
@@ -315,41 +169,23 @@ void bellman_ford(raft::handle_t const& handle,
                      [v_first         = current_graph_view.local_vertex_partition_range_first(),
                       v_last          = current_graph_view.local_vertex_partition_range_last(),
                       enqueue_counter = enqueue_counter.begin()] __device__(vertex_t v) {
-                       if (v < v_first || v >= v_last) {
-                         printf("\n enque conter: *** v = %d is not in this VP \n",
-                                static_cast<int>(v));
-                       }
+                       assert(v < v_first || v >= v_last);
                        enqueue_counter[v - v_first] += 1;
                      });
 
-    if (graph_view_t::is_multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto enqueue_counter_title = std::string("qc_a_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(enqueue_counter_title.c_str(),
-                                  enqueue_counter.begin(),
-                                  enqueue_counter.size(),
-                                  std::cout);
-    }
-
-    nr_vertices_n_plus_times = thrust::count_if(
-      handle.get_thrust_policy(),
-      enqueue_counter.begin(),
-      enqueue_counter.end(),
-      [n = current_graph_view.number_of_vertices()] __device__(auto flag) { return flag >= n; });
+    nr_times_in_queue =
+      thrust::count_if(handle.get_thrust_policy(),
+                       enqueue_counter.begin(),
+                       enqueue_counter.end(),
+                       [nr_vertices = current_graph_view.number_of_vertices()] __device__(
+                         auto freq_v) { return freq_v >= nr_vertices; });
 
     if (graph_view_t::is_multi_gpu) {
-      nr_vertices_n_plus_times = host_scalar_allreduce(
-        handle.get_comms(), nr_vertices_n_plus_times, raft::comms::op_t::SUM, handle.get_stream());
+      nr_times_in_queue = host_scalar_allreduce(
+        handle.get_comms(), nr_times_in_queue, raft::comms::op_t::SUM, handle.get_stream());
     }
 
-    if (nr_vertices_n_plus_times > 0) {
-      std::cout << "enque n+ break\n";
-      break;
-    }
+    if (nr_times_in_queue > 0) { break; }
 
     update_v_frontier(handle,
                       current_graph_view,
@@ -374,60 +210,25 @@ void bellman_ford(raft::handle_t const& handle,
     if (vertex_frontier.bucket(bucket_idx_next).aggregate_size() > 0) {
       vertex_frontier.swap_buckets(bucket_idx_curr, bucket_idx_next);
     } else {
-      std::cout << "swap break\n";
       break;
     }
-
-    if (graph_view_t::is_multi_gpu) {
-      auto const comm_rank = handle.get_comms().get_rank();
-      auto const comm_size = handle.get_comms().get_size();
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto local_vertices_title = std::string("local_vertices_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(
-          local_vertices_title.c_str(), local_vertices.begin(), local_vertices.size(), std::cout);
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto distances_title = std::string("distances_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(distances_title.c_str(),
-                                  distances,
-                                  current_graph_view.local_vertex_partition_range_size(),
-                                  std::cout);
-
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto predecessors_title = std::string("predecessors_").append(std::to_string(comm_rank));
-      if (debug_flag)
-        raft::print_device_vector(predecessors_title.c_str(),
-                                  predecessors,
-                                  current_graph_view.local_vertex_partition_range_size(),
-                                  std::cout);
-    }
-
-    itr_cnt++;
-    std::cout << "itr_cnt: " << itr_cnt << std::endl;
   }
 
-  std::cout << "itr_cnt (out of loop) : " << itr_cnt << std::endl;
-
-  if (nr_vertices_n_plus_times > 0) { std::cout << "Found -ve cycle " << std::endl; }
-  // if ((itr_cnt == current_graph_view.number_of_vertices()) && (nr_of_updated_vertices > 0)) {
-  //   std::cout << "Detected -ve cycle.\n";
-  // }
+  if (nr_times_in_queue > 0) { return false; }
+  return true;
 }
 }  // namespace detail
 
 template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
-void bellman_ford(raft::handle_t const& handle,
+bool bellman_ford(raft::handle_t const& handle,
                   graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view,
-                  std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view,
+                  edge_property_view_t<edge_t, weight_t const*> edge_weight_view,
                   vertex_t source,
                   vertex_t* predecessors,
                   weight_t* distances)
 {
-  detail::bellman_ford(handle, graph_view, edge_weight_view, source, predecessors, distances);
-  std::cout << " returning from cugraph::bellman\n";
+  return detail::bellman_ford(
+    handle, graph_view, edge_weight_view, source, predecessors, distances);
 }
 
 }  // namespace cugraph
diff --git a/cpp/src/traversal/bellman_ford_mg.cu b/cpp/src/traversal/bellman_ford_mg.cu
index 328591728f4..5017e5d59ed 100644
--- a/cpp/src/traversal/bellman_ford_mg.cu
+++ b/cpp/src/traversal/bellman_ford_mg.cu
@@ -17,52 +17,46 @@
 
 namespace cugraph {
 
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  double* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  int64_t source,
-  int64_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  double* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, true> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  int64_t source,
-  int64_t* predecessors,
-  double* distances);
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+                           edge_property_view_t<int32_t, float const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int32_t, false, true> const& graph_view,
+                           edge_property_view_t<int32_t, double const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           double* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+                           edge_property_view_t<int64_t, float const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+                           edge_property_view_t<int64_t, float const*> edge_weight_view,
+                           int64_t source,
+                           int64_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int64_t, false, true> const& graph_view,
+                           edge_property_view_t<int64_t, double const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           double* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int64_t, int64_t, false, true> const& graph_view,
+                           edge_property_view_t<int64_t, double const*> edge_weight_view,
+                           int64_t source,
+                           int64_t* predecessors,
+                           double* distances);
 
 }  // namespace cugraph
diff --git a/cpp/src/traversal/bellman_ford_sg.cu b/cpp/src/traversal/bellman_ford_sg.cu
index 6a438913fd8..b57c58746c3 100644
--- a/cpp/src/traversal/bellman_ford_sg.cu
+++ b/cpp/src/traversal/bellman_ford_sg.cu
@@ -17,52 +17,46 @@
 
 namespace cugraph {
 
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, float const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int32_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int32_t, double const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  double* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, float const*>> edge_weight_view,
-  int64_t source,
-  int64_t* predecessors,
-  float* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int32_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  int32_t source,
-  int32_t* predecessors,
-  double* distances);
-
-template void bellman_ford(
-  raft::handle_t const& handle,
-  graph_view_t<int64_t, int64_t, false, false> const& graph_view,
-  std::optional<edge_property_view_t<int64_t, double const*>> edge_weight_view,
-  int64_t source,
-  int64_t* predecessors,
-  double* distances);
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+                           edge_property_view_t<int32_t, float const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int32_t, false, false> const& graph_view,
+                           edge_property_view_t<int32_t, double const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           double* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+                           edge_property_view_t<int64_t, float const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+                           edge_property_view_t<int64_t, float const*> edge_weight_view,
+                           int64_t source,
+                           int64_t* predecessors,
+                           float* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int32_t, int64_t, false, false> const& graph_view,
+                           edge_property_view_t<int64_t, double const*> edge_weight_view,
+                           int32_t source,
+                           int32_t* predecessors,
+                           double* distances);
+
+template bool bellman_ford(raft::handle_t const& handle,
+                           graph_view_t<int64_t, int64_t, false, false> const& graph_view,
+                           edge_property_view_t<int64_t, double const*> edge_weight_view,
+                           int64_t source,
+                           int64_t* predecessors,
+                           double* distances);
 
 }  // namespace cugraph