diff --git a/cpp/src/community/detail/refine.hpp b/cpp/src/community/detail/refine.hpp
index a60efee887f..429e0e9e6c2 100644
--- a/cpp/src/community/detail/refine.hpp
+++ b/cpp/src/community/detail/refine.hpp
@@ -46,8 +46,7 @@ refine_clustering(
   rmm::device_uvector<typename graph_view_t::vertex_type>&& next_clusters_v,
   edge_src_property_t<graph_view_t, weight_t> const& src_vertex_weights_cache,
   edge_src_property_t<graph_view_t, typename graph_view_t::vertex_type> const& src_clusters_cache,
-  edge_dst_property_t<graph_view_t, typename graph_view_t::vertex_type> const& dst_clusters_cache,
-  bool up_down);
+  edge_dst_property_t<graph_view_t, typename graph_view_t::vertex_type> const& dst_clusters_cache);
 
 }
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index 272e3d71f83..62b66ed5f41 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -150,8 +150,7 @@ refine_clustering(
   edge_src_property_t<GraphViewType, typename GraphViewType::vertex_type> const&
     src_louvain_assignment_cache,
   edge_dst_property_t<GraphViewType, typename GraphViewType::vertex_type> const&
-    dst_louvain_assignment_cache,
-  bool up_down)
+    dst_louvain_assignment_cache)
 {
   const weight_t POSITIVE_GAIN = 1e-6;
   using vertex_t               = typename GraphViewType::vertex_type;
@@ -230,6 +229,7 @@ refine_clustering(
     cugraph::reduce_op::plus<weight_t>{},
     weighted_cut_of_vertices_to_louvain.begin());
 
+  // FIXME: Consider using bit mask logic here.  Would reduce memory by 8x
   rmm::device_uvector<uint8_t> singleton_and_connected_flags(
     graph_view.local_vertex_partition_range_size(), handle.get_stream());
 
@@ -297,6 +297,11 @@ refine_clustering(
   edge_dst_property_t<GraphViewType, vertex_t> dst_leiden_assignment_cache(handle);
   edge_src_property_t<GraphViewType, uint8_t> src_singleton_and_connected_flag_cache(handle);
 
+  // FIXME:  Why is kvstore used here?  Can't this be accomplished by
+  //  a direct lookup in louvain_assignment_of_vertices using
+  //     leiden - graph_view.local_vertex_partition_range_first() as the
+  //     index?
+  // Changing this would save memory and time
   kv_store_t<vertex_t, vertex_t, false> leiden_to_louvain_map(
     leiden_assignment.begin(),
     leiden_assignment.end(),
diff --git a/cpp/src/community/detail/refine_mg_v32_e32.cu b/cpp/src/community/detail/refine_mg_v32_e32.cu
index d27260c1337..ce46a48ed5c 100644
--- a/cpp/src/community/detail/refine_mg_v32_e32.cu
+++ b/cpp/src/community/detail/refine_mg_v32_e32.cu
@@ -37,8 +37,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, int32_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, int32_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
@@ -59,8 +58,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, int32_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int32_t, int32_t, false, true>, int32_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_mg_v64_e64.cu b/cpp/src/community/detail/refine_mg_v64_e64.cu
index 1a2ed665b8a..d870f30cd3c 100644
--- a/cpp/src/community/detail/refine_mg_v64_e64.cu
+++ b/cpp/src/community/detail/refine_mg_v64_e64.cu
@@ -37,8 +37,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, int64_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, int64_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
@@ -59,8 +58,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, int64_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int64_t, int64_t, false, true>, int64_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_sg_v32_e32.cu b/cpp/src/community/detail/refine_sg_v32_e32.cu
index ac0ede8225d..803a37474d4 100644
--- a/cpp/src/community/detail/refine_sg_v32_e32.cu
+++ b/cpp/src/community/detail/refine_sg_v32_e32.cu
@@ -37,8 +37,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, int32_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, int32_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 template std::tuple<rmm::device_uvector<int32_t>,
                     std::pair<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>>
@@ -59,8 +58,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, int32_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int32_t, int32_t, false, false>, int32_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/detail/refine_sg_v64_e64.cu b/cpp/src/community/detail/refine_sg_v64_e64.cu
index 97ed43b3de0..7b8bc435bc3 100644
--- a/cpp/src/community/detail/refine_sg_v64_e64.cu
+++ b/cpp/src/community/detail/refine_sg_v64_e64.cu
@@ -37,8 +37,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, int64_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, int64_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 template std::tuple<rmm::device_uvector<int64_t>,
                     std::pair<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>>
@@ -59,8 +58,7 @@ refine_clustering(
   edge_src_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, int64_t> const&
     src_clusters_cache,
   edge_dst_property_t<cugraph::graph_view_t<int64_t, int64_t, false, false>, int64_t> const&
-    dst_clusters_cache,
-  bool up_down);
+    dst_clusters_cache);
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh
index da790a5dd66..c3600ff12e0 100644
--- a/cpp/src/community/leiden_impl.cuh
+++ b/cpp/src/community/leiden_impl.cuh
@@ -102,7 +102,8 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   HighResTimer hr_timer{};
 #endif
 
-  weight_t best_modularity = weight_t{-1.0};
+  weight_t final_Q{-1};
+
   weight_t total_edge_weight =
     compute_total_edge_weight(handle, current_graph_view, *current_edge_weight_view);
 
@@ -368,9 +369,6 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     detail::timer_stop<graph_view_t::is_multi_gpu>(handle, hr_timer);
 #endif
 
-    bool terminate = (cur_Q <= best_modularity);
-    if (!terminate) { best_modularity = cur_Q; }
-
 #ifdef TIMING
     detail::timer_start<graph_view_t::is_multi_gpu>(handle, hr_timer, "contract graph");
 #endif
@@ -386,8 +384,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     auto nr_unique_louvain_clusters =
       remove_duplicates<vertex_t, multi_gpu>(handle, copied_louvain_partition);
 
-    terminate =
-      terminate || (nr_unique_louvain_clusters == current_graph_view.number_of_vertices());
+    bool terminate = (nr_unique_louvain_clusters == current_graph_view.number_of_vertices());
 
     rmm::device_uvector<vertex_t> refined_leiden_partition(0, handle.get_stream());
     std::pair<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>> leiden_to_louvain_map{
@@ -426,11 +423,19 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
                                   std::move(louvain_assignment_for_vertices),
                                   src_vertex_weights_cache,
                                   src_louvain_assignment_cache,
-                                  dst_louvain_assignment_cache,
-                                  up_down);
+                                  dst_louvain_assignment_cache);
     }
 
     // Clear buffer and contract the graph
+    final_Q = detail::compute_modularity(handle,
+                                         current_graph_view,
+                                         current_edge_weight_view,
+                                         src_louvain_assignment_cache,
+                                         dst_louvain_assignment_cache,
+                                         louvain_assignment_for_vertices,
+                                         cluster_weights,
+                                         total_edge_weight,
+                                         resolution);
 
     cluster_keys.resize(0, handle.get_stream());
     cluster_weights.resize(0, handle.get_stream());
@@ -445,6 +450,9 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
     dst_louvain_assignment_cache.clear(handle);
 
     if (!terminate) {
+      src_louvain_assignment_cache.clear(handle);
+      dst_louvain_assignment_cache.clear(handle);
+
       auto nr_unique_leiden = static_cast<vertex_t>(leiden_to_louvain_map.first.size());
       if (graph_view_t::is_multi_gpu) {
         nr_unique_leiden = host_scalar_allreduce(
@@ -586,7 +594,7 @@ std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> leiden(
   detail::timer_display<graph_view_t::is_multi_gpu>(handle, hr_timer, std::cout);
 #endif
 
-  return std::make_pair(std::move(dendrogram), best_modularity);
+  return std::make_pair(std::move(dendrogram), final_Q);
 }
 
 template <typename vertex_t, bool multi_gpu>