Skip to content

Commit

Permalink
Address Leiden numbering issue (#4845)
Browse files Browse the repository at this point in the history
Our current implementation of Leiden can return non-contiguous cluster IDs; however, there is an unused utility function, [relabel_cluster_ids](https://github.com/rapidsai/cugraph/blob/branch-25.02/cpp/src/community/leiden_impl.cuh#L601:L604), that serves the purpose of relabeling.

This PR:
- Addresses the Leiden numbering issue from [#4791](#4791) by calling `relabel_cluster_ids` after flattening the dendrogram.
- Fixes a bug in the MG Python API of Leiden, which must pass a different seed to the C++ API for each GPU.
- Adds SG and MG C++ tests.
- Adds Python SG and MG tests capturing the numbering issue.


closes #4791

Authors:
  - Joseph Nke (https://github.com/jnke2016)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: #4845
  • Loading branch information
jnke2016 authored Jan 11, 2025
1 parent a5679f0 commit ed954dc
Show file tree
Hide file tree
Showing 6 changed files with 125 additions and 27 deletions.
45 changes: 25 additions & 20 deletions cpp/src/community/leiden_impl.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
* Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -604,26 +604,20 @@ void relabel_cluster_ids(raft::handle_t const& handle,
size_t num_nodes)
{
vertex_t local_cluster_id_first{0};

// Get unique cluster id and shuffle
remove_duplicates<vertex_t, multi_gpu>(handle, unique_cluster_ids);

if constexpr (multi_gpu) {
auto unique_cluster_range_lasts = cugraph::partition_manager::compute_partition_range_lasts(
handle, static_cast<vertex_t>(unique_cluster_ids.size()));

auto& comm = handle.get_comms();
auto const comm_size = comm.get_size();
auto const comm_rank = comm.get_rank();
auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
auto const major_comm_size = major_comm.get_size();
auto const major_comm_rank = major_comm.get_rank();
auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
auto const minor_comm_size = minor_comm.get_size();
auto const minor_comm_rank = minor_comm.get_rank();

auto vertex_partition_id =
partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks(
major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank);

local_cluster_id_first =
vertex_partition_id == 0 ? vertex_t{0} : unique_cluster_range_lasts[vertex_partition_id - 1];
auto cluster_ids_size_per_rank = cugraph::host_scalar_allgather(
handle.get_comms(), unique_cluster_ids.size(), handle.get_stream());

std::vector<vertex_t> cluster_ids_starts(cluster_ids_size_per_rank.size());
std::exclusive_scan(cluster_ids_size_per_rank.begin(),
cluster_ids_size_per_rank.end(),
cluster_ids_starts.begin(),
size_t{0});
local_cluster_id_first = cluster_ids_starts[handle.get_comms().get_rank()];
}

rmm::device_uvector<vertex_t> numbering_indices(unique_cluster_ids.size(), handle.get_stream());
Expand Down Expand Up @@ -713,6 +707,17 @@ std::pair<size_t, weight_t> leiden(

detail::flatten_leiden_dendrogram(handle, graph_view, *dendrogram, clustering);

size_t local_num_verts = (*dendrogram).get_level_size_nocheck(0);
rmm::device_uvector<vertex_t> unique_cluster_ids(local_num_verts, handle.get_stream());

thrust::copy(handle.get_thrust_policy(),
clustering,
clustering + local_num_verts,
unique_cluster_ids.begin());

detail::relabel_cluster_ids<vertex_t, multi_gpu>(
handle, unique_cluster_ids, clustering, local_num_verts);

return std::make_pair(dendrogram->num_levels(), modularity);
}

Expand Down
19 changes: 18 additions & 1 deletion cpp/tests/community/leiden_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2023-2025, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
Expand All @@ -9,6 +9,7 @@
*
*/
#include "utilities/base_fixture.hpp"
#include "utilities/conversion_utilities.hpp"
#include "utilities/test_graphs.hpp"

#include <cugraph/algorithms.hpp>
Expand Down Expand Up @@ -128,6 +129,22 @@ class Tests_Leiden : public ::testing::TestWithParam<std::tuple<Leiden_Usecase,
ASSERT_FLOAT_EQ(compare_modularity, expected_modularity);
ASSERT_EQ(level, expected_level);
}

auto unique_clustering_v = cugraph::test::sort<vertex_t>(handle, clustering_v);

unique_clustering_v = cugraph::test::unique<vertex_t>(handle, std::move(unique_clustering_v));

auto expected_unique_clustering_v =
cugraph::test::sequence<int32_t>(handle, unique_clustering_v.size(), size_t{1}, int32_t{0});

auto h_unique_clustering_v = cugraph::test::to_host(handle, unique_clustering_v);
auto h_expected_unique_clustering_v =
cugraph::test::to_host(handle, expected_unique_clustering_v);

ASSERT_TRUE(std::equal(h_unique_clustering_v.begin(),
h_unique_clustering_v.end(),
h_expected_unique_clustering_v.begin()))
<< "Returned cluster IDs are not numbered consecutively";
}
};

Expand Down
38 changes: 37 additions & 1 deletion cpp/tests/community/mg_leiden_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -175,6 +175,7 @@ class Tests_MGLeiden
if (leiden_usecase.check_correctness_) {
SCOPED_TRACE("compare modularity input");

// FIXME: The dendrogram is unused
compare_sg_results<vertex_t, edge_t, weight_t>(*handle_,
rng_state,
mg_graph_view,
Expand All @@ -184,6 +185,41 @@ class Tests_MGLeiden
leiden_usecase.theta_,
mg_modularity);
}

// Check numbering
vertex_t num_vertices = mg_graph_view.local_vertex_partition_range_size();
rmm::device_uvector<vertex_t> clustering_v(num_vertices, handle_->get_stream());
cugraph::leiden<vertex_t, edge_t, weight_t, true>(*handle_,
rng_state,
mg_graph_view,
mg_edge_weight_view,
clustering_v.data(),
leiden_usecase.max_level_,
leiden_usecase.resolution_);

auto unique_clustering_v = cugraph::test::sort<vertex_t>(*handle_, clustering_v);

unique_clustering_v = cugraph::test::unique<vertex_t>(*handle_, std::move(unique_clustering_v));

unique_clustering_v = cugraph::test::device_allgatherv(
*handle_, unique_clustering_v.data(), unique_clustering_v.size());

unique_clustering_v = cugraph::test::sort<vertex_t>(*handle_, unique_clustering_v);

unique_clustering_v = cugraph::test::unique<vertex_t>(*handle_, std::move(unique_clustering_v));

auto h_unique_clustering_v = cugraph::test::to_host(*handle_, unique_clustering_v);

auto expected_unique_clustering_v = cugraph::test::sequence<int32_t>(
*handle_, unique_clustering_v.size(), size_t{1}, h_unique_clustering_v[0]);

auto h_expected_unique_clustering_v =
cugraph::test::to_host(*handle_, expected_unique_clustering_v);

ASSERT_TRUE(std::equal(h_unique_clustering_v.begin(),
h_unique_clustering_v.end(),
h_expected_unique_clustering_v.begin()))
<< "Returned cluster IDs are not numbered consecutively";
}

private:
Expand Down
6 changes: 3 additions & 3 deletions python/cugraph/cugraph/dask/community/leiden.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -156,13 +156,13 @@ def leiden(
input_graph._plc_graph[w],
max_iter,
resolution,
random_state,
(random_state + i) if random_state is not None else random_state,
theta,
do_expensive_check,
workers=[w],
allow_other_workers=False,
)
for w in Comms.get_workers()
for i, w in enumerate(Comms.get_workers())
]

wait(result)
Expand Down
27 changes: 26 additions & 1 deletion python/cugraph/cugraph/tests/community/test_leiden.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -19,6 +19,7 @@

import cugraph
import cudf
from cudf.testing.testing import assert_series_equal
from cugraph.testing import utils, UNDIRECTED_DATASETS
from cugraph.datasets import karate_asymmetric

Expand Down Expand Up @@ -185,6 +186,18 @@ def test_leiden(graph_file):
leiden_parts, leiden_mod = cugraph_leiden(G)
louvain_parts, louvain_mod = cugraph_louvain(G)

unique_parts = (
leiden_parts["partition"]
.drop_duplicates()
.sort_values(ascending=True)
.reset_index(drop=True)
)

idx_col = cudf.Series(unique_parts.index)

# Ensure Leiden cluster IDs are numbered consecutively
assert_series_equal(unique_parts, idx_col, check_dtype=False, check_names=False)

# Leiden modularity score is smaller than Louvain's
assert leiden_mod >= (0.75 * louvain_mod)

Expand All @@ -202,6 +215,18 @@ def test_leiden_nx(graph_file):
leiden_parts, leiden_mod = cugraph_leiden(G)
louvain_parts, louvain_mod = cugraph_louvain(G)

unique_parts = (
cudf.Series(leiden_parts.values())
.drop_duplicates()
.sort_values(ascending=True)
.reset_index(drop=True)
)

idx_col = cudf.Series(unique_parts.index)

# Ensure Leiden cluster IDs are numbered consecutively
assert_series_equal(unique_parts, idx_col, check_dtype=False, check_names=False)

# Calculating modularity scores for comparison
# Leiden modularity score is smaller than Louvain's
assert leiden_mod >= (0.75 * louvain_mod)
Expand Down
17 changes: 16 additions & 1 deletion python/cugraph/cugraph/tests/community/test_leiden_mg.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -16,6 +16,8 @@
import cugraph
import cugraph.dask as dcg
from cugraph.datasets import karate_asymmetric, karate, dolphins
import cudf
from cudf.testing.testing import assert_series_equal


# =============================================================================
Expand Down Expand Up @@ -64,6 +66,19 @@ def test_mg_leiden_with_edgevals_undirected_graph(dask_client, dataset):
dg = get_mg_graph(dataset, directed=False)
parts, mod = dcg.leiden(dg)

unique_parts = (
parts["partition"]
.compute()
.drop_duplicates()
.sort_values(ascending=True)
.reset_index(drop=True)
)

idx_col = cudf.Series(unique_parts.index)

# Ensure Leiden cluster IDs are numbered consecutively
assert_series_equal(unique_parts, idx_col, check_dtype=False, check_names=False)

# FIXME: either call Nx with the same dataset and compare results, or
# hardcode golden results to compare to.
print()
Expand Down

0 comments on commit ed954dc

Please sign in to comment.