diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json index 83078a304ed..e48301e4d14 100644 --- a/.devcontainer/cuda11.8-conda/devcontainer.json +++ b/.devcontainer/cuda11.8-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:23.12-cpp-cuda11.8-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-cuda11.8-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json index d59742575b5..a57ea0d163b 100644 --- a/.devcontainer/cuda11.8-pip/devcontainer.json +++ b/.devcontainer/cuda11.8-pip/devcontainer.json @@ -5,15 +5,15 @@ "args": { "CUDA": "11.8", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda11.8-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda11.8-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": { "version": "1.14.1" }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-conda/devcontainer.json b/.devcontainer/cuda12.0-conda/devcontainer.json index bc4a2cb6fb4..10ba2f8fd3d 100644 --- a/.devcontainer/cuda12.0-conda/devcontainer.json +++ b/.devcontainer/cuda12.0-conda/devcontainer.json @@ -5,12 +5,12 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "conda", - "BASE": "rapidsai/devcontainers:23.12-cpp-mambaforge-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-mambaforge-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.devcontainer/cuda12.0-pip/devcontainer.json b/.devcontainer/cuda12.0-pip/devcontainer.json index 3eacf726bf0..a112483a6db 100644 --- a/.devcontainer/cuda12.0-pip/devcontainer.json +++ b/.devcontainer/cuda12.0-pip/devcontainer.json @@ -5,15 +5,15 @@ "args": { "CUDA": "12.0", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:23.12-cpp-llvm16-cuda12.0-ubuntu22.04" + "BASE": "rapidsai/devcontainers:24.02-cpp-llvm16-cuda12.0-ubuntu22.04" } }, "hostRequirements": {"gpu": "optional"}, "features": { - "ghcr.io/rapidsai/devcontainers/features/ucx:23.12": { + "ghcr.io/rapidsai/devcontainers/features/ucx:24.2": { "version": "1.14.1" }, - "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:23.12": {} + "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.2": {} }, "overrideFeatureInstallOrder": [ "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils" diff --git a/.github/labeler.yml b/.github/labeler.yml index c589fda6099..368bf328b99 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -9,17 +9,6 @@ python: benchmarks: - 
'benchmarks/**' -doc: - - 'docs/**' - - '**/*.md' - - 'datasets/**' - - 'notebooks/**' - - '**/*.txt' - - '**/*.rst' - - '**/*.ipynb' - - '**/*.pdf' - - '**/*.png' - datasets: - 'datasets/**' diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0f490283795..243c5f23ec0 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -28,7 +28,7 @@ concurrency: jobs: cpp-build: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -38,7 +38,7 @@ jobs: python-build: needs: [cpp-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -47,7 +47,7 @@ jobs: upload-conda: needs: [cpp-build, python-build] secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -57,7 +57,7 @@ jobs: if: github.ref_type == 'branch' needs: python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: arch: "amd64" branch: ${{ inputs.branch }} @@ -69,7 +69,7 @@ jobs: sha: ${{ inputs.sha }} wheel-build-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -77,13 +77,13 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.12 + extra-repo-sha: branch-24.02 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-publish-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -93,7 +93,7 @@ jobs: wheel-build-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -101,12 +101,12 @@ jobs: date: ${{ inputs.date }} script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.12 + extra-repo-sha: branch-24.02 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-publish-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -116,7 +116,7 @@ jobs: 
wheel-build-nx-cugraph: needs: wheel-publish-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} @@ -126,10 +126,69 @@ jobs: wheel-publish-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 with: build_type: ${{ inputs.build_type || 'branch' }} branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} package-name: nx-cugraph + wheel-build-cugraph-dgl: + needs: wheel-publish-cugraph + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cugraph-dgl.sh + wheel-publish-cugraph-dgl: + needs: wheel-build-cugraph-dgl + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cugraph-dgl + wheel-build-cugraph-pyg: + needs: wheel-publish-cugraph + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cugraph-pyg.sh + wheel-publish-cugraph-pyg: + needs: wheel-build-cugraph-pyg + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cugraph-pyg + wheel-build-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + script: ci/build_wheel_cugraph-equivariant.sh + wheel-publish-cugraph-equivariant: + needs: wheel-build-cugraph-equivariant + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.02 + with: + build_type: ${{ inputs.build_type || 'branch' }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + package-name: cugraph-equivariant diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d20074381e..1bb2e0ab0a7 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -25,43 +25,49 @@ jobs: - wheel-tests-cugraph - wheel-build-nx-cugraph - wheel-tests-nx-cugraph + - wheel-build-cugraph-dgl + - wheel-tests-cugraph-dgl + - wheel-build-cugraph-pyg + - wheel-tests-cugraph-pyg + - wheel-build-cugraph-equivariant + - wheel-tests-cugraph-equivariant - devcontainer secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.02 checks: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-23.12 + uses: 
rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.02 with: enable_check_generated_files: false conda-cpp-build: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.02 with: build_type: pull-request node_type: cpu32 conda-cpp-tests: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 with: build_type: pull-request conda-python-build: needs: conda-cpp-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.02 with: build_type: pull-request conda-python-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: build_type: pull-request conda-notebook-tests: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -71,7 +77,7 @@ jobs: docs-build: needs: conda-python-build secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.02 with: build_type: pull-request node_type: "gpu-v100-latest-1" @@ -81,55 +87,99 @@ jobs: wheel-build-pylibcugraph: needs: checks secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: pull-request script: ci/build_wheel_pylibcugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.12 + extra-repo-sha: branch-24.02 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY node_type: cpu32 wheel-tests-pylibcugraph: needs: wheel-build-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: pull-request script: ci/test_wheel_pylibcugraph.sh wheel-build-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: build_type: pull-request script: ci/build_wheel_cugraph.sh extra-repo: rapidsai/cugraph-ops - extra-repo-sha: branch-23.12 + extra-repo-sha: branch-24.02 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY wheel-tests-cugraph: needs: wheel-build-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: pull-request script: ci/test_wheel_cugraph.sh wheel-build-nx-cugraph: needs: wheel-tests-pylibcugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 
with: build_type: pull-request script: ci/build_wheel_nx-cugraph.sh wheel-tests-nx-cugraph: needs: wheel-build-nx-cugraph secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: pull-request script: ci/test_wheel_nx-cugraph.sh + wheel-build-cugraph-dgl: + needs: wheel-tests-cugraph + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/build_wheel_cugraph-dgl.sh + wheel-tests-cugraph-dgl: + needs: wheel-build-cugraph-dgl + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/test_wheel_cugraph-dgl.sh + matrix_filter: map(select(.ARCH == "amd64")) + wheel-build-cugraph-pyg: + needs: wheel-tests-cugraph + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/build_wheel_cugraph-pyg.sh + wheel-tests-cugraph-pyg: + needs: wheel-build-cugraph-pyg + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/test_wheel_cugraph-pyg.sh + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) + wheel-build-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/build_wheel_cugraph-equivariant.sh + wheel-tests-cugraph-equivariant: + needs: wheel-build-cugraph-equivariant + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: pull-request + script: ci/test_wheel_cugraph-equivariant.sh + matrix_filter: map(select(.ARCH == "amd64")) devcontainer: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.02 with: node_type: cpu32 extra-repo-deploy-key: CUGRAPH_OPS_SSH_PRIVATE_DEPLOY_KEY diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a0ecb67712c..528bc1ca9a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,7 +16,7 @@ on: jobs: conda-cpp-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -24,7 +24,7 @@ jobs: sha: ${{ inputs.sha }} conda-python-tests: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -32,7 +32,7 @@ jobs: sha: ${{ inputs.sha }} wheel-tests-pylibcugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -41,7 +41,7 @@ jobs: script: ci/test_wheel_pylibcugraph.sh wheel-tests-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: 
rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} @@ -50,10 +50,40 @@ jobs: script: ci/test_wheel_cugraph.sh wheel-tests-nx-cugraph: secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-23.12 + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 with: build_type: nightly branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: ci/test_wheel_nx-cugraph.sh + wheel-tests-cugraph-dgl: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_cugraph-dgl.sh + matrix_filter: map(select(.ARCH == "amd64")) + wheel-tests-cugraph-pyg: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_cugraph-pyg.sh + matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0")) + wheel-tests-cugraph-equivariant: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.02 + with: + build_type: nightly + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + script: ci/test_wheel_cugraph-equivariant.sh + matrix_filter: map(select(.ARCH == "amd64")) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bab39557c99..188ea1a266a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -52,7 +52,7 @@ repos: pass_filenames: false additional_dependencies: [gitpython] - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.5.1 + rev: v1.8.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/CHANGELOG.md b/CHANGELOG.md index d165cd7efc4..fe08c8aeb03 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,84 @@ +# cuGraph 24.02.00 (12 Feb 2024) + +## 🚨 Breaking Changes + +- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr) +- Update to CCCL 2.2.0. 
([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice) + +## 🐛 Bug Fixes + +- Revert "Exclude tests from builds (#4147)" ([#4157](https://github.com/rapidsai/cugraph/pull/4157)) [@raydouglass](https://github.com/raydouglass) +- Exclude tests from builds ([#4147](https://github.com/rapidsai/cugraph/pull/4147)) [@vyasr](https://github.com/vyasr) +- Constraint pytorch-dependent wheel test to only run on amd64 ([#4133](https://github.com/rapidsai/cugraph/pull/4133)) [@tingyu66](https://github.com/tingyu66) +- Removes the `networkx_algorithm` decorator to all SCC functions to disable dispatching to them ([#4120](https://github.com/rapidsai/cugraph/pull/4120)) [@rlratzel](https://github.com/rlratzel) +- Correct `cugraph-pyg` package name used in wheels and fix test script ([#4083](https://github.com/rapidsai/cugraph/pull/4083)) [@tingyu66](https://github.com/tingyu66) +- Fix Jaccard hang ([#4080](https://github.com/rapidsai/cugraph/pull/4080)) [@jnke2016](https://github.com/jnke2016) +- Fix OOB error, BFS C API should validate that the source vertex is a valid vertex ([#4077](https://github.com/rapidsai/cugraph/pull/4077)) [@ChuckHastings](https://github.com/ChuckHastings) +- [BUG]Fix non-type template parameter to cugraph::relabel ([#4064](https://github.com/rapidsai/cugraph/pull/4064)) [@naimnv](https://github.com/naimnv) +- Fix MG weighted similarity test failure ([#4054](https://github.com/rapidsai/cugraph/pull/4054)) [@seunghwak](https://github.com/seunghwak) +- MG C-API test failure fixes ([#4047](https://github.com/rapidsai/cugraph/pull/4047)) [@seunghwak](https://github.com/seunghwak) +- Add a barrier before cugraph Graph creation ([#4046](https://github.com/rapidsai/cugraph/pull/4046)) [@VibhuJawa](https://github.com/VibhuJawa) +- Fix % 0 bug in MG_SELECT_RANDOM_VERTICES test ([#4034](https://github.com/rapidsai/cugraph/pull/4034)) [@seunghwak](https://github.com/seunghwak) +- Branch 24.02 merge branch 23.12 ([#4012](https://github.com/rapidsai/cugraph/pull/4012)) [@vyasr](https://github.com/vyasr) + +## 📖 Documentation + +- Updates nx-cugraph README.md with latest algos ([#4135](https://github.com/rapidsai/cugraph/pull/4135)) [@rlratzel](https://github.com/rlratzel) +- corrected links in C API and added groups for support functions ([#4131](https://github.com/rapidsai/cugraph/pull/4131)) [@acostadon](https://github.com/acostadon) +- Forward-merge branch-23.12 to branch-24.02 ([#4049](https://github.com/rapidsai/cugraph/pull/4049)) [@GPUtester](https://github.com/GPUtester) + +## 🚀 New Features + +- Implement has_edge() & compute_multiplicity() ([#4096](https://github.com/rapidsai/cugraph/pull/4096)) [@seunghwak](https://github.com/seunghwak) +- Update per_v_transform_reduce_incoming|outgoing_e to support edge masking ([#4085](https://github.com/rapidsai/cugraph/pull/4085)) [@seunghwak](https://github.com/seunghwak) +- Remove Experimental Wrappers from GNN Code ([#4070](https://github.com/rapidsai/cugraph/pull/4070)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) +- MNMG ECG ([#4030](https://github.com/rapidsai/cugraph/pull/4030)) [@naimnv](https://github.com/naimnv) +- Replace graph_view.hpp::number_of_edges with compute_number_of_edges ([#4026](https://github.com/rapidsai/cugraph/pull/4026)) [@seunghwak](https://github.com/seunghwak) +- Update count_if_e, transform_reduce_e, and transform_e to support edge masking ([#4001](https://github.com/rapidsai/cugraph/pull/4001)) [@seunghwak](https://github.com/seunghwak) +- Sampling Performance
Testing ([#3584](https://github.com/rapidsai/cugraph/pull/3584)) [@alexbarghi-nv](https://github.com/alexbarghi-nv) + +## 🛠️ Improvements + +- Adds option to rapids_cpm_find for raft to disable hnswlib feature, adds updates for pytest 8 compat, temporarily skips IO intensive test in CI ([#4121](https://github.com/rapidsai/cugraph/pull/4121)) [@rlratzel](https://github.com/rlratzel) +- Adds benchmarks for additional nx-cugraph 24.02 algos ([#4112](https://github.com/rapidsai/cugraph/pull/4112)) [@rlratzel](https://github.com/rlratzel) +- nx-cugraph: use coverage to ensure all algorithms were run ([#4108](https://github.com/rapidsai/cugraph/pull/4108)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: rename `plc=` to `_plc=` ([#4106](https://github.com/rapidsai/cugraph/pull/4106)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: add `complement` and `reverse` ([#4103](https://github.com/rapidsai/cugraph/pull/4103)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: add `core_number` (undirected graphs only) ([#4100](https://github.com/rapidsai/cugraph/pull/4100)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: add `is_tree`, etc. ([#4097](https://github.com/rapidsai/cugraph/pull/4097)) [@eriknw](https://github.com/eriknw) +- Optimize the drop-duplicate functionality ([#4095](https://github.com/rapidsai/cugraph/pull/4095)) [@jnke2016](https://github.com/jnke2016) +- nx-cugraph: add triangles and clustering algorithms ([#4093](https://github.com/rapidsai/cugraph/pull/4093)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: PLC now handles isolated nodes; clean up our workarounds ([#4092](https://github.com/rapidsai/cugraph/pull/4092)) [@eriknw](https://github.com/eriknw) +- Remove usages of rapids-env-update ([#4090](https://github.com/rapidsai/cugraph/pull/4090)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA) +- Provide explicit pool sizes and avoid RMM detail APIs ([#4086](https://github.com/rapidsai/cugraph/pull/4086)) [@harrism](https://github.com/harrism) +- refactor CUDA versions in dependencies.yaml ([#4084](https://github.com/rapidsai/cugraph/pull/4084)) [@jameslamb](https://github.com/jameslamb) +- build wheels for `cugraph-dgl` and `cugraph-pyg` ([#4075](https://github.com/rapidsai/cugraph/pull/4075)) [@tingyu66](https://github.com/tingyu66) +- Match weight-sharing option of GATConv in DGL ([#4074](https://github.com/rapidsai/cugraph/pull/4074)) [@tingyu66](https://github.com/tingyu66) +- nx-cugraph: add weakly connected components ([#4071](https://github.com/rapidsai/cugraph/pull/4071)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: indicate which plc algorithms are used and version_added ([#4069](https://github.com/rapidsai/cugraph/pull/4069)) [@eriknw](https://github.com/eriknw) +- Adds `nx-cugraph` benchmarks for 23.12 algos (SSSP, pagerank, hits, katz_centrality, degree_centrality, eigenvector_centrality) ([#4065](https://github.com/rapidsai/cugraph/pull/4065)) [@rlratzel](https://github.com/rlratzel) +- `nx-cugraph`: add `to_undirected` method; add reciprocity algorithms ([#4063](https://github.com/rapidsai/cugraph/pull/4063)) [@eriknw](https://github.com/eriknw) +- Switch to scikit-build-core ([#4053](https://github.com/rapidsai/cugraph/pull/4053)) [@vyasr](https://github.com/vyasr) +- Update to CCCL 2.2.0. 
([#4052](https://github.com/rapidsai/cugraph/pull/4052)) [@bdice](https://github.com/bdice) +- Prevent `actions/labeler` from adding `Label Checker` labels ([#4048](https://github.com/rapidsai/cugraph/pull/4048)) [@ajschmidt8](https://github.com/ajschmidt8) +- Update dependencies.yaml to new pip index ([#4045](https://github.com/rapidsai/cugraph/pull/4045)) [@vyasr](https://github.com/vyasr) +- Remove checks for Pascal, no longer supported ([#4044](https://github.com/rapidsai/cugraph/pull/4044)) [@ChuckHastings](https://github.com/ChuckHastings) +- Fix HITS convergence error. ([#4043](https://github.com/rapidsai/cugraph/pull/4043)) [@seunghwak](https://github.com/seunghwak) +- Test select_random_vertices for all possible values of flags ([#4042](https://github.com/rapidsai/cugraph/pull/4042)) [@naimnv](https://github.com/naimnv) +- Remove CUGRAPH_BUILD_WHEELS and standardize Python builds ([#4041](https://github.com/rapidsai/cugraph/pull/4041)) [@vyasr](https://github.com/vyasr) +- Create `cugraph-equivariant` package ([#4036](https://github.com/rapidsai/cugraph/pull/4036)) [@tingyu66](https://github.com/tingyu66) +- [FEA]: Add DASK edgelist and graph support to the Dataset API ([#4035](https://github.com/rapidsai/cugraph/pull/4035)) [@huiyuxie](https://github.com/huiyuxie) +- Add support for Louvain to MTMG ([#4033](https://github.com/rapidsai/cugraph/pull/4033)) [@ChuckHastings](https://github.com/ChuckHastings) +- Clean up self-loop and multi-edge removal logic ([#4032](https://github.com/rapidsai/cugraph/pull/4032)) [@ChuckHastings](https://github.com/ChuckHastings) +- Mtmg updates for rmm ([#4031](https://github.com/rapidsai/cugraph/pull/4031)) [@ChuckHastings](https://github.com/ChuckHastings) +- nx-cugraph: adds `ancestors`, `descendants`, and BFS algos ([#4029](https://github.com/rapidsai/cugraph/pull/4029)) [@eriknw](https://github.com/eriknw) +- nx-cugraph: update usage of `nodes_or_number` for nx compat ([#4028](https://github.com/rapidsai/cugraph/pull/4028)) [@eriknw](https://github.com/eriknw) +- Removes unsupported `setup.py` calls, cleans up text ([#4024](https://github.com/rapidsai/cugraph/pull/4024)) [@rlratzel](https://github.com/rlratzel) +- Resolves conflicts from forward-merging branch-23.12 into branch-24.02 ([#4020](https://github.com/rapidsai/cugraph/pull/4020)) [@rlratzel](https://github.com/rlratzel) +- Add `HeteroGATConv` to `cugraph-pyg` ([#3914](https://github.com/rapidsai/cugraph/pull/3914)) [@tingyu66](https://github.com/tingyu66) +- Update for CCCL 2.x ([#3862](https://github.com/rapidsai/cugraph/pull/3862)) [@seunghwak](https://github.com/seunghwak) + # cuGraph 23.12.00 (6 Dec 2023) ## 🚨 Breaking Changes diff --git a/VERSION b/VERSION index a193fff41e8..3c6c5e2b706 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -23.12.00 +24.02.00 diff --git a/benchmarks/cugraph/standalone/bulk_sampling/.gitignore b/benchmarks/cugraph/standalone/bulk_sampling/.gitignore new file mode 100644 index 00000000000..19cbd00ebe0 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/.gitignore @@ -0,0 +1 @@ +mg_utils/ diff --git a/benchmarks/cugraph/standalone/bulk_sampling/README.md b/benchmarks/cugraph/standalone/bulk_sampling/README.md index f48eea5c556..bb01133c52f 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/README.md +++ b/benchmarks/cugraph/standalone/bulk_sampling/README.md @@ -1,11 +1,13 @@ -# cuGraph Bulk Sampling +# cuGraph Sampling Benchmarks -## Overview +## cuGraph Bulk Sampling + +### Overview The `cugraph_bulk_sampling.py` script runs the bulk 
sampler for a variety of datasets, including both generated (rmat) datasets and disk (ogbn_papers100M, etc.) datasets. It can also load replicas of these datasets to create a larger benchmark (e.g., ogbn_papers100M x2). -## Arguments +### Arguments The script takes a variety of arguments to control sampling behavior. Required: --output_root @@ -51,14 +53,8 @@ Optional: Seed for random number generation. Defaults to '62' - --persist - Whether to aggressively use persist() in dask to make the ETL steps (NOT PART OF SAMPLING) faster. - Will probably make this script finish sooner at the expense of memory usage, but won't affect - sampling time. - Changing this is not recommended unless you know what you are doing. - Defaults to False. -## Input Format +### Input Format The script expects its input data in the following format: ``` @@ -103,7 +99,7 @@ the parquet files. It must have the following format: } ``` -## Output Meta +### Output Meta The script, in addition to the samples, will also output a file named `output_meta.json`. This file contains various statistics about the sampling run, including the runtime, as well as information about the dataset and system that the samples were produced from. @@ -111,6 +107,56 @@ as well as information about the dataset and system that the samples were produc This metadata file can be used to gather the results from the sampling and training stages together. -## Other Notes +### Other Notes For rmat datasets, you will need to generate your own bogus features in the training stage. Since that is trivial, that is not done in this sampling script. + +## cuGraph MNMG Training + +### Overview +The script `run_train_job.sh` runs with the `sbatch` command to launch a series of SLURM jobs. +First, for a given number of epochs, the script will produce samples for a given graph. +Then, the training process starts, where samples are loaded and training iterations are +processed. + +### Important Notes +Downloading the dataset files before running the SLURM jobs is highly recommended. Even though +the script will attempt to download the files if they are not available, this can often +lead to a timeout which will crash the scripts. This applies regardless of whether you are training +with native PyG or cuGraph-PyG. You can download the data as follows: + +``` +from ogb.nodeproppred import NodePropPredDataset +dataset = NodePropPredDataset('ogbn-papers100M', root='/home/username/datasets') +``` + +For datasets other than ogbn-papers100M, follow the same process, changing only the dataset name. +The dataset will be correctly preprocessed when you run training. If you have a slow system, you +can also run the preprocessing alone by running the training script on a single worker, which +avoids the timeout that would otherwise crash the script. + +The multi-GPU utilities are in `mg_utils` in the top level of the cuGraph repository. You should either +copy them to this directory or symlink to them before running the scripts. + +### Arguments +You will need to modify the bash scripts to run appropriately for your environment and +desired training workflow. The standard sbatch arguments, such as job name and queue, are at the +top of the script. These will need to be modified for your SLURM cluster. + +Next are arguments for the container image (required), +and the directories where the data and outputs are stored. The directories default to subdirectories +of the current working directory. But if there is a high-throughput storage system available, +using that storage for the samples and datasets is highly recommended. + +Next are standard GNN training arguments such as `FANOUT`, `BATCH_SIZE`, etc. You can also set +the number of training epochs here. These are followed by the `REPLICATION_FACTOR` argument, which +can be used to create replications of the dataset for scale testing purposes. + +The final two arguments are `FRAMEWORK`, which can be either "cuGraphPyG" or "PyG", and `GPUS_PER_NODE`, +which must be set to the correct value, even if this is provided by a SLURM argument. If `GPUS_PER_NODE` +is not set to the correct number of GPUs, the script will hang indefinitely until it times out. +Mismatched GPU counts per node are currently unsupported by this script but should be possible in practice.
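+
+To fail fast on a `GPUS_PER_NODE` mismatch, a pre-flight check can be added to the training
+entrypoint. The sketch below is illustrative only (it is not part of the provided scripts) and
+assumes `GPUS_PER_NODE` is exported into the job environment:
+
+```
+import os
+import torch
+
+# Hypothetical sanity check: compare the configured GPUS_PER_NODE value against
+# the number of GPUs actually visible to this process. A mismatch here is what
+# causes the indefinite hang described above.
+expected = int(os.environ["GPUS_PER_NODE"])
+visible = torch.cuda.device_count()
+if expected != visible:
+    raise RuntimeError(f"GPUS_PER_NODE={expected}, but {visible} GPUs are visible")
+```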
+### Output +The results of training will be written to the logs directory, with an `output.txt` file for each worker. +These will be overwritten upon each run. Accuracy is only reported on rank 0. \ No newline at end of file diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py new file mode 100644 index 00000000000..c9e347b261d --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py @@ -0,0 +1,251 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +os.environ["RAPIDS_NO_INITIALIZE"] = "1" +os.environ["CUDF_SPILL"] = "1" +os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO" +os.environ["KVIKIO_NTHREADS"] = "8" + +import argparse +import json +import warnings + +import torch +import numpy as np +import pandas + +import torch.distributed as dist + +from datasets import OGBNPapers100MDataset + +from cugraph.testing.mg_utils import enable_spilling + + +def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None: + import cupy + import rmm + from pynvml.smi import nvidia_smi + + smi = nvidia_smi.getInstance() + pool_size = 16e9 # FIXME calculate this + + rmm.reinitialize( + devices=[rank], + pool_allocator=True, + initial_pool_size=pool_size, + ) + + if use_rmm_torch_allocator: + warnings.warn( + "Using the rmm pytorch allocator is currently unsupported." + " The default allocator will be used instead."
+ ) + # FIXME somehow get the pytorch allocator to work + # from rmm.allocators.torch import rmm_torch_allocator + # torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + from rmm.allocators.cupy import rmm_cupy_allocator + + cupy.cuda.set_allocator(rmm_cupy_allocator) + + cupy.cuda.Device(rank).use() + torch.cuda.set_device(rank) + + # Pytorch training worker initialization + torch.distributed.init_process_group(backend="nccl") + + +def parse_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--gpus_per_node", + type=int, + default=8, + help="# GPUs per node", + required=False, + ) + + parser.add_argument( + "--num_epochs", + type=int, + default=1, + help="Number of training epochs", + required=False, + ) + + parser.add_argument( + "--batch_size", + type=int, + default=512, + help="Batch size", + required=False, + ) + + parser.add_argument( + "--fanout", + type=str, + default="10_10_10", + help="Fanout", + required=False, + ) + + parser.add_argument( + "--sample_dir", + type=str, + help="Directory with stored bulk samples (required for cuGraph run)", + required=False, + ) + + parser.add_argument( + "--output_file", + type=str, + help="File to store results", + required=True, + ) + + parser.add_argument( + "--framework", + type=str, + help="The framework to test (PyG, cuGraphPyG)", + required=True, + ) + + parser.add_argument( + "--model", + type=str, + default="GraphSAGE", + help="The model to use (currently only GraphSAGE supported)", + required=False, + ) + + parser.add_argument( + "--replication_factor", + type=int, + default=1, + help="The replication factor for the dataset", + required=False, + ) + + parser.add_argument( + "--dataset_dir", + type=str, + help="The directory where datasets are stored", + required=True, + ) + + parser.add_argument( + "--train_split", + type=float, + help="The percentage of the labeled data to use for training. 
The remainder is used for testing/validation.", + default=0.8, + required=False, + ) + + parser.add_argument( + "--val_split", + type=float, + help="The percentage of the testing/validation data to allocate for validation.", + default=0.5, + required=False, + ) + + return parser.parse_args() + + +def main(args): + import logging + + logging.basicConfig( + level=logging.INFO, + ) + logger = logging.getLogger("bench_cugraph_training") + logger.setLevel(logging.INFO) + + local_rank = int(os.environ["LOCAL_RANK"]) + global_rank = int(os.environ["RANK"]) + + init_pytorch_worker( + local_rank, use_rmm_torch_allocator=(args.framework == "cuGraph") + ) + enable_spilling() + print(f"worker initialized") + dist.barrier() + + world_size = int(os.environ["SLURM_JOB_NUM_NODES"]) * args.gpus_per_node + + dataset = OGBNPapers100MDataset( + replication_factor=args.replication_factor, + dataset_dir=args.dataset_dir, + train_split=args.train_split, + val_split=args.val_split, + load_edge_index=(args.framework == "PyG"), + ) + + if global_rank == 0: + dataset.download() + dist.barrier() + + fanout = [int(f) for f in args.fanout.split("_")] + + if args.framework == "PyG": + from trainers.pyg import PyGNativeTrainer + + trainer = PyGNativeTrainer( + model=args.model, + dataset=dataset, + device=local_rank, + rank=global_rank, + world_size=world_size, + num_epochs=args.num_epochs, + shuffle=True, + replace=False, + num_neighbors=fanout, + batch_size=args.batch_size, + ) + elif args.framework == "cuGraphPyG": + sample_dir = os.path.join( + args.sample_dir, + f"ogbn_papers100M[{args.replication_factor}]_b{args.batch_size}_f{fanout}", + ) + from trainers.pyg import PyGCuGraphTrainer + + trainer = PyGCuGraphTrainer( + model=args.model, + dataset=dataset, + sample_dir=sample_dir, + device=local_rank, + rank=global_rank, + world_size=world_size, + num_epochs=args.num_epochs, + shuffle=True, + replace=False, + num_neighbors=fanout, + batch_size=args.batch_size, + ) + else: + raise ValueError("unsupported framework") + + logger.info(f"Trainer ready on rank {global_rank}") + stats = trainer.train() + logger.info(stats) + + with open(f"{args.output_file}[{global_rank}]", "w") as f: + json.dump(stats, f) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh b/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh deleted file mode 100755 index e62cb3cda29..00000000000 --- a/benchmarks/cugraph/standalone/bulk_sampling/bulk_sampling.sh +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -export RAPIDS_NO_INITIALIZE="1" -export CUDF_SPILL="1" -export LIBCUDF_CUFILE_POLICY=OFF - - -dataset_name=$1 -dataset_root=$2 -output_root=$3 -batch_sizes=$4 -fanouts=$5 -reverse_edges=$6 - -rm -rf $output_root -mkdir -p $output_root - -# Change to 2 in Selene -gpu_per_replica=4 -#--add_edge_ids \ - -# Expand to 1, 4, 8 in Selene -for i in 1,2,3,4: -do - for replication in 2; - do - dataset_name_with_replication="${dataset_name}[${replication}]" - dask_worker_devices=$(seq -s, 0 $((gpu_per_replica*replication-1))) - echo "Sampling dataset = $dataset_name_with_replication on devices = $dask_worker_devices" - python3 cugraph_bulk_sampling.py --datasets $dataset_name_with_replication \ - --dataset_root $dataset_root \ - --batch_sizes $batch_sizes \ - --output_root $output_root \ - --dask_worker_devices $dask_worker_devices \ - --fanouts $fanouts \ - --batch_sizes $batch_sizes \ - --reverse_edges - done -done \ No newline at end of file diff --git a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py index 1ca5d6db637..e3a5bba3162 100644 --- a/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/cugraph_bulk_sampling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -28,7 +28,7 @@ ) from cugraph.structure.symmetrize import symmetrize -from cugraph.experimental.gnn import BulkSampler +from cugraph.gnn import BulkSampler import cugraph @@ -97,19 +97,15 @@ def symmetrize_ddf(dask_dataframe): return new_ddf -def renumber_ddf(dask_df, persist=False): +def renumber_ddf(dask_df): vertices = ( dask_cudf.concat([dask_df["src"], dask_df["dst"]]) .unique() .reset_index(drop=True) ) - if persist: - vertices = vertices.persist() vertices.name = "v" vertices = vertices.reset_index().set_index("v").rename(columns={"index": "m"}) - if persist: - vertices = vertices.persist() src = dask_df.merge(vertices, left_on="src", right_on="v", how="left").m.rename( "src" @@ -170,7 +166,7 @@ def _replicate_df( if replication_factor > 1: for r in range(1, replication_factor): - df_replicated = original_df + df_replicated = original_df.copy() for col, offset in col_item_counts.items(): df_replicated[col] += offset * r @@ -189,46 +185,75 @@ def sample_graph( seeds_per_call=400000, batches_per_partition=100, fanout=[5, 5, 5], + num_epochs=1, + train_perc=0.8, + val_perc=0.5, sampling_kwargs={}, ): cupy.random.seed(seed) - - sampler = BulkSampler( - batch_size=batch_size, - output_path=output_path, - graph=G, - fanout_vals=fanout, - with_replacement=False, - random_state=seed, - seeds_per_call=seeds_per_call, - batches_per_partition=batches_per_partition, - log_level=logging.INFO, - **sampling_kwargs, + train_df, test_df = label_df.random_split( + [train_perc, 1 - train_perc], random_state=seed, shuffle=True + ) + val_df, test_df = test_df.random_split( + [val_perc, 1 - val_perc], random_state=seed, shuffle=True ) - n_workers = len(default_client().scheduler_info()["workers"]) + total_time = 0.0 + for epoch in range(num_epochs): + steps = [("train", train_df), ("test", test_df)] + if epoch == num_epochs - 1: + steps.append(("val", val_df)) - meta = cudf.DataFrame( - {"node": cudf.Series(dtype="int64"), "batch": 
cudf.Series(dtype="int32")} - ) + for step, batch_df in steps: + batch_df = batch_df.sample(frac=1.0, random_state=seed) - batch_df = label_df.map_partitions( - _make_batch_ids, batch_size, n_workers, meta=meta - ) - # batch_df = batch_df.sort_values(by='node') + if step == "val": + output_sample_path = os.path.join(output_path, "val", "samples") + else: + output_sample_path = os.path.join( + output_path, f"epoch={epoch}", f"{step}", "samples" + ) + os.makedirs(output_sample_path) + + sampler = BulkSampler( + batch_size=batch_size, + output_path=output_sample_path, + graph=G, + fanout_vals=fanout, + with_replacement=False, + random_state=seed, + seeds_per_call=seeds_per_call, + batches_per_partition=batches_per_partition, + log_level=logging.INFO, + **sampling_kwargs, + ) - # should always persist the batch dataframe or performance may be suboptimal - batch_df = batch_df.persist() + n_workers = len(default_client().scheduler_info()["workers"]) - del label_df - print("created batches") + meta = cudf.DataFrame( + { + "node": cudf.Series(dtype="int64"), + "batch": cudf.Series(dtype="int32"), + } + ) + + batch_df = batch_df.map_partitions( + _make_batch_ids, batch_size, n_workers, meta=meta + ) + + # should always persist the batch dataframe or performance may be suboptimal + batch_df = batch_df.persist() + + print("created batches") - start_time = perf_counter() - sampler.add_batches(batch_df, start_col_name="node", batch_col_name="batch") - sampler.flush() - end_time = perf_counter() - print("flushed all batches") - return end_time - start_time + start_time = perf_counter() + sampler.add_batches(batch_df, start_col_name="node", batch_col_name="batch") + sampler.flush() + end_time = perf_counter() + print("flushed all batches") + total_time += end_time - start_time + + return total_time def assign_offsets_pyg(node_counts: Dict[str, int], replication_factor: int = 1): @@ -253,7 +278,6 @@ def generate_rmat_dataset( labeled_percentage=0.01, num_labels=256, reverse_edges=False, - persist=False, add_edge_types=False, ): """ @@ -282,12 +306,8 @@ def generate_rmat_dataset( dask_edgelist_df = dask_edgelist_df.reset_index(drop=True) dask_edgelist_df = renumber_ddf(dask_edgelist_df).persist() - if persist: - dask_edgelist_df = dask_edgelist_df.persist() dask_edgelist_df = symmetrize_ddf(dask_edgelist_df).persist() - if persist: - dask_edgelist_df = dask_edgelist_df.persist() if add_edge_types: dask_edgelist_df["etp"] = cupy.int32( @@ -329,7 +349,6 @@ def load_disk_dataset( dataset_dir=".", reverse_edges=True, replication_factor=1, - persist=False, add_edge_types=False, ): from pathlib import Path @@ -363,8 +382,6 @@ def load_disk_dataset( ] edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type] - if persist: - edge_index_dict = edge_index_dict.persist() if replication_factor > 1: edge_index_dict[can_edge_type] = edge_index_dict[ @@ -384,11 +401,6 @@ def load_disk_dataset( ), ) - if persist: - edge_index_dict[can_edge_type] = edge_index_dict[ - can_edge_type - ].persist() - gc.collect() if reverse_edges: @@ -396,9 +408,6 @@ def load_disk_dataset( columns={"src": "dst", "dst": "src"} ) - if persist: - edge_index_dict[can_edge_type] = edge_index_dict[can_edge_type].persist() - # Assign numeric edge type ids based on lexicographic order edge_offsets = {} edge_count = 0 @@ -410,9 +419,6 @@ def load_disk_dataset( all_edges_df = dask_cudf.concat(list(edge_index_dict.values())) - if persist: - all_edges_df = all_edges_df.persist() - del edge_index_dict gc.collect() @@ -440,15 +446,9 @@ def 
load_disk_dataset( meta=cudf.DataFrame({"node": cudf.Series(dtype="int64")}), ) - if persist: - node_labels[node_type] = node_labels[node_type].persist() - gc.collect() - node_labels_df = dask_cudf.concat(list(node_labels.values())) - - if persist: - node_labels_df = node_labels_df.persist() + node_labels_df = dask_cudf.concat(list(node_labels.values())).reset_index(drop=True) del node_labels gc.collect() @@ -475,8 +475,8 @@ def benchmark_cugraph_bulk_sampling( replication_factor=1, num_labels=256, labeled_percentage=0.001, - persist=False, add_edge_types=False, + num_epochs=1, ): """ Entry point for the benchmark. @@ -506,14 +506,17 @@ def benchmark_cugraph_bulk_sampling( labeled_percentage: float The percentage of the data that is labeled (only for rmat datasets) Defaults to 0.001 to match papers100M - persist: bool - Whether to aggressively persist data in dask in attempt to speed up ETL. - Defaults to False. add_edge_types: bool Whether to add edge types to the edgelist. Defaults to False. + sampling_target_framework: str + The framework to sample for. + num_epochs: int + The number of epochs to sample for. """ - print(dataset) + + logger = logging.getLogger("__main__") + logger.info(str(dataset)) if dataset[0:4] == "rmat": ( dask_edgelist_df, @@ -527,7 +530,6 @@ def benchmark_cugraph_bulk_sampling( seed=seed, labeled_percentage=labeled_percentage, num_labels=num_labels, - persist=persist, add_edge_types=add_edge_types, ) @@ -543,28 +545,25 @@ def benchmark_cugraph_bulk_sampling( dataset_dir=dataset_dir, reverse_edges=reverse_edges, replication_factor=replication_factor, - persist=persist, add_edge_types=add_edge_types, ) num_input_edges = len(dask_edgelist_df) - print(f"Number of input edges = {num_input_edges:,}") + logger.info(f"Number of input edges = {num_input_edges:,}") G = construct_graph(dask_edgelist_df) del dask_edgelist_df - print("constructed graph") + logger.info("constructed graph") input_memory = G.edgelist.edgelist_df.memory_usage().sum().compute() - print(f"input memory: {input_memory}") + logger.info(f"input memory: {input_memory}") output_subdir = os.path.join( - output_path, f"{dataset}[{replication_factor}]_b{batch_size}_f{fanout}" + output_path, + f"{dataset}[{replication_factor}]_b{batch_size}_f{fanout}", ) os.makedirs(output_subdir) - output_sample_path = os.path.join(output_subdir, "samples") - os.makedirs(output_sample_path) - if sampling_target_framework == "cugraph_dgl_csr": sampling_kwargs = { "deduplicate_sources": True, @@ -587,11 +586,12 @@ def benchmark_cugraph_bulk_sampling( "include_hop_column": True, } - batches_per_partition = 400_000 // batch_size + batches_per_partition = 600_000 // batch_size execution_time, allocation_counts = sample_graph( G=G, label_df=dask_label_df, - output_path=output_sample_path, + output_path=output_subdir, + num_epochs=num_epochs, seed=seed, batch_size=batch_size, seeds_per_call=seeds_per_call, @@ -620,8 +620,8 @@ def benchmark_cugraph_bulk_sampling( with open(os.path.join(output_subdir, "output_meta.json"), "w") as f: json.dump(output_meta, f, indent="\t") - print("allocation counts b:") - print(allocation_counts.values()) + logger.info("allocation counts b:") + logger.info(allocation_counts.values()) ( input_to_peak_ratio, @@ -631,8 +631,8 @@ def benchmark_cugraph_bulk_sampling( ) = get_memory_statistics( allocation_counts=allocation_counts, input_memory=input_memory ) - print(f"Number of edges in final graph = {G.number_of_edges():,}") - print("-" * 80) + logger.info(f"Number of edges in final graph = 
{G.number_of_edges():,}") + logger.info("-" * 80) return ( num_input_edges, input_to_peak_ratio, @@ -693,12 +693,20 @@ def get_args(): required=True, ) + parser.add_argument( + "--num_epochs", + type=int, + help="Number of epochs to run for", + required=False, + default=1, + ) + parser.add_argument( "--fanouts", type=str, - help="Comma separated list of fanouts (i.e. 10_25,5_5_5)", + help='Comma separated list of fanouts (i.e. "10_25,5_5_5")', required=False, - default="10_25", + default="10_10_10", ) parser.add_argument( @@ -743,28 +751,14 @@ def get_args(): "--random_seed", type=int, help="Random seed", required=False, default=62 ) - parser.add_argument( - "--persist", - action="store_true", - help="Will add additional persist() calls to speed up ETL. Does not affect sampling runtime.", - required=False, - default=False, - ) - - parser.add_argument( - "--add_edge_types", - action="store_true", - help="Adds edge types to the edgelist. Required for PyG if not providing edge ids.", - required=False, - default=False, - ) - return parser.parse_args() # call __main__ function if __name__ == "__main__": logging.basicConfig() + logger = logging.getLogger("__main__") + logger.setLevel(logging.INFO) args = get_args() if args.sampling_target_framework not in ["cugraph_dgl_csr", None]: @@ -781,29 +775,28 @@ def get_args(): seeds_per_call_opts = [int(s) for s in args.seeds_per_call_opts.split(",")] dask_worker_devices = [int(d) for d in args.dask_worker_devices.split(",")] - client, cluster = start_dask_client( - dask_worker_devices=dask_worker_devices, - jit_unspill=False, - rmm_pool_size=28e9, - rmm_async=True, - ) + logger.info("starting dask client") + client, cluster = start_dask_client() enable_spilling() stats_ls = [] client.run(enable_spilling) + logger.info("dask client started") for dataset in datasets: - if re.match(r"([A-z]|[0-9])+\[[0-9]+\]", dataset): - replication_factor = int(dataset[-2]) - dataset = dataset[:-3] + m = re.match(r"(\w+)\[([0-9]+)\]", dataset) + if m: + replication_factor = int(m.groups()[1]) + dataset = m.groups()[0] else: replication_factor = 1 for fanout in fanouts: for batch_size in batch_sizes: for seeds_per_call in seeds_per_call_opts: - print(f"dataset: {dataset}") - print(f"batch size: {batch_size}") - print(f"fanout: {fanout}") - print(f"seeds_per_call: {seeds_per_call}") + logger.info(f"dataset: {dataset}") + logger.info(f"batch size: {batch_size}") + logger.info(f"fanout: {fanout}") + logger.info(f"seeds_per_call: {seeds_per_call}") + logger.info(f"num epochs: {args.num_epochs}") try: stats_d = {} @@ -816,6 +809,7 @@ def get_args(): ) = benchmark_cugraph_bulk_sampling( dataset=dataset, output_path=args.output_root, + num_epochs=args.num_epochs, seed=args.random_seed, batch_size=batch_size, seeds_per_call=seeds_per_call, @@ -824,8 +818,6 @@ def get_args(): dataset_dir=args.dataset_root, reverse_edges=args.reverse_edges, replication_factor=replication_factor, - persist=args.persist, - add_edge_types=args.add_edge_types, ) stats_d["dataset"] = dataset stats_d["num_input_edges"] = num_input_edges diff --git a/python/nx-cugraph/setup.py b/benchmarks/cugraph/standalone/bulk_sampling/datasets/__init__.py similarity index 72% rename from python/nx-cugraph/setup.py rename to benchmarks/cugraph/standalone/bulk_sampling/datasets/__init__.py index c4ab535923b..0f4b516cd80 100644 --- a/python/nx-cugraph/setup.py +++ b/benchmarks/cugraph/standalone/bulk_sampling/datasets/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,9 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from setuptools import find_packages, setup -packages = find_packages(include=["nx_cugraph*"]) -setup( - package_data={key: ["VERSION"] for key in packages}, -) +from .dataset import Dataset +from .ogbn_papers100M import OGBNPapers100MDataset diff --git a/benchmarks/cugraph/standalone/bulk_sampling/datasets/dataset.py b/benchmarks/cugraph/standalone/bulk_sampling/datasets/dataset.py new file mode 100644 index 00000000000..f914f69fa4e --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/datasets/dataset.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from typing import Dict, Tuple + + +class Dataset: + @property + def edge_index_dict(self) -> Dict[Tuple[str, str, str], Dict[str, torch.Tensor]]: + raise NotImplementedError() + + @property + def x_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + @property + def y_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + @property + def train_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + @property + def test_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + @property + def val_dict(self) -> Dict[str, torch.Tensor]: + raise NotImplementedError() + + @property + def num_input_features(self) -> int: + raise NotImplementedError() + + @property + def num_labels(self) -> int: + raise NotImplementedError() + + def num_nodes(self, node_type: str) -> int: + raise NotImplementedError() + + def num_edges(self, edge_type: Tuple[str, str, str]) -> int: + raise NotImplementedError() diff --git a/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py b/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py new file mode 100644 index 00000000000..a50e40f6d55 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/datasets/ogbn_papers100M.py @@ -0,0 +1,345 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
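+#
+# NOTE (added for clarity): download() below writes the following layout under
+# <dataset_dir>/ogbn_papers100M/, inferred from the code in this file:
+#   meta.json: node and edge counts
+#   npy/paper/node_feat.npy: node features (node_feat_<N>x.npy when replicated)
+#   npy/paper__cites__paper/edge_index.npy: edge index as a numpy array
+#   parquet/paper__cites__paper/edge_index.parquet: edge index in parquet form
+#   parquet/paper/node_label.parquet: labels for the labeled subset of nodes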
+
+from .dataset import Dataset
+from typing import Dict, Tuple, Union
+
+import pandas
+import torch
+import numpy as np
+
+from sklearn.model_selection import train_test_split
+
+import gc
+import os
+import json
+
+
+class OGBNPapers100MDataset(Dataset):
+    def __init__(
+        self,
+        *,
+        replication_factor=1,
+        dataset_dir=".",
+        train_split=0.8,
+        val_split=0.5,
+        load_edge_index=True,
+    ):
+        self.__replication_factor = replication_factor
+        self.__disk_x = None
+        self.__y = None
+        self.__edge_index = None
+        self.__dataset_dir = dataset_dir
+        self.__train_split = train_split
+        self.__val_split = val_split
+        self.__load_edge_index = load_edge_index
+        # Initialize the split masks so the train/test/val properties can
+        # safely check for None before the labels have been loaded.
+        self.__train = None
+        self.__test = None
+        self.__val = None
+
+    def download(self):
+        import logging
+
+        logger = logging.getLogger("OGBNPapers100MDataset")
+        logger.info("Processing dataset...")
+
+        dataset_path = os.path.join(self.__dataset_dir, "ogbn_papers100M")
+
+        dataset = None
+        if not os.path.exists(dataset_path):
+            from ogb.nodeproppred import NodePropPredDataset
+
+            dataset = NodePropPredDataset(
+                name="ogbn-papers100M", root=self.__dataset_dir
+            )
+
+        # meta.json is written after the download check above so that the
+        # dataset directory is guaranteed to exist on a fresh run.
+        meta_json_path = os.path.join(dataset_path, "meta.json")
+        if not os.path.exists(meta_json_path):
+            j = {
+                "num_nodes": {"paper": 111059956},
+                "num_edges": {"paper__cites__paper": 1615685872},
+            }
+            with open(meta_json_path, "w") as file:
+                json.dump(j, file)
+
+        features_path = os.path.join(dataset_path, "npy", "paper")
+        os.makedirs(features_path, exist_ok=True)
+
+        logger.info("Processing node features...")
+        if self.__replication_factor == 1:
+            replication_path = os.path.join(features_path, "node_feat.npy")
+        else:
+            replication_path = os.path.join(
+                features_path, f"node_feat_{self.__replication_factor}x.npy"
+            )
+        if not os.path.exists(replication_path):
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+
+            node_feat = dataset[0][0]["node_feat"]
+            if self.__replication_factor != 1:
+                node_feat_replicated = np.concatenate(
+                    [node_feat] * self.__replication_factor
+                )
+                node_feat = node_feat_replicated
+            np.save(replication_path, node_feat)
+
+        logger.info("Processing edge index...")
+        edge_index_parquet_path = os.path.join(
+            dataset_path, "parquet", "paper__cites__paper"
+        )
+        os.makedirs(edge_index_parquet_path, exist_ok=True)
+
+        edge_index_parquet_file_path = os.path.join(
+            edge_index_parquet_path, "edge_index.parquet"
+        )
+        if not os.path.exists(edge_index_parquet_file_path):
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+
+            edge_index = dataset[0][0]["edge_index"]
+            eidf = pandas.DataFrame({"src": edge_index[0], "dst": edge_index[1]})
+            eidf.to_parquet(edge_index_parquet_file_path)
+
+        edge_index_npy_path = os.path.join(dataset_path, "npy", "paper__cites__paper")
+        os.makedirs(edge_index_npy_path, exist_ok=True)
+
+        edge_index_npy_file_path = os.path.join(edge_index_npy_path, "edge_index.npy")
+        if not os.path.exists(edge_index_npy_file_path):
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+
+            edge_index = dataset[0][0]["edge_index"]
+            np.save(edge_index_npy_file_path, edge_index)
+
+        logger.info("Processing labels...")
+        node_label_path = os.path.join(dataset_path, "parquet", "paper")
+        os.makedirs(node_label_path, exist_ok=True)
+
+        node_label_file_path = os.path.join(node_label_path,
"node_label.parquet") + if not os.path.exists(node_label_file_path): + if dataset is None: + from ogb.nodeproppred import NodePropPredDataset + + dataset = NodePropPredDataset( + name="ogbn-papers100M", root=self.__dataset_dir + ) + + ldf = pandas.Series(dataset[0][1].T[0]) + ldf = ( + ldf[ldf >= 0] + .reset_index() + .rename(columns={"index": "node", 0: "label"}) + ) + ldf.to_parquet(node_label_file_path) + + @property + def edge_index_dict( + self, + ) -> Dict[Tuple[str, str, str], Union[Dict[str, torch.Tensor], int]]: + import logging + + logger = logging.getLogger("OGBNPapers100MDataset") + + if self.__edge_index is None: + if self.__load_edge_index: + npy_path = os.path.join( + self.__dataset_dir, + "ogbn_papers100M", + "npy", + "paper__cites__paper", + "edge_index.npy", + ) + + logger.info(f"loading edge index from {npy_path}") + ei = np.load(npy_path, mmap_mode="r") + ei = torch.as_tensor(ei) + ei = { + "src": ei[1], + "dst": ei[0], + } + + logger.info("sorting edge index...") + ei["dst"], ix = torch.sort(ei["dst"]) + ei["src"] = ei["src"][ix] + del ix + gc.collect() + + logger.info("processing replications...") + orig_num_nodes = self.num_nodes("paper") // self.__replication_factor + if self.__replication_factor > 1: + orig_src = ei["src"].clone().detach() + orig_dst = ei["dst"].clone().detach() + for r in range(1, self.__replication_factor): + ei["src"] = torch.concat( + [ + ei["src"], + orig_src + int(r * orig_num_nodes), + ] + ) + + ei["dst"] = torch.concat( + [ + ei["dst"], + orig_dst + int(r * orig_num_nodes), + ] + ) + + del orig_src + del orig_dst + + ei["src"] = ei["src"].contiguous() + ei["dst"] = ei["dst"].contiguous() + gc.collect() + + logger.info(f"# edges: {len(ei['src'])}") + self.__edge_index = {("paper", "cites", "paper"): ei} + else: + self.__edge_index = { + ("paper", "cites", "paper"): self.num_edges( + ("paper", "cites", "paper") + ) + } + + return self.__edge_index + + @property + def x_dict(self) -> Dict[str, torch.Tensor]: + node_type_path = os.path.join( + self.__dataset_dir, "ogbn_papers100M", "npy", "paper" + ) + + if self.__disk_x is None: + if self.__replication_factor == 1: + full_path = os.path.join(node_type_path, "node_feat.npy") + else: + full_path = os.path.join( + node_type_path, f"node_feat_{self.__replication_factor}x.npy" + ) + + self.__disk_x = {"paper": np.load(full_path, mmap_mode="r")} + + return self.__disk_x + + @property + def y_dict(self) -> Dict[str, torch.Tensor]: + if self.__y is None: + self.__get_labels() + + return self.__y + + @property + def train_dict(self) -> Dict[str, torch.Tensor]: + if self.__train is None: + self.__get_labels() + return self.__train + + @property + def test_dict(self) -> Dict[str, torch.Tensor]: + if self.__test is None: + self.__get_labels() + return self.__test + + @property + def val_dict(self) -> Dict[str, torch.Tensor]: + if self.__val is None: + self.__get_labels() + return self.__val + + @property + def num_input_features(self) -> int: + return int(self.x_dict["paper"].shape[1]) + + @property + def num_labels(self) -> int: + return int(self.y_dict["paper"].max()) + 1 + + def num_nodes(self, node_type: str) -> int: + if node_type != "paper": + raise ValueError(f"Invalid node type {node_type}") + + return 111_059_956 * self.__replication_factor + + def num_edges(self, edge_type: Tuple[str, str, str]) -> int: + if edge_type != ("paper", "cites", "paper"): + raise ValueError(f"Invalid edge type {edge_type}") + + return 1_615_685_872 * self.__replication_factor + + def __get_labels(self): + label_path = 
os.path.join( + self.__dataset_dir, + "ogbn_papers100M", + "parquet", + "paper", + "node_label.parquet", + ) + + node_label = pandas.read_parquet(label_path) + + if self.__replication_factor > 1: + orig_num_nodes = self.num_nodes("paper") // self.__replication_factor + dfr = pandas.DataFrame( + { + "node": pandas.concat( + [ + node_label.node + (r * orig_num_nodes) + for r in range(1, self.__replication_factor) + ] + ), + "label": pandas.concat( + [node_label.label for r in range(1, self.__replication_factor)] + ), + } + ) + node_label = pandas.concat([node_label, dfr]).reset_index(drop=True) + + num_nodes = self.num_nodes("paper") + node_label_tensor = torch.full( + (num_nodes,), -1, dtype=torch.float32, device="cpu" + ) + node_label_tensor[ + torch.as_tensor(node_label.node.values, device="cpu") + ] = torch.as_tensor(node_label.label.values, device="cpu") + + self.__y = {"paper": node_label_tensor.contiguous()} + + train_ix, test_val_ix = train_test_split( + torch.as_tensor(node_label.node.values), + train_size=self.__train_split, + random_state=num_nodes, + ) + test_ix, val_ix = train_test_split( + test_val_ix, test_size=self.__val_split, random_state=num_nodes + ) + + train_tensor = torch.full((num_nodes,), 0, dtype=torch.bool, device="cpu") + train_tensor[train_ix] = 1 + self.__train = {"paper": train_tensor} + + test_tensor = torch.full((num_nodes,), 0, dtype=torch.bool, device="cpu") + test_tensor[test_ix] = 1 + self.__test = {"paper": test_tensor} + + val_tensor = torch.full((num_nodes,), 0, dtype=torch.bool, device="cpu") + val_tensor[val_ix] = 1 + self.__val = {"paper": val_tensor} diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/models/__init__.py new file mode 100644 index 00000000000..c2002fd3fb9 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/models/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/__init__.py new file mode 100644 index 00000000000..337cb0fa243 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
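`__get_labels` above builds a dense, node-aligned label tensor (with `-1` marking unlabeled nodes) plus boolean train/test/val masks, splitting only the labeled nodes with `train_test_split`. A condensed sketch of the same pattern on a toy graph (all sizes and values here are invented for illustration):

```python
# Condensed sketch of the label/mask construction used by __get_labels.
import torch
from sklearn.model_selection import train_test_split

num_nodes = 20
labeled = torch.arange(0, 20, 2)                    # the 10 nodes with labels
labels = torch.arange(10, dtype=torch.float32) % 2  # their label values

# Dense per-node label tensor; -1 marks unlabeled nodes.
y = torch.full((num_nodes,), -1, dtype=torch.float32)
y[labeled] = labels

# Split only the labeled nodes, then scatter the index splits into
# node-aligned boolean masks so any per-node tensor can be masked directly.
train_ix, test_val_ix = train_test_split(labeled, train_size=0.8, random_state=42)
test_ix, val_ix = train_test_split(test_val_ix, test_size=0.5, random_state=42)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_ix] = True
print(y, train_mask.sum().item())  # 8 of the 10 labeled nodes land in train
```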
+ +from .models_cugraph_pyg import CuGraphSAGE +from .models_pyg import GraphSAGE diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py new file mode 100644 index 00000000000..1de791bf588 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_cugraph_pyg.py @@ -0,0 +1,78 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from cugraph_pyg.nn.conv import SAGEConv as CuGraphSAGEConv + +try: + from torch_geometric.utils.trim_to_layer import TrimToLayer +except ModuleNotFoundError: + from torch_geometric.utils._trim_to_layer import TrimToLayer + +import torch.nn as nn +import torch.nn.functional as F + + +def extend_tensor(t: torch.Tensor, l: int): + return torch.concat([t, torch.zeros(l - len(t), dtype=t.dtype, device=t.device)]) + + +class CuGraphSAGE(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers): + super().__init__() + + self.convs = torch.nn.ModuleList() + self.convs.append(CuGraphSAGEConv(in_channels, hidden_channels, aggr="mean")) + for _ in range(num_layers - 2): + conv = CuGraphSAGEConv(hidden_channels, hidden_channels, aggr="mean") + self.convs.append(conv) + + self.convs.append(CuGraphSAGEConv(hidden_channels, out_channels, aggr="mean")) + + self._trim = TrimToLayer() + + def forward(self, x, edge, num_sampled_nodes, num_sampled_edges): + if isinstance(edge, torch.Tensor): + edge = list( + CuGraphSAGEConv.to_csc( + edge.cuda(), (x.shape[0], num_sampled_nodes.sum()) + ) + ) + else: + edge = edge.csr() + edge = [edge[1], edge[0], x.shape[0]] + + x = x.cuda().to(torch.float32) + + for i, conv in enumerate(self.convs): + if i > 0: + new_num_edges = edge[1][-2] + edge[0] = edge[0].narrow( + dim=0, + start=0, + length=new_num_edges, + ) + edge[1] = edge[1].narrow( + dim=0, start=0, length=edge[1].size(0) - num_sampled_nodes[-i - 1] + ) + edge[2] = x.shape[0] + + x = conv(x, edge) + + x = F.relu(x) + x = F.dropout(x, p=0.5) + + x = x.narrow(dim=0, start=0, length=num_sampled_nodes[0]) + + return x diff --git a/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_pyg.py new file mode 100644 index 00000000000..37f98d5362d --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/models/pyg/models_pyg.py @@ -0,0 +1,58 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from torch_geometric.nn import SAGEConv + +try: + from torch_geometric.utils.trim_to_layer import TrimToLayer +except ModuleNotFoundError: + from torch_geometric.utils._trim_to_layer import TrimToLayer + +import torch.nn as nn +import torch.nn.functional as F + + +class GraphSAGE(nn.Module): + def __init__(self, in_channels, hidden_channels, out_channels, num_layers): + super().__init__() + + self.convs = torch.nn.ModuleList() + self.convs.append(SAGEConv(in_channels, hidden_channels, aggr="mean")) + for _ in range(num_layers - 2): + conv = SAGEConv(hidden_channels, hidden_channels, aggr="mean") + self.convs.append(conv) + + self.convs.append(SAGEConv(hidden_channels, out_channels, aggr="mean")) + + self._trim = TrimToLayer() + + def forward(self, x, edge, num_sampled_nodes, num_sampled_edges): + edge = edge.cuda() + x = x.cuda().to(torch.float32) + + for i, conv in enumerate(self.convs): + x, edge, _ = self._trim( + i, num_sampled_nodes, num_sampled_edges, x, edge, None + ) + + s = x.shape[0] + x = conv(x, edge, size=(s, s)) + x = F.relu(x) + x = F.dropout(x, p=0.5) + + x = x.narrow(dim=0, start=0, length=x.shape[0] - num_sampled_nodes[1]) + + # assert x.shape[0] == num_sampled_nodes[0] + return x diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh b/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh new file mode 100644 index 00000000000..41792c0b63a --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh @@ -0,0 +1,111 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +conda init +source ~/.bashrc +conda activate rapids + +BATCH_SIZE=$1 +FANOUT=$2 +REPLICATION_FACTOR=$3 +SCRIPTS_DIR=$4 +NUM_EPOCHS=$5 + +SAMPLES_DIR=/samples +DATASET_DIR=/datasets +LOGS_DIR=/logs + +MG_UTILS_DIR=${SCRIPTS_DIR}/mg_utils +SCHEDULER_FILE=${MG_UTILS_DIR}/dask_scheduler.json + +export WORKER_RMM_POOL_SIZE=28G +export UCX_MAX_RNDV_RAILS=1 +export RAPIDS_NO_INITIALIZE=1 +export CUDF_SPILL=1 +export LIBCUDF_CUFILE_POLICY="OFF" +export GPUS_PER_NODE=8 + +export SCHEDULER_FILE=$SCHEDULER_FILE +export LOGS_DIR=$LOGS_DIR + +function handleTimeout { + seconds=$1 + eval "timeout --signal=2 --kill-after=60 $*" + LAST_EXITCODE=$? 
+ if (( $LAST_EXITCODE == 124 )); then + logger "ERROR: command timed out after ${seconds} seconds" + elif (( $LAST_EXITCODE == 137 )); then + logger "ERROR: command timed out after ${seconds} seconds, and had to be killed with signal 9" + fi + ERRORCODE=$((ERRORCODE | ${LAST_EXITCODE})) +} + +DASK_STARTUP_ERRORCODE=0 +if [[ $SLURM_NODEID == 0 ]]; then + ${MG_UTILS_DIR}/run-dask-process.sh scheduler workers & +else + ${MG_UTILS_DIR}/run-dask-process.sh workers & +fi + +echo "properly waiting for workers to connect" +NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))") +handleTimeout 120 python ${MG_UTILS_DIR}/wait_for_workers.py \ + --num-expected-workers ${NUM_GPUS} \ + --scheduler-file-path ${SCHEDULER_FILE} + + +DASK_STARTUP_ERRORCODE=$LAST_EXITCODE + +echo $SLURM_NODEID +if [[ $SLURM_NODEID == 0 ]]; then + echo "Launching Python Script" + python ${SCRIPTS_DIR}/cugraph_bulk_sampling.py \ + --output_root ${SAMPLES_DIR} \ + --dataset_root ${DATASET_DIR} \ + --datasets "ogbn_papers100M["$REPLICATION_FACTOR"]" \ + --fanouts $FANOUT \ + --batch_sizes $BATCH_SIZE \ + --seeds_per_call_opts "524288" \ + --num_epochs $NUM_EPOCHS \ + --random_seed 42 + + echo "DONE" > ${SAMPLES_DIR}/status.txt +fi + +while [ ! -f "${SAMPLES_DIR}"/status.txt ] +do + sleep 1 +done + +sleep 3 + +# At this stage there should be no running processes except /usr/lpp/mmfs/bin/mmsysmon.py +dask_processes=$(pgrep -la dask) +python_processes=$(pgrep -la python) +echo "$dask_processes" +echo "$python_processes" + +if [[ ${#python_processes[@]} -gt 1 || $dask_processes ]]; then + logger "The client was not shutdown properly, killing dask/python processes for Node $SLURM_NODEID" + # This can be caused by a job timeout + pkill python + pkill dask + pgrep -la python + pgrep -la dask +fi +sleep 2 + +if [[ $SLURM_NODEID == 0 ]]; then + rm ${SAMPLES_DIR}/status.txt +fi \ No newline at end of file diff --git a/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh b/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh new file mode 100755 index 00000000000..977745a9593 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/run_train_job.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
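`run_sampling.sh` blocks until every expected GPU worker has joined the Dask cluster before launching the sampling script, by running `mg_utils/wait_for_workers.py` under a 120-second timeout. That helper is not part of this diff, so the following is only a sketch of what such a wait loop could look like, assuming nothing beyond the standard `distributed.Client` scheduler-file API:

```python
# Hypothetical stand-in for mg_utils/wait_for_workers.py (not the real file):
# poll the scheduler until the expected worker count is reached, exiting
# nonzero on timeout so the calling shell's error handling kicks in.
import argparse
import sys
import time

from distributed import Client

parser = argparse.ArgumentParser()
parser.add_argument("--num-expected-workers", type=int, required=True)
parser.add_argument("--scheduler-file-path", type=str, required=True)
parser.add_argument("--timeout", type=int, default=120)
args = parser.parse_args()

client = Client(scheduler_file=args.scheduler_file_path)
deadline = time.time() + args.timeout
while time.time() < deadline:
    num_workers = len(client.scheduler_info()["workers"])
    if num_workers >= args.num_expected_workers:
        sys.exit(0)  # all workers connected
    time.sleep(1)
sys.exit(1)  # timed out; treated as an error by the caller
```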
+ +#SBATCH -A datascience_rapids_cugraphgnn +#SBATCH -p luna +#SBATCH -J datascience_rapids_cugraphgnn-papers:bulkSamplingPyG +#SBATCH -N 1 +#SBATCH -t 00:25:00 + +CONTAINER_IMAGE=${CONTAINER_IMAGE:="please_specify_container"} +SCRIPTS_DIR=$(pwd) +LOGS_DIR=${LOGS_DIR:=$(pwd)"/logs"} +SAMPLES_DIR=${SAMPLES_DIR:=$(pwd)/samples} +DATASETS_DIR=${DATASETS_DIR:=$(pwd)/datasets} + +mkdir -p $LOGS_DIR +mkdir -p $SAMPLES_DIR +mkdir -p $DATASETS_DIR + +BATCH_SIZE=512 +FANOUT="10_10_10" +NUM_EPOCHS=1 +REPLICATION_FACTOR=1 + +# options: PyG or cuGraphPyG +FRAMEWORK="cuGraphPyG" +GPUS_PER_NODE=8 + +nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) +nodes_array=($nodes) +head_node=${nodes_array[0]} +head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + +echo Node IP: $head_node_ip + +nnodes=$SLURM_JOB_NUM_NODES +echo Num Nodes: $nnodes + +gpus_per_node=$GPUS_PER_NODE +echo Num GPUs Per Node: $gpus_per_node + +set -e + +# First run without cuGraph to get data + +if [[ "$FRAMEWORK" == "cuGraphPyG" ]]; then + # Generate samples + srun \ + --container-image $CONTAINER_IMAGE \ + --container-mounts=${LOGS_DIR}":/logs",${SAMPLES_DIR}":/samples",${SCRIPTS_DIR}":/scripts",${DATASETS_DIR}":/datasets" \ + bash /scripts/run_sampling.sh $BATCH_SIZE $FANOUT $REPLICATION_FACTOR "/scripts" $NUM_EPOCHS +fi + +# Train +srun \ + --container-image $CONTAINER_IMAGE \ + --container-mounts=${LOGS_DIR}":/logs",${SAMPLES_DIR}":/samples",${SCRIPTS_DIR}":/scripts",${DATASETS_DIR}":/datasets" \ + torchrun \ + --nnodes $nnodes \ + --nproc-per-node $gpus_per_node \ + --rdzv-id $RANDOM \ + --rdzv-backend c10d \ + --rdzv-endpoint $head_node_ip:29500 \ + /scripts/bench_cugraph_training.py \ + --output_file "/logs/output.txt" \ + --framework $FRAMEWORK \ + --dataset_dir "/datasets" \ + --sample_dir "/samples" \ + --batch_size $BATCH_SIZE \ + --fanout $FANOUT \ + --replication_factor $REPLICATION_FACTOR \ + --num_epochs $NUM_EPOCHS + diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/__init__.py new file mode 100644 index 00000000000..5f8f4c2b868 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .trainer import Trainer +from .trainer import extend_tensor diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/__init__.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/__init__.py new file mode 100644 index 00000000000..def6110b8e5 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .trainers_cugraph_pyg import PyGCuGraphTrainer +from .trainers_pyg import PyGNativeTrainer diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py new file mode 100644 index 00000000000..71151e9ba59 --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_cugraph_pyg.py @@ -0,0 +1,184 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .trainers_pyg import PyGTrainer +from models.pyg import CuGraphSAGE + +import torch +import numpy as np + +from torch.nn.parallel import DistributedDataParallel as ddp + +from cugraph.gnn import FeatureStore +from cugraph_pyg.data import CuGraphStore +from cugraph_pyg.loader import BulkSampleLoader + +import os + + +class PyGCuGraphTrainer(PyGTrainer): + def __init__( + self, + dataset, + model="GraphSAGE", + device=0, + rank=0, + world_size=1, + num_epochs=1, + sample_dir=".", + **kwargs, + ): + self.__data = None + self.__device = device + self.__rank = rank + self.__world_size = world_size + self.__num_epochs = num_epochs + self.__dataset = dataset + self.__sample_dir = sample_dir + self.__loader_kwargs = kwargs + self.__model = self.get_model(model) + self.__optimizer = None + + @property + def rank(self): + return self.__rank + + @property + def model(self): + return self.__model + + @property + def dataset(self): + return self.__dataset + + @property + def optimizer(self): + if self.__optimizer is None: + self.__optimizer = torch.optim.Adam( + self.model.parameters(), lr=0.01, weight_decay=0.0005 + ) + return self.__optimizer + + @property + def num_epochs(self) -> int: + return self.__num_epochs + + def get_loader(self, epoch: int = 0, stage="train") -> int: + import logging + + logger = logging.getLogger("PyGCuGraphTrainer") + + logger.info(f"getting loader for epoch {epoch}, {stage} stage") + + # TODO support online sampling + if stage == "val": + path = os.path.join(self.__sample_dir, "val", "samples") + else: + path = os.path.join(self.__sample_dir, f"epoch={epoch}", stage, "samples") + + loader = BulkSampleLoader( + self.data, + self.data, + None, # FIXME get input nodes properly + directory=path, + input_files=self.get_input_files(path, epoch=epoch, stage=stage), + **self.__loader_kwargs, + ) + + logger.info(f"got loader successfully on rank {self.rank}") + return loader + + @property + def data(self): + import logging + + logger = logging.getLogger("PyGCuGraphTrainer") + logger.info("getting data") + + if self.__data is None: + # 
FIXME wholegraph
+            fs = FeatureStore(backend="torch")
+            num_nodes_dict = {}
+
+            for node_type, x in self.__dataset.x_dict.items():
+                logger.debug(f"getting x for {node_type}")
+                fs.add_data(x, node_type, "x")
+                num_nodes_dict[node_type] = self.__dataset.num_nodes(node_type)
+
+            for node_type, y in self.__dataset.y_dict.items():
+                logger.debug(f"getting y for {node_type}")
+                fs.add_data(y, node_type, "y")
+
+            for node_type, train in self.__dataset.train_dict.items():
+                logger.debug(f"getting train for {node_type}")
+                fs.add_data(train, node_type, "train")
+
+            for node_type, test in self.__dataset.test_dict.items():
+                logger.debug(f"getting test for {node_type}")
+                fs.add_data(test, node_type, "test")
+
+            for node_type, val in self.__dataset.val_dict.items():
+                logger.debug(f"getting val for {node_type}")
+                fs.add_data(val, node_type, "val")
+
+            # TODO support online sampling if the edge index is provided
+            num_edges_dict = self.__dataset.edge_index_dict
+            if not isinstance(list(num_edges_dict.values())[0], int):
+                # Count edges from the src tensor of each edge-index dict.
+                num_edges_dict = {
+                    k: len(v["src"]) for k, v in num_edges_dict.items()
+                }
+
+            self.__data = CuGraphStore(
+                fs,
+                num_edges_dict,
+                num_nodes_dict,
+            )
+
+        logger.info(f"got data successfully on rank {self.rank}")
+
+        return self.__data
+
+    def get_model(self, name="GraphSAGE"):
+        if name != "GraphSAGE":
+            raise ValueError("only GraphSAGE is currently supported")
+
+        num_input_features = self.__dataset.num_input_features
+        num_output_features = self.__dataset.num_labels
+        num_layers = len(self.__loader_kwargs["num_neighbors"])
+
+        with torch.cuda.device(self.__device):
+            model = (
+                CuGraphSAGE(
+                    in_channels=num_input_features,
+                    hidden_channels=64,
+                    out_channels=num_output_features,
+                    num_layers=num_layers,
+                )
+                .to(torch.float32)
+                .to(self.__device)
+            )
+
+            model = ddp(model, device_ids=[self.__device])
+            print("done creating model")
+
+        return model
+
+    def get_input_files(self, path, epoch=0, stage="train"):
+        file_list = np.array(os.listdir(path))
+        file_list.sort()
+
+        if stage == "train":
+            splits = np.array_split(file_list, self.__world_size)
+            np.random.seed(epoch)
+            np.random.shuffle(splits)
+            return splits[self.rank]
+        else:
+            return file_list
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
new file mode 100644
index 00000000000..bddd6ae2644
--- /dev/null
+++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py
@@ -0,0 +1,430 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
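`PyGCuGraphTrainer.get_input_files` above shards the pre-sampled batch files across ranks: the sorted listing is split into `world_size` contiguous chunks, the chunk order is permuted with the epoch number as the seed (every rank shares the seed, so they compute the same permutation), and each rank reads the chunk at its own index. A standalone sketch of that scheme, with invented file names:

```python
# Standalone sketch of the per-rank file sharding used by get_input_files.
import numpy as np


def shard_files(file_list, rank, world_size, epoch):
    files = np.array(sorted(file_list))
    splits = np.array_split(files, world_size)  # contiguous, near-equal chunks
    np.random.seed(epoch)      # shared seed -> identical permutation per rank
    np.random.shuffle(splits)  # reassign chunks to ranks each epoch
    return splits[rank]


files = [f"batch={i}.parquet" for i in range(10)]  # invented names
for rank in range(4):
    print(rank, list(shard_files(files, rank, world_size=4, epoch=1)))
```

Because only the chunk assignment is shuffled (not the files within a chunk), each rank still reads a contiguous run of sorted files, which keeps the per-rank I/O pattern sequential.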
+ + +from trainers import Trainer +from trainers import extend_tensor +from datasets import OGBNPapers100MDataset +from models.pyg import GraphSAGE + +import torch +import numpy as np + +import torch.distributed as td +from torch.nn.parallel import DistributedDataParallel as ddp +import torch.nn.functional as F + +from torch_geometric.utils.sparse import index2ptr +from torch_geometric.data import HeteroData +from torch_geometric.loader import NeighborLoader + +import gc +import os +import time + + +def pyg_num_workers(world_size): + num_workers = None + if hasattr(os, "sched_getaffinity"): + try: + num_workers = len(os.sched_getaffinity(0)) / (2 * world_size) + except Exception: + pass + if num_workers is None: + num_workers = os.cpu_count() / (2 * world_size) + return int(num_workers) + + +class PyGTrainer(Trainer): + def train(self): + import logging + + logger = logging.getLogger("PyGTrainer") + logger.info("Entered train loop") + + total_loss = 0.0 + num_batches = 0 + + time_forward = 0.0 + time_backward = 0.0 + time_loader = 0.0 + time_feature_transfer = 0.0 + start_time = time.perf_counter() + end_time_backward = start_time + + for epoch in range(self.num_epochs): + with td.algorithms.join.Join( + [self.model], divide_by_initial_world_size=False + ): + self.model.train() + for iter_i, data in enumerate( + self.get_loader(epoch=epoch, stage="train") + ): + loader_time_iter = time.perf_counter() - end_time_backward + time_loader += loader_time_iter + + time_feature_transfer_start = time.perf_counter() + + num_sampled_nodes = sum( + [ + torch.as_tensor(n) + for n in data.num_sampled_nodes_dict.values() + ] + ) + num_sampled_edges = sum( + [ + torch.as_tensor(e) + for e in data.num_sampled_edges_dict.values() + ] + ) + + # FIXME find a way to get around this and not have to call extend_tensor + num_layers = len(self.model.module.convs) + num_sampled_nodes = extend_tensor(num_sampled_nodes, num_layers + 1) + num_sampled_edges = extend_tensor(num_sampled_edges, num_layers) + + data = data.to_homogeneous().cuda() + time_feature_transfer_end = time.perf_counter() + time_feature_transfer += ( + time_feature_transfer_end - time_feature_transfer_start + ) + + num_batches += 1 + if iter_i % 20 == 1: + time_forward_iter = time_forward / num_batches + time_backward_iter = time_backward / num_batches + + total_time_iter = ( + time.perf_counter() - start_time + ) / num_batches + logger.info(f"epoch {epoch}, iteration {iter_i}") + logger.info(f"num sampled nodes: {num_sampled_nodes}") + logger.info(f"num sampled edges: {num_sampled_edges}") + logger.info(f"time forward: {time_forward_iter}") + logger.info(f"time backward: {time_backward_iter}") + logger.info(f"loader time: {loader_time_iter}") + logger.info( + f"feature transfer time: {time_feature_transfer / num_batches}" + ) + logger.info(f"total time: {total_time_iter}") + + y_true = data.y + x = data.x.to(torch.float32) + + start_time_forward = time.perf_counter() + edge_index = data.edge_index if "edge_index" in data else data.adj_t + + self.optimizer.zero_grad() + y_pred = self.model( + x, + edge_index, + num_sampled_nodes, + num_sampled_edges, + ) + + end_time_forward = time.perf_counter() + time_forward += end_time_forward - start_time_forward + + if y_pred.shape[0] > len(y_true): + raise ValueError( + f"illegal shape: {y_pred.shape}; {y_true.shape}" + ) + + y_true = y_true[: y_pred.shape[0]] + + y_true = F.one_hot( + y_true.to(torch.int64), num_classes=self.dataset.num_labels + ).to(torch.float32) + + if y_true.shape != y_pred.shape: + 
raise ValueError(
+                            f"y_true shape was {y_true.shape} "
+                            f"but y_pred shape was {y_pred.shape} "
+                            f"in iteration {iter_i} "
+                            f"on rank {y_pred.device.index}"
+                        )
+
+                    start_time_backward = time.perf_counter()
+                    loss = F.cross_entropy(y_pred, y_true)
+
+                    self.optimizer.zero_grad()
+                    loss.backward()
+                    self.optimizer.step()
+                    total_loss += loss.item()
+                    end_time_backward = time.perf_counter()
+                    time_backward += end_time_backward - start_time_backward
+
+            end_time = time.perf_counter()
+
+            # test
+            from torchmetrics import Accuracy
+
+            acc = Accuracy(
+                task="multiclass", num_classes=self.dataset.num_labels
+            ).cuda()
+
+            with td.algorithms.join.Join(
+                [self.model], divide_by_initial_world_size=False
+            ):
+                self.model.eval()
+                if self.rank == 0:
+                    acc_sum = 0.0
+                    with torch.no_grad():
+                        for i, batch in enumerate(
+                            self.get_loader(epoch=epoch, stage="test")
+                        ):
+                            num_sampled_nodes = sum(
+                                [
+                                    torch.as_tensor(n)
+                                    for n in batch.num_sampled_nodes_dict.values()
+                                ]
+                            )
+                            num_sampled_edges = sum(
+                                [
+                                    torch.as_tensor(e)
+                                    for e in batch.num_sampled_edges_dict.values()
+                                ]
+                            )
+                            batch_size = num_sampled_nodes[0]
+
+                            batch = batch.to_homogeneous().cuda()
+
+                            batch.y = batch.y.to(torch.long)
+                            out = self.model.module(
+                                batch.x,
+                                batch.edge_index,
+                                num_sampled_nodes,
+                                num_sampled_edges,
+                            )
+                            acc_sum += acc(
+                                out[:batch_size].softmax(dim=-1), batch.y[:batch_size]
+                            )
+                    # enumerate is zero-based, so i + 1 batches were evaluated
+                    print(
+                        f"Accuracy: {acc_sum / (i + 1) * 100.0:.4f}%",
+                    )
+
+            td.barrier()
+
+        with td.algorithms.join.Join([self.model], divide_by_initial_world_size=False):
+            self.model.eval()
+            if self.rank == 0:
+                acc_sum = 0.0
+                with torch.no_grad():
+                    for i, batch in enumerate(
+                        self.get_loader(epoch=epoch, stage="val")
+                    ):
+                        num_sampled_nodes = sum(
+                            [
+                                torch.as_tensor(n)
+                                for n in batch.num_sampled_nodes_dict.values()
+                            ]
+                        )
+                        num_sampled_edges = sum(
+                            [
+                                torch.as_tensor(e)
+                                for e in batch.num_sampled_edges_dict.values()
+                            ]
+                        )
+                        batch_size = num_sampled_nodes[0]
+
+                        batch = batch.to_homogeneous().cuda()
+
+                        batch.y = batch.y.to(torch.long)
+                        out = self.model.module(
+                            batch.x,
+                            batch.edge_index,
+                            num_sampled_nodes,
+                            num_sampled_edges,
+                        )
+                        acc_sum += acc(
+                            out[:batch_size].softmax(dim=-1), batch.y[:batch_size]
+                        )
+                print(
+                    f"Validation Accuracy: {acc_sum / (i + 1) * 100.0:.4f}%",
+                )
+
+        stats = {
+            "Accuracy": float(acc_sum / (i + 1) * 100.0) if self.rank == 0 else 0.0,
+            "# Batches": num_batches,
+            "Loader Time": time_loader,
+            "Feature Transfer Time": time_feature_transfer,
+            "Forward Time": time_forward,
+            "Backward Time": time_backward,
+        }
+        return stats
+
+
+class PyGNativeTrainer(PyGTrainer):
+    def __init__(
+        self,
+        dataset,
+        model="GraphSAGE",
+        device=0,
+        rank=0,
+        world_size=1,
+        num_epochs=1,
+        **kwargs,
+    ):
+        self.__dataset = dataset
+        self.__device = device
+        self.__data = None
+        self.__rank = rank
+        self.__num_epochs = num_epochs
+        self.__world_size = world_size
+        self.__loader_kwargs = kwargs
+        self.__model = self.get_model(model)
+        self.__optimizer = None
+
+    @property
+    def rank(self):
+        return self.__rank
+
+    @property
+    def model(self):
+        return self.__model
+
+    @property
+    def dataset(self):
+        return self.__dataset
+
+    @property
+    def data(self):
+        import logging
+
+        logger = logging.getLogger("PyGNativeTrainer")
+        logger.info("getting data")
+
+        if self.__data is None:
+            self.__data = HeteroData()
+
+            for node_type, x in self.__dataset.x_dict.items():
+                logger.debug(f"getting x for {node_type}")
+                self.__data[node_type].x = x
+                self.__data[node_type]["num_nodes"] = self.__dataset.num_nodes(
+                    node_type
+ ) + + for node_type, y in self.__dataset.y_dict.items(): + logger.debug(f"getting y for {node_type}") + self.__data[node_type]["y"] = y + + for node_type, train in self.__dataset.train_dict.items(): + logger.debug(f"getting train for {node_type}") + self.__data[node_type]["train"] = train + + for node_type, test in self.__dataset.test_dict.items(): + logger.debug(f"getting test for {node_type}") + self.__data[node_type]["test"] = test + + for node_type, val in self.__dataset.val_dict.items(): + logger.debug(f"getting val for {node_type}") + self.__data[node_type]["val"] = val + + for can_edge_type, ei in self.__dataset.edge_index_dict.items(): + logger.info("converting to csc...") + ei["dst"] = index2ptr( + ei["dst"], self.__dataset.num_nodes(can_edge_type[2]) + ) + + logger.info("updating data structure...") + self.__data.put_edge_index( + layout="csc", + edge_index=list(ei.values()), + edge_type=can_edge_type, + size=( + self.__dataset.num_nodes(can_edge_type[0]), + self.__dataset.num_nodes(can_edge_type[2]), + ), + is_sorted=True, + ) + gc.collect() + + return self.__data + + @property + def optimizer(self): + if self.__optimizer is None: + self.__optimizer = torch.optim.Adam( + self.model.parameters(), lr=0.01, weight_decay=0.0005 + ) + return self.__optimizer + + @property + def num_epochs(self) -> int: + return self.__num_epochs + + def get_loader(self, epoch: int = 0, stage="train"): + import logging + + logger = logging.getLogger("PyGNativeTrainer") + logger.info(f"Getting loader for epoch {epoch}") + + if stage == "train": + mask_dict = self.__dataset.train_dict + elif stage == "test": + mask_dict = self.__dataset.test_dict + elif stage == "val": + mask_dict = self.__dataset.val_dict + else: + raise ValueError(f"Invalid stage {stage}") + + input_nodes_dict = { + node_type: np.array_split(np.arange(len(mask))[mask], self.__world_size)[ + self.__rank + ] + for node_type, mask in mask_dict.items() + } + + input_nodes = list(input_nodes_dict.items()) + if len(input_nodes) > 1: + raise ValueError("Multiple input node types currently unsupported") + else: + input_nodes = tuple(input_nodes[0]) + + # get loader + loader = NeighborLoader( + self.data, + input_nodes=input_nodes, + is_sorted=True, + disjoint=False, + num_workers=pyg_num_workers(self.__world_size), # FIXME change this + persistent_workers=True, + **self.__loader_kwargs, # batch size, num neighbors, replace, shuffle, etc. + ) + + logger.info("done creating loader") + return loader + + def get_model(self, name="GraphSAGE"): + if name != "GraphSAGE": + raise ValueError("only GraphSAGE is currently supported") + + num_input_features = self.__dataset.num_input_features + num_output_features = self.__dataset.num_labels + num_layers = len(self.__loader_kwargs["num_neighbors"]) + + with torch.cuda.device(self.__device): + model = ( + GraphSAGE( + in_channels=num_input_features, + hidden_channels=64, + out_channels=num_output_features, + num_layers=num_layers, + ) + .to(torch.float32) + .to(self.__device) + ) + model = ddp(model, device_ids=[self.__device]) + print("done creating model") + + return model diff --git a/benchmarks/cugraph/standalone/bulk_sampling/trainers/trainer.py b/benchmarks/cugraph/standalone/bulk_sampling/trainers/trainer.py new file mode 100644 index 00000000000..321edbea96e --- /dev/null +++ b/benchmarks/cugraph/standalone/bulk_sampling/trainers/trainer.py @@ -0,0 +1,54 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from typing import Union, List + + +def extend_tensor(t: Union[List[int], torch.Tensor], l: int): + t = torch.as_tensor(t) + + return torch.concat([t, torch.zeros(l - len(t), dtype=t.dtype, device=t.device)]) + + +class Trainer: + @property + def rank(self): + raise NotImplementedError() + + @property + def model(self): + raise NotImplementedError() + + @property + def dataset(self): + raise NotImplementedError() + + @property + def data(self): + raise NotImplementedError() + + @property + def optimizer(self): + raise NotImplementedError() + + @property + def num_epochs(self) -> int: + raise NotImplementedError() + + def get_loader(self, epoch: int = 0, stage="train"): + raise NotImplementedError() + + def train(self): + raise NotImplementedError() diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py index 971c3ff1032..97eb32e2aaa 100644 --- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py +++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,29 +11,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import random + import networkx as nx import pandas as pd import pytest from cugraph import datasets - -# FIXME: promote these to cugraph.datasets so the following steps aren't -# necessary -# -# These datasets can be downloaded using the script in the 'datasets' dir: -# -# cd /datasets -# ./get_test_data.sh --benchmark -# -# Then set the following env var so the dataset utils can find their location: -# -# export RAPIDS_DATASET_ROOT_DIR=/datasets -# -from cugraph_benchmarking.params import ( - hollywood, - europe_osm, - cit_patents, - soc_livejournal, -) +import nx_cugraph as nxcg # Attempt to import the NetworkX dispatching module, which is only needed when # testing with NX <3.2 in order to dynamically switch backends. NX >=3.2 allows @@ -45,22 +29,83 @@ ################################################################################ -# Fixtures and helpers -backend_params = ["cugraph", None] +# Fixtures and params + +# See https://pytest-benchmark.readthedocs.io/en/latest/glossary.html for how +# these variables are used. 
+rounds = 1 +iterations = 1 +warmup_rounds = 1 -dataset_params = [ +dataset_param_values = [ + # name: karate, nodes: 34, edges: 156 pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]), + # name: netscience, nodes: 1461, edges: 5484 pytest.param(datasets.netscience, marks=[pytest.mark.small, pytest.mark.directed]), + # name: email-Eu-core, nodes: 1005, edges: 25571 pytest.param( datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed] ), - pytest.param(cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]), - pytest.param(hollywood, marks=[pytest.mark.medium, pytest.mark.undirected]), - pytest.param(europe_osm, marks=[pytest.mark.medium, pytest.mark.undirected]), - pytest.param(soc_livejournal, marks=[pytest.mark.large, pytest.mark.directed]), + # name: cit-Patents, nodes: 3774768, edges: 16518948 + pytest.param( + datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed] + ), + # name: hollywood, nodes: 1139905, edges: 57515616 + pytest.param( + datasets.hollywood, marks=[pytest.mark.medium, pytest.mark.undirected] + ), + # name: soc-LiveJournal1, nodes: 4847571, edges: 68993773 + pytest.param( + datasets.soc_livejournal, marks=[pytest.mark.medium, pytest.mark.directed] + ), + # name: europe_osm, nodes: 50912018, edges: 54054660 + pytest.param( + datasets.europe_osm, marks=[pytest.mark.large, pytest.mark.undirected] + ), ] +backend_param_values = ["cugraph", "cugraph-preconverted", None] + + +def setup_module(module): + """ + Trivial conversion call to force various one-time CUDA initialization + operations to happen outside of benchmarks. + """ + G = nx.karate_club_graph() + nxcg.from_networkx(G) + +# Test IDs are generated using the lambda assigned to the ids arg to provide an +# easier-to-read name. This is especially helpful for Dataset objs (see +# https://docs.pytest.org/en/stable/reference/reference.html#pytest-fixture) +@pytest.fixture( + scope="module", params=dataset_param_values, ids=lambda ds: f"ds={str(ds)}" +) +def graph_obj(request): + """ + Returns a NX Graph or DiGraph obj from the dataset instance parameter. + """ + dataset = request.param + return nx_graph_from_dataset(dataset) + + +@pytest.fixture( + scope="module", + params=backend_param_values, + ids=lambda backend: f"backend={backend}", +) +def backend(request): + """ + Returns the backend name to use. This is done as a fixture for consistency + and simplicity when creating benchmarks (no need to mark the benchmark as + parametrized). + """ + return request.param + + +################################################################################ +# Helpers def nx_graph_from_dataset(dataset_obj): """ Read the dataset specified by the dataset_obj and create and return a @@ -87,126 +132,675 @@ def nx_graph_from_dataset(dataset_obj): return G -# Test IDs are generated using the lambda assigned to the ids arg to provide an -# easier-to-read name from the Dataset obj string repr. -# See: https://docs.pytest.org/en/stable/reference/reference.html#pytest-fixture -@pytest.fixture(scope="module", params=dataset_params, ids=lambda ds: f"ds={str(ds)}") -def graph_obj(request): - """ - Returns a NX Graph or DiGraph obj from the dataset instance parameter. 
- """ - dataset = request.param - return nx_graph_from_dataset(dataset) - - -def get_legacy_backend_selector(backend_name): +def get_legacy_backend_wrapper(backend_name): """ Returns a callable that wraps an algo function with either the default - dispatch decorator, or the "testing" decorator which unconditionally - dispatches. + dispatcher (which dispatches based on input graph type), or the "testing" + dispatcher (which autoconverts and unconditionally dispatches). This is only supported for NetworkX <3.2 """ backends.plugin_name = "cugraph" orig_dispatch = backends._dispatch testing_dispatch = backends.test_override_dispatch - # Testing with the networkx <3.2 dispatch mechanism is based on decorating - # networkx APIs. The decorator is either one that only uses a backend if - # the input graph type is for that backend (the default decorator), or the - # "testing" decorator, which unconditionally converts a graph type to the - # type needed by the backend then calls the backend. If the cugraph backend - # is specified, create a callable that decorates the benchmarked function - # with the testing decorator. - # - # Because both the default and testing decorators assume they are only - # applied once and do bookkeeping to ensure algos are not registered - # multiple times, the callable also clears bookkeeping so the decorators - # can be reapplied multiple times. This is obviously a hack and networkx - # >=3.2 makes this use case properly supported. if backend_name == "cugraph": - - def wrapper(*args, **kwargs): - backends._registered_algorithms = {} - return testing_dispatch(*args, **kwargs) - + dispatch = testing_dispatch else: + dispatch = orig_dispatch + + def wrap_callable_for_dispatch(func, exhaust_returned_iterator=False): + # Networkx <3.2 registers functions when the dispatch decorator is + # applied (called) and errors if re-registered, so clear bookkeeping to + # allow it to be called repeatedly. + backends._registered_algorithms = {} + actual_func = dispatch(func) # returns the func the dispatcher picks def wrapper(*args, **kwargs): - backends._registered_algorithms = {} - return orig_dispatch(*args, **kwargs) + retval = actual_func(*args, **kwargs) + if exhaust_returned_iterator: + retval = list(retval) + return retval - return wrapper + return wrapper + return wrap_callable_for_dispatch -def get_backend_selector(backend_name): + +def get_backend_wrapper(backend_name): """ Returns a callable that wraps an algo function in order to set the "backend" kwarg on it. This is only supported for NetworkX >= 3.2 """ - def get_callable_for_func(func): + def wrap_callable_for_dispatch(func, exhaust_returned_iterator=False): def wrapper(*args, **kwargs): kwargs["backend"] = backend_name - return func(*args, **kwargs) + retval = func(*args, **kwargs) + if exhaust_returned_iterator: + retval = list(retval) + return retval return wrapper - return get_callable_for_func + return wrap_callable_for_dispatch @pytest.fixture( - scope="module", params=backend_params, ids=lambda backend: f"backend={backend}" + scope="module", + params=backend_param_values, + ids=lambda backend: f"backend={backend}", ) -def backend_selector(request): +def backend_wrapper(request): """ Returns a callable that takes a function algo and wraps it in another function that calls the algo using the appropriate backend. 
+ + For example: if the backend to test is "cugraph", this will return a + function that calls nx.pagerank(..., backend='cugraph') """ backend_name = request.param + actual_backend_name = backend_name + + # Special case: cugraph-preconverted may be specified as a backend but this + # name is reserved to indicate a cugraph backend is to be used with a + # preconverted graph obj (rather than having the backend do the + # conversion). + if backend_name == "cugraph-preconverted": + actual_backend_name = "cugraph" + + # NX <3.2 does not support the backends= kwarg, so the backend must be + # enabled differently if backends is not None: - return get_legacy_backend_selector(backend_name) + wrapper = get_legacy_backend_wrapper(actual_backend_name) else: - return get_backend_selector(backend_name) + wrapper = get_backend_wrapper(actual_backend_name) + + wrapper.backend_name = backend_name + return wrapper + + +def get_graph_obj_for_benchmark(graph_obj, backend_wrapper): + """ + Given a Graph object and a backend name, return a converted Graph or the + original Graph object based on the backend to use. + + This is needed because some backend names are actually used as descriptions + for combinations of backends and converted/non-converted graphs. For + example, a benchmark may specify the "cugraph-preconverted" backend, which + is not an installed backend but instead refers to the "cugraph" backend + passed a NX Graph that has been converted to a nx-cugraph Graph object. + """ + G = graph_obj + if backend_wrapper.backend_name == "cugraph-preconverted": + G = nxcg.from_networkx(G, preserve_all_attrs=True) + return G + + +def get_highest_degree_node(graph_obj): + degrees = graph_obj.degree() # list of tuples of (node, degree) + return max(degrees, key=lambda t: t[1])[0] ################################################################################ # Benchmarks -normalized_params = [True, False] -k_params = [10, 100] +def bench_from_networkx(benchmark, graph_obj): + benchmark(nxcg.from_networkx, graph_obj) + +# normalized_param_values = [True, False] +# k_param_values = [10, 100] +normalized_param_values = [True] +k_param_values = [10] -@pytest.mark.parametrize("normalized", normalized_params, ids=lambda norm: f"{norm=}") -@pytest.mark.parametrize("k", k_params, ids=lambda k: f"{k=}") -def bench_betweenness_centrality(benchmark, graph_obj, backend_selector, normalized, k): - result = benchmark( - backend_selector(nx.betweenness_centrality), - graph_obj, - weight=None, - normalized=normalized, - k=k, + +@pytest.mark.parametrize( + "normalized", normalized_param_values, ids=lambda norm: f"{norm=}" +) +@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}") +def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.betweenness_centrality), + args=(G,), + kwargs=dict( + weight=None, + normalized=normalized, + k=k, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, ) assert type(result) is dict -@pytest.mark.parametrize("normalized", normalized_params, ids=lambda norm: f"{norm=}") +@pytest.mark.parametrize( + "normalized", normalized_param_values, ids=lambda norm: f"{norm=}" +) +@pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}") def bench_edge_betweenness_centrality( - benchmark, graph_obj, backend_selector, normalized + benchmark, graph_obj, backend_wrapper, normalized, k ): - result = benchmark( - 
backend_selector(nx.edge_betweenness_centrality), - graph_obj, - weight=None, - normalized=normalized, + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.edge_betweenness_centrality), + args=(G,), + kwargs=dict( + weight=None, + normalized=normalized, + k=k, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, ) assert type(result) is dict -def bench_louvain_communities(benchmark, graph_obj, backend_selector): - # The cugraph backend for louvain_communities only supports undirected graphs - if isinstance(graph_obj, nx.DiGraph): - G = graph_obj.to_undirected() - else: - G = graph_obj - result = benchmark(backend_selector(nx.community.louvain_communities), G) +def bench_louvain_communities(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.community.louvain_communities), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_degree_centrality(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.degree_centrality), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_eigenvector_centrality(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.eigenvector_centrality), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +@pytest.mark.parametrize( + "normalized", normalized_param_values, ids=lambda norm: f"{norm=}" +) +def bench_hits(benchmark, graph_obj, backend_wrapper, normalized): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.hits), + args=(G,), + kwargs=dict( + normalized=normalized, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is tuple + assert len(result) == 2 + assert type(result[0]) is dict + assert type(result[1]) is dict + + +def bench_in_degree_centrality(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.in_degree_centrality), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +@pytest.mark.parametrize( + "normalized", normalized_param_values, ids=lambda norm: f"{norm=}" +) +def bench_katz_centrality(benchmark, graph_obj, backend_wrapper, normalized): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.katz_centrality), + args=(G,), + kwargs=dict( + normalized=normalized, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_k_truss(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.k_truss), + args=(G,), + kwargs=dict( + k=2, + ), 
+ rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + # Check that this at least appears to be some kind of NX-like Graph + assert hasattr(result, "has_node") + + +def bench_out_degree_centrality(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.out_degree_centrality), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_pagerank(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.pagerank), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_single_source_shortest_path_length(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + + result = benchmark.pedantic( + target=backend_wrapper(nx.single_source_shortest_path_length), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_single_target_shortest_path_length(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper( + nx.single_target_shortest_path_length, exhaust_returned_iterator=True + ), + args=(G,), + kwargs=dict( + target=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + # exhaust_returned_iterator=True forces the result to a list, but is not + # needed for this algo in NX 3.3+ which returns a dict instead of an + # iterator. Forcing to a list does not change the benchmark timing. 
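An aside on the pattern repeated throughout this benchmark file: each test now times the call through pytest-benchmark's `benchmark.pedantic(target, args, kwargs, rounds, iterations, warmup_rounds)`, which runs a fixed measurement plan instead of auto-calibrating, and `backend_wrapper` can exhaust a lazily returned iterator so the timed region covers the full traversal rather than just generator creation. A minimal runnable sketch of such a wrapper; the real `backend_wrapper` fixture lives in the benchmark conftest and is not shown in this diff, so treat this implementation as an assumption:

```python
import networkx as nx

def backend_wrapper(func, exhaust_returned_iterator=False):
    # Hypothetical stand-in for the pytest fixture of the same name; the
    # actual fixture (not in this diff) also routes the call to a backend.
    def wrapped(*args, **kwargs):
        result = func(*args, **kwargs)
        if exhaust_returned_iterator:
            # Materialize the generator so timing includes the whole
            # computation, not just constructing the iterator.
            result = list(result)
        return result
    return wrapped

# Usage mirroring the benchmarks above, without pytest-benchmark:
G = nx.karate_club_graph()
result = backend_wrapper(nx.bfs_edges, exhaust_returned_iterator=True)(G, source=0)
assert type(result) is list
```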
+ assert type(result) is list + + +def bench_ancestors(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.ancestors), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is set + + +def bench_average_clustering(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported by nx-cugraph + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.average_clustering), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is float + + +def bench_generic_bfs_edges(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.generic_bfs_edges, exhaust_returned_iterator=True), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_bfs_edges(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.bfs_edges, exhaust_returned_iterator=True), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_bfs_layers(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.bfs_layers, exhaust_returned_iterator=True), + args=(G,), + kwargs=dict( + sources=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_bfs_predecessors(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.bfs_predecessors, exhaust_returned_iterator=True), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_bfs_successors(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.bfs_successors, exhaust_returned_iterator=True), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_bfs_tree(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.bfs_tree), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + # Check that this at least appears to be some kind of NX-like Graph + assert hasattr(result, "has_node") + + +def bench_clustering(benchmark, 
graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported by nx-cugraph + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.clustering), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_core_number(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported by nx-cugraph + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.core_number), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_descendants(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.descendants), + args=(G,), + kwargs=dict( + source=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is set + + +def bench_descendants_at_distance(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.descendants_at_distance), + args=(G,), + kwargs=dict( + source=node, + distance=1, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is set + + +def bench_is_bipartite(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.is_bipartite), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is bool + + +def bench_is_strongly_connected(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.is_strongly_connected), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is bool + + +def bench_is_weakly_connected(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.is_weakly_connected), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is bool + + +def bench_number_strongly_connected_components(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.number_strongly_connected_components), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is int + + +def bench_number_weakly_connected_components(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper(nx.number_weakly_connected_components), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is int + + +def bench_overall_reciprocity(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + 
target=backend_wrapper(nx.overall_reciprocity), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is float + + +def bench_reciprocity(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + node = get_highest_degree_node(graph_obj) + result = benchmark.pedantic( + target=backend_wrapper(nx.reciprocity), + args=(G,), + kwargs=dict( + nodes=node, + ), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is float + + +def bench_strongly_connected_components(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper( + nx.strongly_connected_components, exhaust_returned_iterator=True + ), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is list + + +def bench_transitivity(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported by nx-cugraph + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.transitivity), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is float + + +def bench_triangles(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + # DiGraphs are not supported + if G.is_directed(): + G = G.to_undirected() + result = benchmark.pedantic( + target=backend_wrapper(nx.triangles), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) + assert type(result) is dict + + +def bench_weakly_connected_components(benchmark, graph_obj, backend_wrapper): + G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper) + result = benchmark.pedantic( + target=backend_wrapper( + nx.weakly_connected_components, exhaust_returned_iterator=True + ), + args=(G,), + rounds=rounds, + iterations=iterations, + warmup_rounds=warmup_rounds, + ) assert type(result) is list diff --git a/benchmarks/nx-cugraph/pytest-based/run-2402.sh b/benchmarks/nx-cugraph/pytest-based/run-2402.sh new file mode 100755 index 00000000000..44ed0bda43a --- /dev/null +++ b/benchmarks/nx-cugraph/pytest-based/run-2402.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Runs benchmarks for the 24.02 algos. +# Pass either a or b or both. This is useful for separating batches of runs on different GPUs: +# CUDA_VISIBLE_DEVICES=1 run-2402.sh b + +mkdir -p logs + +# benches="$benches ..." 
pattern makes it easy to comment out individual runs +benches= + +while [[ $1 != "" ]]; do + if [[ $1 == "a" ]]; then + benches="$benches bench_ancestors" + benches="$benches bench_average_clustering" + benches="$benches bench_generic_bfs_edges" + benches="$benches bench_bfs_edges" + benches="$benches bench_bfs_layers" + benches="$benches bench_bfs_predecessors" + benches="$benches bench_bfs_successors" + benches="$benches bench_bfs_tree" + benches="$benches bench_clustering" + benches="$benches bench_core_number" + benches="$benches bench_descendants" + elif [[ $1 == "b" ]]; then + benches="$benches bench_descendants_at_distance" + benches="$benches bench_is_bipartite" + benches="$benches bench_is_strongly_connected" + benches="$benches bench_is_weakly_connected" + benches="$benches bench_number_strongly_connected_components" + benches="$benches bench_number_weakly_connected_components" + benches="$benches bench_overall_reciprocity" + benches="$benches bench_reciprocity" + benches="$benches bench_strongly_connected_components" + benches="$benches bench_transitivity" + benches="$benches bench_triangles" + benches="$benches bench_weakly_connected_components" + fi + shift +done + +for bench in $benches; do + pytest -sv -k "soc-livejournal1" "bench_algos.py::$bench" 2>&1 | tee "logs/${bench}.log" +done diff --git a/benchmarks/shared/python/cugraph_benchmarking/params.py b/benchmarks/shared/python/cugraph_benchmarking/params.py index d82cfd26117..034e22ffc37 100644 --- a/benchmarks/shared/python/cugraph_benchmarking/params.py +++ b/benchmarks/shared/python/cugraph_benchmarking/params.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,42 +14,16 @@ import pytest from pylibcugraph.testing.utils import gen_fixture_params -from cugraph.testing import RAPIDS_DATASET_ROOT_DIR_PATH from cugraph.datasets import ( - Dataset, karate, netscience, email_Eu_core, + hollywood, + europe_osm, + cit_patents, + soc_livejournal, ) -# Create Dataset objects from .csv files. -# Once the cugraph.dataset package is updated to include the metadata files for -# these (like karate), these will no longer need to be explicitly instantiated. -hollywood = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/hollywood.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"], -) -hollywood.metadata["is_directed"] = False -europe_osm = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/undirected/europe_osm.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"], -) -europe_osm.metadata["is_directed"] = False -cit_patents = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/cit-Patents.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"], -) -cit_patents.metadata["is_directed"] = True -soc_livejournal = Dataset( - csv_file=RAPIDS_DATASET_ROOT_DIR_PATH / "csv/directed/soc-LiveJournal1.csv", - csv_col_names=["src", "dst"], - csv_col_dtypes=["int32", "int32"], -) -soc_livejournal.metadata["is_directed"] = True - # Assume all "file_data" (.csv file on disk) datasets are too small to be useful for MG.
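The params.py hunk above replaces hand-built `Dataset` objects with direct imports from `cugraph.datasets`, which now ships the metadata (including `is_directed`) that the removed code assigned manually. A small sketch of how that metadata can drive the directed/undirected split used below; `is_directed` is taken from the removed code, while the `name` key is an assumption:

```python
from cugraph.datasets import hollywood, cit_patents, soc_livejournal

for ds in (hollywood, cit_patents, soc_livejournal):
    # "is_directed" is the same field the removed code set by hand.
    kind = "directed" if ds.metadata["is_directed"] else "undirected"
    print(ds.metadata.get("name", "unknown"), kind)
```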
undirected_datasets = [ pytest.param( diff --git a/build.sh b/build.sh index eef19046d85..82de45ca9fb 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cugraph build script @@ -18,7 +18,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -RAPIDS_VERSION=23.12 +RAPIDS_VERSION=24.02 # Valid args to this script (all possible targets and options) - only one per line VALIDARGS=" @@ -31,6 +31,7 @@ VALIDARGS=" cugraph-service cugraph-pyg cugraph-dgl + cugraph-equivariant nx-cugraph cpp-mgtests cpp-mtmgtests @@ -60,6 +61,7 @@ HELP="$0 [ ...] [ ...] cugraph-service - build the cugraph-service_client and cugraph-service_server Python package cugraph-pyg - build the cugraph-pyg Python package cugraph-dgl - build the cugraph-dgl extensions for DGL + cugraph-equivariant - build the cugraph-equivariant Python package nx-cugraph - build the nx-cugraph Python package cpp-mgtests - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency. cpp-mtmgtests - build libcugraph MTMG tests. Adds UCX as a dependency (temporary). @@ -69,7 +71,7 @@ HELP="$0 [ ...] [ ...] -v - verbose build mode -g - build for debug -n - do not install after a successful build (does not affect Python packages) - --pydevelop - use setup.py develop instead of install + --pydevelop - install the Python packages in editable mode --allgpuarch - build for all supported GPU architectures --skip_cpp_tests - do not build the SG test binaries as part of the libcugraph and libcugraph_etl targets --without_cugraphops - do not build algos that require cugraph-ops @@ -187,14 +189,18 @@ if hasArg --cmake_default_generator; then CMAKE_GENERATOR_OPTION="" fi if hasArg --pydevelop; then - PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps -e" + PYTHON_ARGS_FOR_INSTALL="${PYTHON_ARGS_FOR_INSTALL} -e" fi -# Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" - if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUGRAPH_CPP"* ]]; then - SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_CUGRAPH_CPP=ON" - fi + +# Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS +SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g') + +# Append `-DFIND_CUGRAPH_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option. +if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUGRAPH_CPP"* ]]; then + SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DFIND_CUGRAPH_CPP=ON" +fi # If clean or uninstall targets given, run them prior to any other steps if hasArg uninstall; then @@ -213,23 +219,18 @@ if hasArg uninstall; then if [ -e ${LIBCUGRAPH_BUILD_DIR}/install_manifest.txt ]; then xargs rm -f < ${LIBCUGRAPH_BUILD_DIR}/install_manifest.txt > /dev/null 2>&1 fi - # uninstall cugraph and pylibcugraph installed from a prior "setup.py - # install" + # uninstall cugraph and pylibcugraph installed from a prior install # FIXME: if multiple versions of these packages are installed, this only # removes the latest one and leaves the others installed. build.sh uninstall # can be run multiple times to remove all of them, but that is not obvious. 
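One notable build.sh change above: scikit-build-core reads `SKBUILD_CMAKE_ARGS` as a semicolon-delimited list, whereas the old scikit-build `SKBUILD_CONFIGURE_OPTIONS` was space-delimited, hence the `sed 's/ /;/g'` rewrite. The same transformation rendered in Python (example flag values only):

```python
# Mirrors the sed rewrite in build.sh; like the sed, it assumes no single
# CMake argument value contains an embedded space.
extra_cmake_args = "-DBUILD_TESTS=OFF -DCMAKE_CUDA_ARCHITECTURES=native"
skbuild_cmake_args = ";".join(extra_cmake_args.split())
assert skbuild_cmake_args == "-DBUILD_TESTS=OFF;-DCMAKE_CUDA_ARCHITECTURES=native"
```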
pip uninstall -y pylibcugraph cugraph cugraph-service-client cugraph-service-server \ - cugraph-dgl cugraph-pyg nx-cugraph + cugraph-dgl cugraph-pyg cugraph-equivariant nx-cugraph fi if hasArg clean; then # Ignore errors for clean since missing files, etc. are not failures set +e # remove artifacts generated inplace - # FIXME: ideally the "setup.py clean" command would be used for this, but - # currently running any setup.py command has side effects (eg. cloning - # repos). - # (cd ${REPODIR}/python && python setup.py clean) if [[ -d ${REPODIR}/python ]]; then cleanPythonDir ${REPODIR}/python fi @@ -317,24 +318,7 @@ if buildDefault || hasArg pylibcugraph || hasArg all; then if hasArg --clean; then cleanPythonDir ${REPODIR}/python/pylibcugraph else - # FIXME: skbuild with setuptools>=64 has a bug when called from a "pip - # install -e" command, resulting in a broken editable wheel. Continue - # to use "setup.py bdist_ext --inplace" for a develop build until - # https://github.com/scikit-build/scikit-build/issues/981 is closed. - if hasArg --pydevelop; then - cd ${REPODIR}/python/pylibcugraph - python setup.py build_ext \ - --inplace \ - -- \ - -DFIND_CUGRAPH_CPP=ON \ - -DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS} \ - -Dcugraph_ROOT=${LIBCUGRAPH_BUILD_DIR} \ - -- \ - -j${PARALLEL_LEVEL:-1} - cd - - fi - SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS} -DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS}" \ - SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS}" \ python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/pylibcugraph fi fi @@ -344,24 +328,7 @@ if buildDefault || hasArg cugraph || hasArg all; then if hasArg --clean; then cleanPythonDir ${REPODIR}/python/cugraph else - # FIXME: skbuild with setuptools>=64 has a bug when called from a "pip - # install -e" command, resulting in a broken editable wheel. Continue - # to use "setup.py bdist_ext --inplace" for a develop build until - # https://github.com/scikit-build/scikit-build/issues/981 is closed. - if hasArg --pydevelop; then - cd ${REPODIR}/python/cugraph - python setup.py build_ext \ - --inplace \ - -- \ - -DFIND_CUGRAPH_CPP=ON \ - -DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS} \ - -Dcugraph_ROOT=${LIBCUGRAPH_BUILD_DIR} \ - -- \ - -j${PARALLEL_LEVEL:-1} - cd - - fi - SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS} -DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS}" \ - SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \ + SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS};-DUSE_CUGRAPH_OPS=${BUILD_WITH_CUGRAPHOPS}" \ python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph fi fi @@ -394,6 +361,15 @@ if hasArg cugraph-dgl || hasArg all; then fi fi +# Build and install the cugraph-equivariant Python package +if hasArg cugraph-equivariant || hasArg all; then + if hasArg --clean; then + cleanPythonDir ${REPODIR}/python/cugraph-equivariant + else + python ${PYTHON_ARGS_FOR_INSTALL} ${REPODIR}/python/cugraph-equivariant + fi +fi + # Build and install the nx-cugraph Python package if hasArg nx-cugraph || hasArg all; then if hasArg --clean; then diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index d0d13f99448..132231e4a64 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 3f765704bdb..d88c7d7bcd7 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -39,7 +39,7 @@ rapids-mamba-retry install \ rapids-logger "Install cugraph-dgl" rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2" -export RAPIDS_VERSION_NUMBER="23.12" +export RAPIDS_VERSION_NUMBER="24.02" export RAPIDS_DOCS_DIR="$(mktemp -d)" for PROJECT in libcugraphops libwholegraph; do diff --git a/ci/build_python.sh b/ci/build_python.sh index 90a40c539ff..07a4f59396b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,9 +1,13 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail -source rapids-env-update +rapids-configure-conda-channels + +source rapids-configure-sccache + +source rapids-date-string export CMAKE_GENERATOR=Ninja @@ -19,7 +23,7 @@ echo "${version}" > VERSION rapids-logger "Begin py build" package_dir="python" -for package_name in pylibcugraph cugraph nx-cugraph cugraph-pyg cugraph-dgl; do +for package_name in pylibcugraph cugraph nx-cugraph cugraph-pyg cugraph-dgl; do underscore_package_name=$(echo "${package_name}" | tr "-" "_") sed -i "/^__git_commit__/ s/= .*/= \"${git_commit}\"/g" "${package_dir}/${package_name}/${underscore_package_name}/_version.py" done @@ -85,4 +89,9 @@ if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then conda/recipes/cugraph-dgl fi +rapids-conda-retry mambabuild \ + --no-test \ + --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ + conda/recipes/cugraph-equivariant + rapids-upload-conda-to-s3 python diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 163520ea1da..30a1c98c106 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -36,7 +36,7 @@ if ! rapids-is-release-build; then alpha_spec=',>=0.0.0a0' fi -for dep in rmm cudf raft-dask pylibcugraph pylibraft ucx-py; do +for dep in rmm cudf cugraph raft-dask pylibcugraph pylibcugraphops pylibraft ucx-py; do sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} done @@ -55,7 +55,10 @@ cd "${package_dir}" python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check # pure-python packages should not have auditwheel run on them. -if [[ ${package_name} == "nx-cugraph" ]]; then +if [[ ${package_name} == "nx-cugraph" ]] || \ + [[ ${package_name} == "cugraph-dgl" ]] || \ + [[ ${package_name} == "cugraph-pyg" ]] || \ + [[ ${package_name} == "cugraph-equivariant" ]]; then RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 dist else mkdir -p final_dist diff --git a/ci/build_wheel_cugraph-dgl.sh b/ci/build_wheel_cugraph-dgl.sh new file mode 100755 index 00000000000..d62f810cba4 --- /dev/null +++ b/ci/build_wheel_cugraph-dgl.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +./ci/build_wheel.sh cugraph-dgl python/cugraph-dgl diff --git a/ci/build_wheel_cugraph-equivariant.sh b/ci/build_wheel_cugraph-equivariant.sh new file mode 100755 index 00000000000..fcc8e0f774c --- /dev/null +++ b/ci/build_wheel_cugraph-equivariant.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -euo pipefail + +./ci/build_wheel.sh cugraph-equivariant python/cugraph-equivariant diff --git a/ci/build_wheel_cugraph-pyg.sh b/ci/build_wheel_cugraph-pyg.sh new file mode 100755 index 00000000000..97baa243f73 --- /dev/null +++ b/ci/build_wheel_cugraph-pyg.sh @@ -0,0 +1,6 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -euo pipefail + +./ci/build_wheel.sh cugraph-pyg python/cugraph-pyg diff --git a/ci/build_wheel_cugraph.sh b/ci/build_wheel_cugraph.sh index 5b5061f67c2..ffd6445f8d5 100755 --- a/ci/build_wheel_cugraph.sh +++ b/ci/build_wheel_cugraph.sh @@ -12,6 +12,6 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME=pylibcugraph_${RAPIDS_PY_CUDA_SUFFIX} rapids-download-wheels-from-s3 ./local-pylibcugraph export PIP_FIND_LINKS=$(pwd)/local-pylibcugraph -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh cugraph python/cugraph diff --git a/ci/build_wheel_pylibcugraph.sh b/ci/build_wheel_pylibcugraph.sh index 8d365bc250b..7c5a7299421 100755 --- a/ci/build_wheel_pylibcugraph.sh +++ b/ci/build_wheel_pylibcugraph.sh @@ -3,6 +3,6 @@ set -euo pipefail -export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DCUGRAPH_BUILD_WHEELS=ON -DFIND_CUGRAPH_CPP=OFF -DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" +export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_CUGRAPH_CPP=OFF;-DCPM_cugraph-ops_SOURCE=${GITHUB_WORKSPACE}/cugraph-ops/" ./ci/build_wheel.sh pylibcugraph python/pylibcugraph diff --git a/ci/notebook_list.py b/ci/notebook_list.py index 96e26e3ab1a..f7a284beeeb 100644 --- a/ci/notebook_list.py +++ b/ci/notebook_list.py @@ -42,7 +42,6 @@ def skip_book_dir(runtype): # Not strictly true... however what we mean is # Pascal or earlier # -pascal = False ampere = False device = cuda.get_current_device() @@ -62,8 +61,6 @@ def skip_book_dir(runtype): cc = getattr(device, "COMPUTE_CAPABILITY", None) or getattr( device, "compute_capability" ) -if cc[0] < 7: - pascal = True if cc[0] >= 8: ampere = True @@ -91,10 +88,6 @@ def skip_book_dir(runtype): ) skip = True break - elif pascal and re.search("# Does not run on Pascal", line): - print(f"SKIPPING {filename} (does not run on Pascal)", file=sys.stderr) - skip = True - break elif ampere and re.search("# Does not run on Ampere", line): print(f"SKIPPING {filename} (does not run on Ampere)", file=sys.stderr) skip = True diff --git a/ci/test.sh b/ci/test.sh index 0032e3f3398..b3adc80c593 100755 --- a/ci/test.sh +++ b/ci/test.sh @@ -63,9 +63,7 @@ fi # EXITCODE for the script. set +e -if (python ${CUGRAPH_ROOT}/ci/utils/is_pascal.py); then - echo "WARNING: skipping C++ tests on Pascal GPU arch." -elif hasArg "--run-cpp-tests"; then +if hasArg "--run-cpp-tests"; then echo "C++ gtests for cuGraph (single-GPU only)..." 
for gt in "${CONDA_PREFIX}/bin/gtests/libcugraph/"*_TEST; do test_name=$(basename $gt) diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index 95bc55c212a..b204c1dc59b 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -19,6 +19,7 @@ conda activate test set -u CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) + RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" diff --git a/ci/test_python.sh b/ci/test_python.sh index d6e92e8d1a5..b070143f076 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. set -euo pipefail @@ -63,7 +63,16 @@ pytest \ tests popd -# FIXME: TEMPORARILY disable single-GPU "MG" testing +# Test runs that include tests that use dask require +# --import-mode=append. Those tests start a LocalCUDACluster that inherits +# changes from pytest's modifications to PYTHONPATH (which defaults to +# prepending source tree paths to PYTHONPATH). This causes the +# LocalCUDACluster subprocess to import cugraph from the source tree instead of +# the install location, and in most cases, the source tree does not have +# extensions built in-place and will result in ImportErrors. +# +# FIXME: TEMPORARILY disable MG PropertyGraph tests (experimental) tests and +# bulk sampler IO tests (hangs in CI) rapids-logger "pytest cugraph" pushd python/cugraph/cugraph DASK_WORKER_DEVICES="0" \ @@ -72,6 +81,7 @@ DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \ pytest \ -v \ + --import-mode=append \ --benchmark-disable \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph.xml" \ @@ -79,7 +89,7 @@ pytest \ --cov=cugraph \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-coverage.xml" \ --cov-report=term \ - -k "not test_property_graph_mg" \ + -k "not test_property_graph_mg and not test_bulk_sampler_io" \ tests popd @@ -110,12 +120,33 @@ popd rapids-logger "pytest networkx using nx-cugraph backend" pushd python/nx-cugraph +# Use editable install to make coverage work +pip install -e . --no-deps ./run_nx_tests.sh # run_nx_tests.sh outputs coverage data, so check that total coverage is >0.0% # in case nx-cugraph failed to load but fallback mode allowed the run to pass. _coverage=$(coverage report|grep "^TOTAL") echo "nx-cugraph coverage from networkx tests: $_coverage" echo $_coverage | awk '{ if ($NF == "0.0%") exit 1 }' +# Ensure all algorithms were called by comparing covered lines to function lines. 
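The coverage-based check referenced in the comment above is implemented by `nx_cugraph.tests.ensure_algos_covered`, invoked a few lines below; its source is not part of this diff. The following sketch shows the general idea against coverage.py's JSON report format (the module's actual logic is assumed):

```python
import json

with open("coverage.json") as f:
    report = json.load(f)

# coverage.py's JSON report maps each file to executed/missing line lists.
uncovered = {
    path: data["missing_lines"]
    for path, data in report["files"].items()
    if "/nx_cugraph/algorithms/" in path and data["missing_lines"]
}
if uncovered:
    raise SystemExit(f"algorithm files with unexecuted lines: {sorted(uncovered)}")
```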
+# Run our tests again (they're fast enough) to add their coverage, then create coverage.json +pytest \ + --pyargs nx_cugraph \ + --config-file=./pyproject.toml \ + --cov-config=./pyproject.toml \ + --cov=nx_cugraph \ + --cov-append \ + --cov-report= +coverage report \ + --include="*/nx_cugraph/algorithms/*" \ + --omit=__init__.py \ + --show-missing \ + --rcfile=./pyproject.toml +coverage json --rcfile=./pyproject.toml +python -m nx_cugraph.tests.ensure_algos_covered +# Exercise (and show results of) scripts that show implemented networkx algorithms +python -m nx_cugraph.scripts.print_tree --dispatch-name --plc --incomplete --different +python -m nx_cugraph.scripts.print_table popd rapids-logger "pytest cugraph-service (single GPU)" @@ -208,7 +239,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then "cugraph-pyg" \ "pytorch>=2.0,<2.1" \ "pytorch-cuda=11.8" - + # Install pyg dependencies (which requires pip) pip install \ pyg_lib \ @@ -225,7 +256,6 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then # rmat is not tested because of multi-GPU testing pytest \ --cache-clear \ - --ignore=tests/int \ --ignore=tests/mg \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-pyg.xml" \ --cov-config=../../.coveragerc \ @@ -248,5 +278,46 @@ else rapids-logger "skipping cugraph_pyg pytest on CUDA != 11.8" fi +# test cugraph-equivariant +if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then + if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then + # Reuse cugraph-dgl's test env for cugraph-equivariant + set +u + conda activate test_cugraph_dgl + set -u + rapids-mamba-retry install \ + --channel "${CPP_CHANNEL}" \ + --channel "${PYTHON_CHANNEL}" \ + --channel pytorch \ + --channel nvidia \ + cugraph-equivariant + pip install e3nn==0.5.1 + + rapids-print-env + + rapids-logger "pytest cugraph-equivariant" + pushd python/cugraph-equivariant/cugraph_equivariant + pytest \ + --cache-clear \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cugraph-equivariant.xml" \ + --cov-config=../../.coveragerc \ + --cov=cugraph_equivariant \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cugraph-equivariant-coverage.xml" \ + --cov-report=term \ + . + popd + + # Reactivate the test environment back + set +u + conda deactivate + conda activate test + set -u + else + rapids-logger "skipping cugraph-equivariant pytest on ARM64" + fi +else + rapids-logger "skipping cugraph-equivariant pytest on CUDA!=11.8" +fi + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 428efd4ed21..8c5832e412f 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -eoxu pipefail @@ -21,10 +21,21 @@ arch=$(uname -m) if [[ "${arch}" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then python ./ci/wheel_smoke_test_${package_name}.py else - # FIXME: TEMPORARILY disable single-GPU "MG" testing + # Test runs that include tests that use dask require + # --import-mode=append. See test_python.sh for details. + # FIXME: Adding PY_IGNORE_IMPORTMISMATCH=1 to workaround conftest.py import + # mismatch error seen by nx-cugraph after using pytest 8 and + # --import-mode=append. 
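Both `--import-mode=append` comments above describe the same failure mode: pytest's default import mode effectively puts the source tree ahead of the installed package, so a subprocess such as a LocalCUDACluster worker imports `cugraph` from a tree with no built extensions. A small demonstration sketch; the source-tree path is hypothetical:

```python
import os
import subprocess
import sys

env = dict(os.environ)
# Roughly what prepend-style imports do to child processes:
env["PYTHONPATH"] = "/repo/python/cugraph"  # hypothetical source-tree path

# The child resolves `cugraph` from the source tree; compiled submodules are
# not built in place there, so the import fails with ImportError.
subprocess.run(
    [sys.executable, "-c", "import cugraph; print(cugraph.__file__)"], env=env
)
# With --import-mode=append the installed package stays first on sys.path,
# so worker subprocesses import the built package instead.
```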
RAPIDS_DATASET_ROOT_DIR=`pwd`/datasets \ + PY_IGNORE_IMPORTMISMATCH=1 \ + DASK_WORKER_DEVICES="0" \ DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL="1000s" \ DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT="1000s" \ DASK_CUDA_WAIT_WORKERS_MIN_TIMEOUT="1000s" \ - python -m pytest ./python/${package_name}/${python_package_name}/tests + python -m pytest \ + -v \ + --import-mode=append \ + --benchmark-disable \ + -k "not test_property_graph_mg and not test_bulk_sampler_io" \ + ./python/${package_name}/${python_package_name}/tests fi diff --git a/ci/test_wheel_cugraph-dgl.sh b/ci/test_wheel_cugraph-dgl.sh new file mode 100755 index 00000000000..90c86af95fe --- /dev/null +++ b/ci/test_wheel_cugraph-dgl.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eoxu pipefail + +package_name="cugraph-dgl" +package_dir="python/cugraph-dgl" + +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# use 'ls' to expand wildcard before adding `[extra]` requires for pip +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] + + +PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')" +PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2} +if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then + PYTORCH_CUDA_VER="121" +else + PYTORCH_CUDA_VER=$PKG_CUDA_VER +fi +PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}" +DGL_URL="https://data.dgl.ai/wheels/cu${PYTORCH_CUDA_VER}/repo.html" + +rapids-logger "Installing PyTorch and DGL" +rapids-retry python -m pip install torch --index-url ${PYTORCH_URL} +rapids-retry python -m pip install dgl --find-links ${DGL_URL} + +python -m pytest python/cugraph-dgl/tests diff --git a/ci/test_wheel_cugraph-equivariant.sh b/ci/test_wheel_cugraph-equivariant.sh new file mode 100755 index 00000000000..f054780b03a --- /dev/null +++ b/ci/test_wheel_cugraph-equivariant.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. + +set -eoxu pipefail + +package_name="cugraph-equivariant" +package_dir="python/cugraph-equivariant" + +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# use 'ls' to expand wildcard before adding `[extra]` requires for pip +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] + + +PKG_CUDA_VER="$(echo ${CUDA_VERSION} | cut -d '.' -f1,2 | tr -d '.')" +PKG_CUDA_VER_MAJOR=${PKG_CUDA_VER:0:2} +if [[ "${PKG_CUDA_VER_MAJOR}" == "12" ]]; then + PYTORCH_CUDA_VER="121" +else + PYTORCH_CUDA_VER=$PKG_CUDA_VER +fi +PYTORCH_URL="https://download.pytorch.org/whl/cu${PYTORCH_CUDA_VER}" + +rapids-logger "Installing PyTorch and e3nn" +rapids-retry python -m pip install torch --index-url ${PYTORCH_URL} +rapids-retry python -m pip install e3nn + +python -m pytest python/cugraph-equivariant/cugraph_equivariant/tests diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh new file mode 100755 index 00000000000..acd42224387 --- /dev/null +++ b/ci/test_wheel_cugraph-pyg.sh @@ -0,0 +1,44 @@ +#!/bin/bash +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +set -eoxu pipefail + +package_name="cugraph-pyg" +package_dir="python/cugraph-pyg" + +python_package_name=$(echo ${package_name}|sed 's/-/_/g') + +mkdir -p ./dist +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + +# use 'ls' to expand wildcard before adding `[extra]` requires for pip +RAPIDS_PY_WHEEL_NAME="${package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist +# pip creates wheels using python package names +python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] + +# RAPIDS_DATASET_ROOT_DIR is used by test scripts +export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" + +if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then + rapids-logger "Installing PyTorch and PyG dependencies" + PYTORCH_URL="https://download.pytorch.org/whl/cu118" + rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} + rapids-retry python -m pip install torch-geometric==2.4.0 + rapids-retry python -m pip install \ + pyg_lib \ + torch_scatter \ + torch_sparse \ + torch_cluster \ + torch_spline_conv \ + -f https://data.pyg.org/whl/torch-2.1.0+cu118.html + + rapids-logger "pytest cugraph-pyg (single GPU)" + pushd python/cugraph-pyg/cugraph_pyg + python -m pytest \ + --cache-clear \ + --ignore=tests/mg \ + tests + popd +else + rapids-logger "skipping cugraph-pyg wheel test on CUDA!=11.8" +fi diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index aa38defcd7c..76178269ab0 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -16,12 +16,12 @@ dependencies: - cmake>=3.26.4 - cuda-version=11.8 - cudatoolkit -- cudf==23.12.* +- cudf==24.2.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==23.12.* -- dask-cudf==23.12.* +- dask-cuda==24.2.* +- dask-cudf==24.2.* - doxygen - fsspec>=0.6.0 - gcc_linux-64=11.* @@ -29,11 +29,11 @@ dependencies: - graphviz - gtest>=1.13.0 - ipython -- libcudf==23.12.* -- libcugraphops==23.12.* -- libraft-headers==23.12.* -- libraft==23.12.* -- librmm==23.12.* +- libcudf==24.2.* +- libcugraphops==24.2.* +- libraft-headers==24.2.* +- libraft==24.2.* +- librmm==24.2.* - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -49,21 +49,21 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibcugraphops==23.12.* -- pylibraft==23.12.* -- pylibwholegraph==23.12.* +- pylibcugraphops==24.2.* +- pylibraft==24.2.* +- pylibwholegraph==24.2.* - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- raft-dask==23.12.* -- rapids-dask-dependency==23.12.* +- raft-dask==24.2.* +- rapids-dask-dependency==24.2.* - recommonmark - requests -- rmm==23.12.* -- scikit-build>=0.13.1 +- rmm==24.2.* +- scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy - setuptools>=61.0.0 @@ -72,7 +72,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu -- ucx-py==0.35.* +- ucx-py==0.36.* - wget - wheel name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a9f793b15f5..84a6525bf0c 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -16,12 +16,12 @@ dependencies: - cmake>=3.26.4 - cuda-nvcc - cuda-version=12.0 -- cudf==23.12.* +- cudf==24.2.* - cupy>=12.0.0 - cxx-compiler - cython>=3.0.0 -- dask-cuda==23.12.* -- dask-cudf==23.12.* +- dask-cuda==24.2.* +- dask-cudf==24.2.* - doxygen - fsspec>=0.6.0 - 
gcc_linux-64=11.* @@ -29,11 +29,11 @@ dependencies: - graphviz - gtest>=1.13.0 - ipython -- libcudf==23.12.* -- libcugraphops==23.12.* -- libraft-headers==23.12.* -- libraft==23.12.* -- librmm==23.12.* +- libcudf==24.2.* +- libcugraphops==24.2.* +- libraft-headers==24.2.* +- libraft==24.2.* +- librmm==24.2.* - nbsphinx - nccl>=2.9.9 - networkx>=2.5.1 @@ -48,21 +48,21 @@ dependencies: - pandas - pre-commit - pydata-sphinx-theme -- pylibcugraphops==23.12.* -- pylibraft==23.12.* -- pylibwholegraph==23.12.* +- pylibcugraphops==24.2.* +- pylibraft==24.2.* +- pylibwholegraph==24.2.* - pytest - pytest-benchmark - pytest-cov - pytest-mpl - pytest-xdist - python-louvain -- raft-dask==23.12.* -- rapids-dask-dependency==23.12.* +- raft-dask==24.2.* +- rapids-dask-dependency==24.2.* - recommonmark - requests -- rmm==23.12.* -- scikit-build>=0.13.1 +- rmm==24.2.* +- scikit-build-core>=0.7.0 - scikit-learn>=0.23.1 - scipy - setuptools>=61.0.0 @@ -71,7 +71,7 @@ dependencies: - sphinx<6 - sphinxcontrib-websupport - ucx-proc=*=gpu -- ucx-py==0.35.* +- ucx-py==0.36.* - wget - wheel name: all_cuda-120_arch-x86_64 diff --git a/conda/recipes/cugraph-equivariant/build.sh b/conda/recipes/cugraph-equivariant/build.sh new file mode 100644 index 00000000000..f0ff1688b55 --- /dev/null +++ b/conda/recipes/cugraph-equivariant/build.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory + +./build.sh cugraph-equivariant diff --git a/conda/recipes/cugraph-equivariant/meta.yaml b/conda/recipes/cugraph-equivariant/meta.yaml new file mode 100644 index 00000000000..a952812f845 --- /dev/null +++ b/conda/recipes/cugraph-equivariant/meta.yaml @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') + environ.get('VERSION_SUFFIX', '') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cugraph-equivariant + version: {{ version }} + +source: + path: ../../.. + +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + +requirements: + host: + - python + run: + - pylibcugraphops ={{ minor_version }} + - python + +test: + imports: + - cugraph_equivariant + +about: + home: https://rapids.ai/ + dev_url: https://github.com/rapidsai/cugraph + license: Apache-2.0 + license_file: ../../../LICENSE + summary: GPU-accelerated equivariant convolutional layers.
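The new `test_wheel_*` scripts above all derive the PyTorch wheel-index tag from `CUDA_VERSION`: keep major.minor, drop the dot, and map any CUDA 12 to `cu121`, which the scripts assume is the available CUDA 12 index. The same logic in Python form:

```python
def pytorch_cuda_tag(cuda_version: str) -> str:
    # "11.8.0" -> "118"; any 12.x -> "121" (the index the scripts assume).
    major, minor = cuda_version.split(".")[:2]
    pkg_ver = major + minor
    return "121" if pkg_ver.startswith("12") else pkg_ver

assert pytorch_cuda_tag("11.8.0") == "118"
assert pytorch_cuda_tag("12.0.1") == "121"
print(f"https://download.pytorch.org/whl/cu{pytorch_cuda_tag('11.8.0')}")
```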
diff --git a/conda/recipes/cugraph-pyg/meta.yaml b/conda/recipes/cugraph-pyg/meta.yaml index a2a02a1d9f6..c07b97cd5da 100644 --- a/conda/recipes/cugraph-pyg/meta.yaml +++ b/conda/recipes/cugraph-pyg/meta.yaml @@ -24,7 +24,7 @@ requirements: host: - cython >=3.0.0 - python - - scikit-build >=0.13.1 + - scikit-build-core >=0.7.0 run: - rapids-dask-dependency ={{ minor_version }} - numba >=0.57 diff --git a/conda/recipes/cugraph-service/conda_build_config.yaml b/conda/recipes/cugraph-service/conda_build_config.yaml index b971a73fd39..6a0124983fd 100644 --- a/conda/recipes/cugraph-service/conda_build_config.yaml +++ b/conda/recipes/cugraph-service/conda_build_config.yaml @@ -1,2 +1,2 @@ ucx_py_version: - - "0.35.*" + - "0.36.*" diff --git a/conda/recipes/cugraph/conda_build_config.yaml b/conda/recipes/cugraph/conda_build_config.yaml index c03d515b9f6..387f3451d8d 100644 --- a/conda/recipes/cugraph/conda_build_config.yaml +++ b/conda/recipes/cugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.35.*" + - "0.36.*" diff --git a/conda/recipes/cugraph/meta.yaml b/conda/recipes/cugraph/meta.yaml index 58b9ea220d4..b8e3072dd38 100644 --- a/conda/recipes/cugraph/meta.yaml +++ b/conda/recipes/cugraph/meta.yaml @@ -61,7 +61,7 @@ requirements: - python - raft-dask ={{ minor_version }} - rmm ={{ minor_version }} - - scikit-build >=0.13.1 + - scikit-build-core >=0.7.0 - setuptools run: - aiohttp diff --git a/conda/recipes/pylibcugraph/conda_build_config.yaml b/conda/recipes/pylibcugraph/conda_build_config.yaml index c03d515b9f6..387f3451d8d 100644 --- a/conda/recipes/pylibcugraph/conda_build_config.yaml +++ b/conda/recipes/pylibcugraph/conda_build_config.yaml @@ -17,4 +17,4 @@ sysroot_version: - "2.17" ucx_py_version: - - "0.35.*" + - "0.36.*" diff --git a/conda/recipes/pylibcugraph/meta.yaml b/conda/recipes/pylibcugraph/meta.yaml index ad59c4de66f..0f66f55ccaa 100644 --- a/conda/recipes/pylibcugraph/meta.yaml +++ b/conda/recipes/pylibcugraph/meta.yaml @@ -58,7 +58,7 @@ requirements: - libcugraph ={{ version }} - pylibraft ={{ minor_version }} - python - - scikit-build >=0.13.1 + - scikit-build-core >=0.7.0 - setuptools run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 836d5569ef7..ecc2ebf06d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH) -project(CUGRAPH VERSION 23.12.00 LANGUAGES C CXX CUDA) +project(CUGRAPH VERSION 24.02.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) @@ -41,7 +41,7 @@ endif() # cuhornet currently doesn't support # # >= 90 -set(supported_archs "60" "62" "70" "72" "75" "80" "86" "89" "90") +set(supported_archs "70" "72" "75" "80" "86" "89" "90") foreach( arch IN LISTS CMAKE_CUDA_ARCHITECTURES) string(REPLACE "-real" "" arch ${arch}) if( arch IN_LIST supported_archs ) @@ -142,8 +142,8 @@ rapids_cpm_init() ### # Linking to the `raft::raft` target implicitly links cugraph targets to the # following public header-only raft dependencies: +# * CCCL # * RMM -# * Thrust # * GTest/GMock # # The CMakeLists.txt for each of these projects are properly configured @@ -153,16 +153,14 @@ rapids_cpm_init() # lags behind. ### -# Need to make sure rmm is found before cuco so that rmm patches the libcudacxx -# directory to be found by cuco. +# Need CCCL, then rmm, then cuCollections, then RAFT. +# This ensures that libraries can be overridden for testing. +include(cmake/thirdparty/get_cccl.cmake) include(${rapids-cmake-dir}/cpm/rmm.cmake) rapids_cpm_rmm(BUILD_EXPORT_SET cugraph-exports - INSTALL_EXPORT_SET cugraph-exports) -# Putting this before raft to override RAFT from pulling them in. -include(cmake/thirdparty/get_libcudacxx.cmake) + INSTALL_EXPORT_SET cugraph-exports) include(${rapids-cmake-dir}/cpm/cuco.cmake) rapids_cpm_cuco(BUILD_EXPORT_SET cugraph-exports INSTALL_EXPORT_SET cugraph-exports) - include(cmake/thirdparty/get_raft.cmake) if(USE_CUGRAPH_OPS) @@ -189,6 +187,7 @@ endif() set(CUGRAPH_SOURCES src/detail/shuffle_vertices.cu + src/detail/permute_range.cu src/detail/shuffle_vertex_pairs.cu src/detail/collect_local_vertex_values.cu src/detail/groupby_and_count.cu @@ -220,6 +219,8 @@ set(CUGRAPH_SOURCES src/community/louvain_mg.cu src/community/leiden_sg.cu src/community/leiden_mg.cu + src/community/ecg_sg.cu + src/community/ecg_mg.cu src/community/legacy/louvain.cu src/community/legacy/ktruss.cu src/community/legacy/ecg.cu @@ -423,7 +424,7 @@ add_library(cugraph_c src/c_api/core_result.cpp src/c_api/extract_ego.cpp src/c_api/k_core.cpp - src/c_api/hierarchical_clustering_result.cpp + src/c_api/hierarchical_clustering_result.cpp src/c_api/induced_subgraph.cpp src/c_api/capi_helper.cu src/c_api/legacy_spectral.cpp diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_cccl.cmake similarity index 68% rename from cpp/cmake/thirdparty/get_libcudacxx.cmake rename to cpp/cmake/thirdparty/get_cccl.cmake index 1c51c5a84a9..72b53d4c833 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_cccl.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,12 +12,10 @@ # the License. # ============================================================================= -# This function finds libcudacxx and sets any additional necessary environment variables. 
-function(find_and_configure_libcudacxx) - include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - - rapids_cpm_libcudacxx(BUILD_EXPORT_SET cugraph-exports) - +# This function finds CCCL and sets any additional necessary environment variables. +function(find_and_configure_cccl) + include(${rapids-cmake-dir}/cpm/cccl.cmake) + rapids_cpm_cccl(BUILD_EXPORT_SET cugraph-exports INSTALL_EXPORT_SET cugraph-exports) endfunction() -find_and_configure_libcudacxx() +find_and_configure_cccl() diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake index 015b5b07920..8f56372c81a 100644 --- a/cpp/cmake/thirdparty/get_raft.cmake +++ b/cpp/cmake/thirdparty/get_raft.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -52,6 +52,7 @@ function(find_and_configure_raft) "RAFT_COMPILE_LIBRARY ${PKG_COMPILE_RAFT_LIB}" "BUILD_TESTS OFF" "BUILD_BENCH OFF" + "BUILD_CAGRA_HNSWLIB OFF" ) if(raft_ADDED) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 6946bd38bfe..3b74956e121 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -48,7 +48,7 @@ PROJECT_NAME = libcugraph # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 23.12 +PROJECT_NUMBER = 24.02 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 8501eedce5c..bb721468106 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -541,30 +541,37 @@ weight_t hungarian(raft::handle_t const& handle, * community hierarchies in large networks, J Stat Mech P10008 (2008), * http://arxiv.org/abs/0803.0476 * - * @throws cugraph::logic_error when an error occurs. - * - * @tparam graph_view_t Type of graph + * @throws cugraph::logic_error when an error occurs. * - * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, - * @param[in] graph input graph object - * @param[out] clustering Pointer to device array where the clustering should be stored - * @param[in] max_level (optional) maximum number of levels to run (default 100) - * @param[in] threshold (optional) threshold for convergence at each level (default - * 1e-7) - * @param[in] resolution (optional) The value of the resolution parameter to use. - * Called gamma in the modularity formula, this changes the size - * of the communities. Higher resolutions lead to more smaller - * communities, lower resolutions lead to fewer larger - * communities. (default 1) + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. 
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) or multi-GPU (true). * - * @return a pair containing: - * 1) number of levels of the returned clustering - * 2) modularity of the returned clustering + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] rng_state The RngState instance holding pseudo-random number generator state. + * @param[in] graph_view Input graph view object. + * @param[in] edge_weight_view Optional view object holding edge weights for @p graph_view. + * If @p edge_weight_view.has_value() == false, edge weights + * are assumed to be 1.0. + * @param[out] clustering Pointer to device array where the clustering should be stored + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] threshold (optional) threshold for convergence at each level (default 1e-7) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * + * @return a pair containing: + * 1) number of levels of the returned clustering + * 2) modularity of the returned clustering * */ template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu> std::pair<size_t, weight_t> louvain( raft::handle_t const& handle, + std::optional<std::reference_wrapper<raft::random::RngState>> rng_state, graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view, std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view, vertex_t* clustering, @@ -593,25 +600,33 @@ std::pair<size_t, weight_t> louvain( * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_view_t Type of graph - * - * @param[in] handle Library handle (RAFT) - * @param[in] graph_view Input graph view object - * @param[in] max_level (optional) maximum number of levels to run (default 100) - * @param[in] resolution (optional) The value of the resolution parameter to use. - * Called gamma in the modularity formula, this changes the size - * of the communities. Higher resolutions lead to more smaller - * communities, lower resolutions lead to fewer larger - * communities. (default 1) + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) or multi-GPU (true). * - * @return a pair containing: - * 1) unique pointer to dendrogram - * 2) modularity of the returned clustering + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] rng_state The RngState instance holding pseudo-random number generator state. + * @param[in] graph_view Input graph view object. + * @param[in] edge_weight_view Optional view object holding edge weights for @p graph_view. + * If @p edge_weight_view.has_value() == false, edge weights + * are assumed to be 1.0. + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] threshold (optional) threshold for convergence at each level (default 1e-7) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * @return a pair containing: + * 1) unique pointer to dendrogram + * 2) modularity of the returned clustering * */ template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu> std::pair<std::unique_ptr<Dendrogram<vertex_t>>, weight_t> louvain( raft::handle_t const& handle, + std::optional<std::reference_wrapper<raft::random::RngState>> rng_state, graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view, std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view, size_t max_level = 100, @@ -779,6 +794,55 @@ void ecg(raft::handle_t const& handle, vertex_t ensemble_size, vertex_t* clustering); +/** + * @brief Computes the ecg clustering of the given graph. + * + * ECG runs truncated Louvain on an ensemble of permutations of the input graph, + * then uses the ensemble partitions to determine weights for the input graph. + * The final result is found by running full Louvain on the input graph using + * the determined weights. See https://arxiv.org/abs/1809.05578 for further + * information. + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam edge_t Type of edge identifiers. Needs to be an integral type. + * @tparam weight_t Type of edge weights. Needs to be a floating point type. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) or multi-GPU (true). + * + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] rng_state The RngState instance holding pseudo-random number generator state. + * @param[in] graph_view Input graph view object + * @param[in] edge_weight_view View object holding edge weights for @p graph_view. + * @param[in] min_weight Minimum edge weight to use in the final call of the clustering + * algorithm if an edge does not appear in any of the ensemble runs. + * @param[in] ensemble_size The ensemble size parameter + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] threshold (optional) threshold for convergence at each level (default 1e-7) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * + * @return a tuple containing: + * 1) Device vector containing clustering result + * 2) number of levels of the returned clustering + * 3) modularity of the returned clustering + * + */ +template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu> +std::tuple<rmm::device_uvector<vertex_t>, size_t, weight_t> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t<vertex_t, edge_t, false, multi_gpu> const& graph_view, + std::optional<edge_property_view_t<edge_t, weight_t const*>> edge_weight_view, + weight_t min_weight, + size_t ensemble_size, + size_t max_level = 100, + weight_t threshold = weight_t{1e-7}, + weight_t resolution = weight_t{1}); + /** * @brief Generate edges in a minimum spanning forest of an undirected weighted graph. * diff --git a/cpp/include/cugraph/detail/collect_comm_wrapper.hpp b/cpp/include/cugraph/detail/collect_comm_wrapper.hpp index b791c593f41..4a2f5d7c44e 100644 --- a/cpp/include/cugraph/detail/collect_comm_wrapper.hpp +++ b/cpp/include/cugraph/detail/collect_comm_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
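For reference, the `resolution` parameter documented above for both Louvain overloads and the new ECG is the gamma in the standard resolution-parameterized modularity; this formula is background material, not text from the header:

```latex
Q = \frac{1}{2m} \sum_{i,j} \left[ A_{ij} - \gamma \, \frac{k_i k_j}{2m} \right] \delta(c_i, c_j)
```

Here A is the (weighted) adjacency matrix, k_i the weighted degree of vertex i, m the total edge weight, and delta(c_i, c_j) = 1 when i and j share a community. Larger gamma penalizes the null model more, yielding more and smaller communities, which matches the parameter description in the docs above.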
@@ -15,6 +15,7 @@
 */
 #pragma once
 
+#include
 #include
 #include
 
diff --git a/cpp/include/cugraph/detail/shuffle_wrappers.hpp b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
index 55ea6a0e355..c77ecb7aa01 100644
--- a/cpp/include/cugraph/detail/shuffle_wrappers.hpp
+++ b/cpp/include/cugraph/detail/shuffle_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include <raft/random/rng_state.hpp>
 #include
 #include
 
@@ -138,6 +139,28 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning(
   rmm::device_uvector<vertex_t>&& vertices,
   rmm::device_uvector<value_t>&& values);
 
+/**
+ * @brief Permute a range.
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ *
+ * @param[in] handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator,
+ * and handles to various CUDA libraries) to run graph algorithms.
+ * @param[in] rng_state The RngState instance holding pseudo-random number generator state.
+ * @param[in] local_start Start of local range assigned to this process.
+ * @param[in] local_range_size Size of local range assigned to this process.
+ * @param[in] multi_gpu Flag indicating whether the range is distributed across multiple GPUs
+ * (default false).
+ * @param[in] do_expensive_check A flag to run expensive checks for input arguments (if set to
+ * true).
+ *
+ * @return permuted range.
+ */
+template <typename vertex_t>
+rmm::device_uvector<vertex_t> permute_range(raft::handle_t const& handle,
+                                            raft::random::RngState& rng_state,
+                                            vertex_t local_start,
+                                            vertex_t local_range_size,
+                                            bool multi_gpu = false,
+                                            bool do_expensive_check = false);
+
 /**
 * @brief Shuffle internal (i.e. renumbered) vertices to their local GPUs based on vertex
 * partitioning.
diff --git a/cpp/include/cugraph/detail/utility_wrappers.hpp b/cpp/include/cugraph/detail/utility_wrappers.hpp
index faa0fbb841b..61ac1bd2804 100644
--- a/cpp/include/cugraph/detail/utility_wrappers.hpp
+++ b/cpp/include/cugraph/detail/utility_wrappers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -174,5 +174,20 @@ bool is_equal(raft::handle_t const& handle,
               raft::device_span<data_t> span1,
               raft::device_span<data_t> span2);
 
+/**
+ * @brief Count the number of times a value appears in a span
+ *
+ * @tparam data_t type of data in span
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param span The span of data to search
+ * @param value The value to count
+ * @return The count of how many instances of that value occur
+ */
+template <typename data_t>
+size_t count_values(raft::handle_t const& handle,
+                    raft::device_span<data_t const> span,
+                    data_t value);
+
 } // namespace detail
 } // namespace cugraph
diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh
index 213f9b9497a..d1c2cf3df52 100644
--- a/cpp/include/cugraph/edge_partition_device_view.cuh
+++ b/cpp/include/cugraph/edge_partition_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
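The count_values helper declared just above is what the C API uses later in this patch (see the bfs.cpp hunk) to reject invalid device-side input before launching a traversal. A sketch of that validation pattern, assuming int32_t vertex ids (not part of the diff):

#include <cugraph/detail/utility_wrappers.hpp>
#include <raft/core/device_span.hpp>
#include <rmm/device_uvector.hpp>

size_t count_invalid_sources(raft::handle_t const& handle,
                             rmm::device_uvector<int32_t> const& sources)
{
  // invalid_vertex_id<vertex_t>::value marks vertices that failed renumbering
  return cugraph::detail::count_values(
    handle,
    raft::device_span<int32_t const>{sources.data(), sources.size()},
    cugraph::invalid_vertex_id<int32_t>::value);
}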
@@ -298,6 +298,20 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + if (major_hypersparse_first_ && (major >= *major_hypersparse_first_)) { + auto major_hypersparse_idx = + detail::major_hypersparse_idx_from_major_nocheck_impl(*dcs_nzd_vertices_, major); + return major_hypersparse_idx + ? thrust::make_optional((*major_hypersparse_first_ - major_range_first_) + + *major_hypersparse_idx) + : thrust::nullopt; + } else { + return major - major_range_first_; + } + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { if (major_hypersparse_first_) { @@ -339,6 +353,7 @@ class edge_partition_device_view_t{(*dcs_nzd_vertices_).data()} : thrust::nullopt; } + __host__ __device__ thrust::optional dcs_nzd_vertex_count() const { return dcs_nzd_vertices_ @@ -460,6 +475,11 @@ class edge_partition_device_view_t major_idx_from_major_nocheck(vertex_t major) const noexcept + { + return major_offset_from_major_nocheck(major); + } + __device__ vertex_t major_from_major_idx_nocheck(vertex_t major_idx) const noexcept { return major_from_major_offset_nocheck(major_idx); diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 60b9f1a4054..a723fde24df 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -90,24 +90,25 @@ class graph_t meta, bool do_expensive_check = false); + edge_t number_of_edges() const { return this->number_of_edges_; } + graph_view_t view() const { - std::vector offsets(edge_partition_offsets_.size(), nullptr); - std::vector indices(edge_partition_indices_.size(), nullptr); - auto dcs_nzd_vertices = edge_partition_dcs_nzd_vertices_ - ? std::make_optional>( - (*edge_partition_dcs_nzd_vertices_).size(), nullptr) - : std::nullopt; - auto dcs_nzd_vertex_counts = edge_partition_dcs_nzd_vertex_counts_ - ? std::make_optional>( - (*edge_partition_dcs_nzd_vertex_counts_).size(), vertex_t{0}) - : std::nullopt; + std::vector> offsets(edge_partition_offsets_.size()); + std::vector> indices(edge_partition_indices_.size()); + auto dcs_nzd_vertices = edge_partition_dcs_nzd_vertices_ + ? 
std::make_optional<std::vector<raft::device_span<vertex_t const>>>(
+          (*edge_partition_dcs_nzd_vertices_).size())
+        : std::nullopt;
 
     for (size_t i = 0; i < offsets.size(); ++i) {
-      offsets[i] = edge_partition_offsets_[i].data();
-      indices[i] = edge_partition_indices_[i].data();
+      offsets[i] = raft::device_span<edge_t const>(edge_partition_offsets_[i].data(),
+                                                   edge_partition_offsets_[i].size());
+      indices[i] = raft::device_span<vertex_t const>(edge_partition_indices_[i].data(),
+                                                     edge_partition_indices_[i].size());
       if (dcs_nzd_vertices) {
-        (*dcs_nzd_vertices)[i]      = (*edge_partition_dcs_nzd_vertices_)[i].data();
-        (*dcs_nzd_vertex_counts)[i] = (*edge_partition_dcs_nzd_vertex_counts_)[i];
+        (*dcs_nzd_vertices)[i] =
+          raft::device_span<vertex_t const>((*edge_partition_dcs_nzd_vertices_)[i].data(),
+                                            (*edge_partition_dcs_nzd_vertices_)[i].size());
       }
     }
@@ -196,15 +197,13 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>
     return graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>(
-      *(this->handle_ptr()),
       offsets,
       indices,
       dcs_nzd_vertices,
-      dcs_nzd_vertex_counts,
       graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{
         this->number_of_vertices(),
         this->number_of_edges(),
-        this->graph_properties(),
+        this->properties_,
         partition_,
         edge_partition_segment_offsets_,
         local_sorted_unique_edge_srcs,
@@ -224,7 +223,6 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>
   std::optional<std::vector<rmm::device_uvector<vertex_t>>> edge_partition_dcs_nzd_vertices_{
     std::nullopt};
-  std::optional<std::vector<vertex_t>> edge_partition_dcs_nzd_vertex_counts_{std::nullopt};
 
   partition_t<vertex_t> partition_{};
 
   // segment offsets within the vertex partition based on vertex degree
@@ -283,16 +281,15 @@ class graph_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>
           graph_meta_t<vertex_t, edge_t, multi_gpu> meta,
           bool do_expensive_check = false);
 
+  edge_t number_of_edges() const { return this->number_of_edges_; }
+
   graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu> view() const
   {
     return graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu>(
-      *(this->handle_ptr()),
-      offsets_.data(),
-      indices_.data(),
-      graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{this->number_of_vertices(),
-                                                                       this->number_of_edges(),
-                                                                       this->graph_properties(),
-                                                                       segment_offsets_});
+      raft::device_span<edge_t const>(offsets_.data(), offsets_.size()),
+      raft::device_span<vertex_t const>(indices_.data(), indices_.size()),
+      graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu>{
+        this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_});
   }
 
 private:
diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 6a75a420bf8..6684d31d8fd 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -1005,9 +1005,14 @@ remove_self_loops(raft::handle_t const& handle,
   std::optional<rmm::device_uvector<edge_type_t>>&& edgelist_edge_types);
 
 /**
- * @brief Remove all but one edge when a multi-edge exists. Note that this function does not use
- * stable methods. When a multi-edge exists, one of the edges will remain, there is no
- * guarantee on which one will remain.
+ * @brief Remove all but one edge when a multi-edge exists.
+ *
+ * When a multi-edge exists, one of the edges will remain. If @p keep_min_value_edge is false, an
+ * arbitrary edge will be selected among the edges in the multi-edge. If @p keep_min_value_edge is
+ * true, the edge with the minimum value will be selected. Edge weights will be compared first
+ * (if @p edgelist_weights.has_value() is true); edge IDs will be compared next (if @p
+ * edgelist_edge_ids.has_value() is true); and edge types (if @p edgelist_edge_types.has_value() is
+ * true) will be compared last.
 *
 * In an MG context it is assumed that edges have been shuffled to the proper GPU,
 * in which case any multi-edges will be on the same GPU.
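A hypothetical call site for the updated remove_multi_edges() (not part of the diff; the parameter documentation follows in the next hunk). Keeping the minimum-value edge makes the reduction deterministic, which the SG/MG create_graph functors later in this patch rely on when the graph is flagged symmetric:

// Sketch: assumes srcs/dsts (rmm::device_uvector<int32_t>) and the optional
// weights/ids/types columns were already shuffled to this GPU, as the brief
// above requires in an MG context.
std::tie(srcs, dsts, weights, ids, types) =
  cugraph::remove_multi_edges(handle,
                              std::move(srcs),
                              std::move(dsts),
                              std::move(weights),
                              std::move(ids),
                              std::move(types),
                              true /* keep_min_value_edge */);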
@@ -1024,6 +1029,11 @@ remove_self_loops(raft::handle_t const& handle, * @param edgelist_weights Optional list of edge weights * @param edgelist_edge_ids Optional list of edge ids * @param edgelist_edge_types Optional list of edge types + * @param keep_min_value_edge Flag indicating whether to keep an arbitrary edge (false) or the + * minimum value edge (true) among the edges in a multi-edge. Relevant only if @p + * edgelist_weights.has_value() | @p edgelist_edge_ids.has_value() | @p + * edgelist_edge_types.has_value() is true. Setting this to true incurs performance overhead as this + * requires more comparisons. * @return Tuple of vectors storing edge sources, destinations, optional weights, * optional edge ids, optional edge types. */ @@ -1038,6 +1048,7 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge = false); } // namespace cugraph diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index f30a8b7e2af..93d884a56d9 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -258,17 +258,12 @@ class graph_base_t { public: graph_base_t() = default; - graph_base_t(raft::handle_t const& handle, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties) - : handle_ptr_(&handle), - number_of_vertices_(number_of_vertices), + graph_base_t(vertex_t number_of_vertices, edge_t number_of_edges, graph_properties_t properties) + : number_of_vertices_(number_of_vertices), number_of_edges_(number_of_edges), properties_(properties){}; vertex_t number_of_vertices() const { return number_of_vertices_; } - edge_t number_of_edges() const { return number_of_edges_; } template std::enable_if_t::value, bool> is_valid_vertex(vertex_type v) const @@ -286,16 +281,11 @@ class graph_base_t { bool is_multigraph() const { return properties_.is_multigraph; } protected: - raft::handle_t const* handle_ptr() const { return handle_ptr_; }; - graph_properties_t graph_properties() const { return properties_; } + edge_t number_of_edges_{0}; + graph_properties_t properties_{}; private: - raft::handle_t const* handle_ptr_{nullptr}; - vertex_t number_of_vertices_{0}; - edge_t number_of_edges_{0}; - - graph_properties_t properties_{}; }; } // namespace detail @@ -385,11 +375,10 @@ class graph_view_t const& edge_partition_offsets, - std::vector const& edge_partition_indices, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, + graph_view_t(std::vector> const& edge_partition_offsets, + std::vector> const& edge_partition_indices, + std::optional>> const& + edge_partition_dcs_nzd_vertices, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -604,25 +593,16 @@ class graph_view_tlocal_edge_partition_src_value_start_offset(partition_idx); } std::optional major_hypersparse_first{std::nullopt}; - vertex_t offset_size = (major_range_last - major_range_first) + 1; if (this->use_dcs()) { major_hypersparse_first = major_range_first + (*(this->local_edge_partition_segment_offsets( 
partition_idx)))[detail::num_sparse_segments_per_vertex_partition];
-      offset_size = ((*major_hypersparse_first) - major_range_first) +
-                    (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx] + 1;
     }
     return edge_partition_view_t<vertex_t, edge_t, multi_gpu>(
-      raft::device_span<edge_t const>(edge_partition_offsets_[partition_idx],
-                                      edge_partition_offsets_[partition_idx] + offset_size),
-      raft::device_span<vertex_t const>(
-        edge_partition_indices_[partition_idx],
-        edge_partition_indices_[partition_idx] + edge_partition_number_of_edges_[partition_idx]),
+      edge_partition_offsets_[partition_idx],
+      edge_partition_indices_[partition_idx],
       edge_partition_dcs_nzd_vertices_
-        ? std::make_optional<raft::device_span<vertex_t const>>(
-            (*edge_partition_dcs_nzd_vertices_)[partition_idx],
-            (*edge_partition_dcs_nzd_vertices_)[partition_idx] +
-              (*edge_partition_dcs_nzd_vertex_counts_)[partition_idx])
+        ? std::make_optional((*edge_partition_dcs_nzd_vertices_)[partition_idx])
         : std::nullopt,
       major_hypersparse_first,
       major_range_first,
@@ -632,6 +612,16 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>
   }
 
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
+  // masking)
+  edge_t number_of_edges() const
+  {
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
+    return this->number_of_edges_;
+  }
+
+  edge_t compute_number_of_edges(raft::handle_t const& handle) const;
+
   rmm::device_uvector<edge_t> compute_in_degrees(raft::handle_t const& handle) const;
   rmm::device_uvector<edge_t> compute_out_degrees(raft::handle_t const& handle) const;
@@ -641,6 +631,19 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>
 
+  rmm::device_uvector<bool> has_edge(raft::handle_t const& handle,
+                                     /* (edge_srcs, edge_dsts) should be pre-shuffled */
+                                     raft::device_span<vertex_t const> edge_srcs,
+                                     raft::device_span<vertex_t const> edge_dsts,
+                                     bool do_expensive_check = false);
+
+  rmm::device_uvector<edge_t> compute_multiplicity(
+    raft::handle_t const& handle,
+    /* (edge_srcs, edge_dsts) should be pre-shuffled */
+    raft::device_span<vertex_t const> edge_srcs,
+    raft::device_span<vertex_t const> edge_dsts,
+    bool do_expensive_check = false);
+
   template <bool transposed = is_storage_transposed>
   std::enable_if_t<transposed, std::optional<raft::device_span<vertex_t const>>>
   local_sorted_unique_edge_srcs() const
@@ -746,14 +749,11 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<multi_gpu>>
 
-  std::vector<edge_t const*> edge_partition_offsets_{};
-  std::vector<vertex_t const*> edge_partition_indices_{};
+  std::vector<raft::device_span<edge_t const>> edge_partition_offsets_{};
+  std::vector<raft::device_span<vertex_t const>> edge_partition_indices_{};
 
   // relevant only if we use the CSR + DCSR (or CSC + DCSC) hybrid format
-  std::optional<std::vector<vertex_t const*>> edge_partition_dcs_nzd_vertices_{};
-  std::optional<std::vector<vertex_t>> edge_partition_dcs_nzd_vertex_counts_{};
-
-  std::vector<edge_t> edge_partition_number_of_edges_{};
+  std::optional<std::vector<raft::device_span<vertex_t const>>> edge_partition_dcs_nzd_vertices_{};
 
   partition_t<vertex_t> partition_{};
 
@@ -804,9 +804,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>
-  graph_view_t(raft::handle_t const& handle,
-               edge_t const* offsets,
-               vertex_t const* indices,
+  graph_view_t(raft::device_span<edge_t const> offsets,
+               raft::device_span<vertex_t const> indices,
                graph_view_meta_t<vertex_t, edge_t, store_transposed, multi_gpu> meta);
 
   std::vector<vertex_t> vertex_partition_range_offsets() const
@@ -920,11 +919,19 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>
     return edge_partition_view_t<vertex_t, edge_t, false>(
-      raft::device_span<edge_t const>(offsets_, offsets_ + (this->number_of_vertices() + 1)),
-      raft::device_span<vertex_t const>(indices_, indices_ + this->number_of_edges()),
-      this->number_of_vertices());
+      offsets_, indices_, this->number_of_vertices());
   }
 
+  // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge
+  // masking)
+  edge_t number_of_edges() const
+  {
+    CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented.");
+    return this->number_of_edges_;
+  }
+
+  edge_t compute_number_of_edges(raft::handle_t const& handle) const;
+
   rmm::device_uvector<edge_t> compute_in_degrees(raft::handle_t const& handle) const;
   rmm::device_uvector<edge_t> compute_out_degrees(raft::handle_t const& handle) const;
@@ -934,6 +941,16 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>
 
+  rmm::device_uvector<bool> has_edge(raft::handle_t const& handle,
+                                     raft::device_span<vertex_t const> edge_srcs,
+                                     raft::device_span<vertex_t const> edge_dsts,
+                                     bool do_expensive_check = false);
+
+  rmm::device_uvector<edge_t> compute_multiplicity(raft::handle_t const& handle,
+                                                   raft::device_span<vertex_t const> edge_srcs,
+                                                   raft::device_span<vertex_t const> edge_dsts,
+                                                   bool do_expensive_check = false);
+
   template <bool transposed = is_storage_transposed>
   std::enable_if_t<transposed, std::optional<raft::device_span<vertex_t const>>>
   local_sorted_unique_edge_srcs() const
@@ -1027,8 +1044,8 @@ class graph_view_t<vertex_t, edge_t, store_transposed, multi_gpu, std::enable_if_t<!multi_gpu>>
 
-  edge_t const* offsets_{nullptr};
-  vertex_t const* indices_{nullptr};
+  raft::device_span<edge_t const> offsets_{};
+  raft::device_span<vertex_t const> indices_{};
 
   // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered
   std::optional<std::vector<vertex_t>> segment_offsets_{std::nullopt};
 
diff --git a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
index c4cacb401af..5fbe7bc9f01 100644
--- a/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
+++ b/cpp/include/cugraph/mtmg/detail/device_shared_wrapper.hpp
@@ -57,10 +57,10 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
     CUGRAPH_EXPECTS(pos == objects_.end(), "Cannot overwrite wrapped object");
 
-    objects_.insert(std::make_pair(handle.get_local_rank(), std::move(obj)));
+    objects_.insert(std::make_pair(handle.get_rank(), std::move(obj)));
   }
 
   /**
@@ -79,7 +79,6 @@ class device_shared_wrapper_t {
 
     objects_.insert(std::make_pair(local_rank, std::move(obj)));
   }
 
-  public:
   /**
   * @brief Get reference to an object for a particular thread
   *
@@ -90,7 +89,7 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
     CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object");
 
     return pos->second;
@@ -106,7 +105,7 @@ class device_shared_wrapper_t {
   {
     std::lock_guard<std::mutex> lock(lock_);
 
-    auto pos = objects_.find(handle.get_local_rank());
+    auto pos = objects_.find(handle.get_rank());
     CUGRAPH_EXPECTS(pos != objects_.end(), "Uninitialized wrapped object");
 
diff --git a/cpp/include/cugraph/mtmg/graph_view.hpp b/cpp/include/cugraph/mtmg/graph_view.hpp
index 94347e016ea..8e202ab4904 100644
--- a/cpp/include/cugraph/mtmg/graph_view.hpp
+++ b/cpp/include/cugraph/mtmg/graph_view.hpp
@@ -27,8 +27,27 @@ namespace mtmg {
 * @brief Graph view for each GPU
 */
 template <typename vertex_t, typename edge_t, bool multi_gpu>
-using graph_view_t = detail::device_shared_wrapper_t<
-  cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>>;
+class graph_view_t : public detail::device_shared_wrapper_t<
+                       cugraph::graph_view_t<vertex_t, edge_t, false, multi_gpu>> {
+ public:
+  /**
+   * @brief Get the vertex_partition_view for this graph
+   */
+  vertex_partition_view_t<vertex_t, multi_gpu> get_vertex_partition_view(
+    cugraph::mtmg::handle_t const& handle) const
+  {
+    return this->get(handle).local_vertex_partition_view();
+  }
+
+  /**
+   * @brief Get the vertex partition range lasts for this graph
+   */
+  std::vector<vertex_t> get_vertex_partition_range_lasts(
+    cugraph::mtmg::handle_t const& handle) const
+  {
+    return this->get(handle).vertex_partition_range_lasts();
+  }
+};
 
 } // namespace mtmg
 } // namespace cugraph
diff --git a/cpp/include/cugraph/mtmg/handle.hpp b/cpp/include/cugraph/mtmg/handle.hpp
index 6223de1781d..0b02091a3cc 100644
--- a/cpp/include/cugraph/mtmg/handle.hpp
+++ b/cpp/include/cugraph/mtmg/handle.hpp
@@ -32,18 +32,19 @@ namespace mtmg {
 *
 */
 class handle_t {
+  handle_t(handle_t const&)           = delete;
+  handle_t operator=(handle_t const&) = delete;
+
 public:
  /**
   * @brief Constructor
   *
   * @param raft_handle Raft handle for the resources
   * @param thread_rank Rank for this thread
+  * @param device_id   Device id for the device this handle operates on
   */
-  handle_t(raft::handle_t const& raft_handle, int thread_rank, size_t device_id)
-    : raft_handle_(raft_handle),
-      thread_rank_(thread_rank),
-      local_rank_(raft_handle.get_comms().get_rank()),  // FIXME: update for multi-node
-      device_id_(device_id)
+
handle_t(raft::handle_t const& raft_handle, int thread_rank, rmm::cuda_device_id device_id) + : raft_handle_(raft_handle), thread_rank_(thread_rank), device_id_raii_(device_id) { } @@ -118,18 +119,10 @@ class handle_t { */ int get_rank() const { return raft_handle_.get_comms().get_rank(); } - /** - * @brief Get local gpu rank - * - * @return local gpu rank - */ - int get_local_rank() const { return local_rank_; } - private: raft::handle_t const& raft_handle_; int thread_rank_; - int local_rank_; - size_t device_id_; + rmm::cuda_set_device_raii device_id_raii_; }; } // namespace mtmg diff --git a/cpp/include/cugraph/mtmg/instance_manager.hpp b/cpp/include/cugraph/mtmg/instance_manager.hpp index f819a5a0abe..f60063c4101 100644 --- a/cpp/include/cugraph/mtmg/instance_manager.hpp +++ b/cpp/include/cugraph/mtmg/instance_manager.hpp @@ -47,15 +47,10 @@ class instance_manager_t { ~instance_manager_t() { - int current_device{}; - RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); - for (size_t i = 0; i < nccl_comms_.size(); ++i) { - RAFT_CUDA_TRY(cudaSetDevice(device_ids_[i].value())); + rmm::cuda_set_device_raii local_set_device(device_ids_[i]); RAFT_NCCL_TRY(ncclCommDestroy(*nccl_comms_[i])); } - - RAFT_CUDA_TRY(cudaSetDevice(current_device)); } /** @@ -75,8 +70,7 @@ class instance_manager_t { int gpu_id = local_id % raft_handle_.size(); int thread_id = local_id / raft_handle_.size(); - RAFT_CUDA_TRY(cudaSetDevice(device_ids_[gpu_id].value())); - return handle_t(*raft_handle_[gpu_id], thread_id, static_cast(gpu_id)); + return handle_t(*raft_handle_[gpu_id], thread_id, device_ids_[gpu_id]); } /** diff --git a/cpp/include/cugraph/mtmg/resource_manager.hpp b/cpp/include/cugraph/mtmg/resource_manager.hpp index 127944cf7ba..a9e4b81f894 100644 --- a/cpp/include/cugraph/mtmg/resource_manager.hpp +++ b/cpp/include/cugraph/mtmg/resource_manager.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ class resource_manager_t { local_rank_map_.insert(std::pair(global_rank, local_device_id)); - RAFT_CUDA_TRY(cudaSetDevice(local_device_id.value())); + rmm::cuda_set_device_raii local_set_device(local_device_id); // FIXME: There is a bug in the cuda_memory_resource that results in a Hang. // using the pool resource as a work-around. 
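The pattern adopted throughout these MTMG files: rmm::cuda_set_device_raii switches the current CUDA device for the lifetime of a scope and restores the previous device on destruction, so early returns and exceptions can no longer leave a stray device active (which is why the manual cudaGetDevice/cudaSetDevice save-restore pairs are deleted above). A standalone sketch, not part of the diff:

#include <rmm/cuda_device.hpp>

void work_on_device(rmm::cuda_device_id id)
{
  rmm::cuda_set_device_raii guard{id};  // switch to `id`, remember the old device
  // ... create streams / launch kernels on `id` ...
}  // guard's destructor restores the previously active device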
@@ -106,9 +106,9 @@ class resource_manager_t { auto per_device_it = per_device_rmm_resources_.insert( std::pair{global_rank, std::make_shared()}); #else - auto const [free, total] = rmm::detail::available_device_memory(); + auto const [free, total] = rmm::available_device_memory(); auto const min_alloc = - rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + rmm::align_down(std::min(free, total / 6), rmm::CUDA_ALLOCATION_ALIGNMENT); auto per_device_it = per_device_rmm_resources_.insert( std::pair{global_rank, @@ -182,14 +182,12 @@ class resource_manager_t { --gpu_row_comm_size; } - int current_device{}; - RAFT_CUDA_TRY(cudaGetDevice(¤t_device)); RAFT_NCCL_TRY(ncclGroupStart()); for (size_t i = 0; i < local_ranks_to_include.size(); ++i) { int rank = local_ranks_to_include[i]; auto pos = local_rank_map_.find(rank); - RAFT_CUDA_TRY(cudaSetDevice(pos->second.value())); + rmm::cuda_set_device_raii local_set_device(pos->second); nccl_comms.push_back(std::make_unique()); handles.push_back( @@ -204,7 +202,6 @@ class resource_manager_t { handles[i].get(), *nccl_comms[i], ranks_to_include.size(), rank); } RAFT_NCCL_TRY(ncclGroupEnd()); - RAFT_CUDA_TRY(cudaSetDevice(current_device)); std::vector running_threads; @@ -217,9 +214,7 @@ class resource_manager_t { &device_ids, &nccl_comms, &handles]() { - int rank = local_ranks_to_include[idx]; - RAFT_CUDA_TRY(cudaSetDevice(device_ids[idx].value())); - + rmm::cuda_set_device_raii local_set_device(device_ids[idx]); cugraph::partition_manager::init_subcomm(*handles[idx], gpu_row_comm_size); }); } diff --git a/cpp/include/cugraph/mtmg/vertex_result_view.hpp b/cpp/include/cugraph/mtmg/vertex_result_view.hpp index a349bb95333..42b80cea62f 100644 --- a/cpp/include/cugraph/mtmg/vertex_result_view.hpp +++ b/cpp/include/cugraph/mtmg/vertex_result_view.hpp @@ -39,11 +39,12 @@ class vertex_result_view_t : public detail::device_shared_device_span_t + template rmm::device_uvector gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + cugraph::vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); }; diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index a62e8ce85ec..04aeac49c9d 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -19,12 +19,15 @@ #include #include +#include #include #include #include #include #include +#include + #include #include #include @@ -43,7 +46,8 @@ std::tuple, std::vector> compute_offset_aligned_ed { auto search_offset_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{1}), - [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; }); + cuda::proclaim_return_type( + [approx_edge_chunk_size] __device__(auto i) { return i * approx_edge_chunk_size; })); auto num_chunks = (num_edges + approx_edge_chunk_size - 1) / approx_edge_chunk_size; if (num_chunks > 1) { diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index ab6a54cc1c0..414d9b36992 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -40,6 +40,8 @@ #include #include +#include + #include #include #include @@ -197,12 +199,13 @@ void multi_partition(ValueIterator value_first, value_last, thrust::make_zip_iterator( 
thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), - [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) { - auto group_id = value_to_group_id_op(value); - cuda::std::atomic_ref counter(counts[group_id - group_first]); - return thrust::make_tuple(group_id, - counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); - }); + cuda::proclaim_return_type>( + [value_to_group_id_op, group_first, counts = counts.data()] __device__(auto value) { + auto group_id = value_to_group_id_op(value); + cuda::std::atomic_ref counter(counts[group_id - group_first]); + return thrust::make_tuple(group_id, + counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); + })); rmm::device_uvector displacements(num_groups, stream_view); thrust::exclusive_scan( @@ -245,17 +248,19 @@ void multi_partition(KeyIterator key_first, rmm::device_uvector group_ids(num_keys, stream_view); rmm::device_uvector intra_partition_offsets(num_keys, stream_view); thrust::fill(rmm::exec_policy(stream_view), counts.begin(), counts.end(), size_t{0}); - thrust::transform(rmm::exec_policy(stream_view), - key_first, - key_last, - thrust::make_zip_iterator( - thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), - [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) { - auto group_id = key_to_group_id_op(key); - cuda::std::atomic_ref counter(counts[group_id - group_first]); - return thrust::make_tuple( - group_id, counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); - }); + thrust::transform( + rmm::exec_policy(stream_view), + key_first, + key_last, + thrust::make_zip_iterator( + thrust::make_tuple(group_ids.begin(), intra_partition_offsets.begin())), + cuda::proclaim_return_type>( + [key_to_group_id_op, group_first, counts = counts.data()] __device__(auto key) { + auto group_id = key_to_group_id_op(key); + cuda::std::atomic_ref counter(counts[group_id - group_first]); + return thrust::make_tuple(group_id, + counter.fetch_add(size_t{1}, cuda::std::memory_order_relaxed)); + })); rmm::device_uvector displacements(num_groups, stream_view); thrust::exclusive_scan( @@ -761,8 +766,9 @@ rmm::device_uvector groupby_and_count(ValueIterator tx_value_first /* [I stream_view); auto group_id_first = thrust::make_transform_iterator( - tx_value_first, - [value_to_group_id_op] __device__(auto value) { return value_to_group_id_op(value); }); + tx_value_first, cuda::proclaim_return_type([value_to_group_id_op] __device__(auto value) { + return value_to_group_id_op(value); + })); rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); auto rank_count_pair_first = thrust::make_zip_iterator( @@ -795,7 +801,9 @@ rmm::device_uvector groupby_and_count(VertexIterator tx_key_first /* [IN stream_view); auto group_id_first = thrust::make_transform_iterator( - tx_key_first, [key_to_group_id_op] __device__(auto key) { return key_to_group_id_op(key); }); + tx_key_first, cuda::proclaim_return_type([key_to_group_id_op] __device__(auto key) { + return key_to_group_id_op(key); + })); rmm::device_uvector d_tx_dst_ranks(num_groups, stream_view); rmm::device_uvector d_tx_value_counts(d_tx_dst_ranks.size(), stream_view); auto rank_count_pair_first = thrust::make_zip_iterator( diff --git a/cpp/include/cugraph_c/community_algorithms.h b/cpp/include/cugraph_c/community_algorithms.h index feab15c7eeb..e8a71a40162 100644 --- a/cpp/include/cugraph_c/community_algorithms.h +++ 
b/cpp/include/cugraph_c/community_algorithms.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,6 @@ #include /** @defgroup community Community algorithms - * @{ */ #ifdef __cplusplus @@ -60,18 +59,21 @@ cugraph_error_code_t cugraph_triangle_count(const cugraph_resource_handle_t* han cugraph_error_t** error); /** + * @ingroup community * @brief Get triangle counting vertices */ cugraph_type_erased_device_array_view_t* cugraph_triangle_count_result_get_vertices( cugraph_triangle_count_result_t* result); /** + * @ingroup community * @brief Get triangle counting counts */ cugraph_type_erased_device_array_view_t* cugraph_triangle_count_result_get_counts( cugraph_triangle_count_result_t* result); /** + * @ingroup community * @brief Free a triangle count result * * @param [in] result The result from a sampling algorithm @@ -147,24 +149,28 @@ cugraph_error_code_t cugraph_leiden(const cugraph_resource_handle_t* handle, cugraph_error_t** error); /** + * @ingroup community * @brief Get hierarchical clustering vertices */ cugraph_type_erased_device_array_view_t* cugraph_hierarchical_clustering_result_get_vertices( cugraph_hierarchical_clustering_result_t* result); /** + * @ingroup community * @brief Get hierarchical clustering clusters */ cugraph_type_erased_device_array_view_t* cugraph_hierarchical_clustering_result_get_clusters( cugraph_hierarchical_clustering_result_t* result); /** + * @ingroup community * @brief Get modularity */ double cugraph_hierarchical_clustering_result_get_modularity( cugraph_hierarchical_clustering_result_t* result); /** + * @ingroup community * @brief Free a hierarchical clustering result * * @param [in] result The result from a sampling algorithm @@ -423,7 +429,3 @@ void cugraph_clustering_result_free(cugraph_clustering_result_t* result); #ifdef __cplusplus } #endif - -/** - * @} - */ diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 782bb5a3790..5760d2098aa 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ #include /** @defgroup samplingC Sampling algorithms - * @{ */ #ifdef __cplusplus @@ -134,6 +133,7 @@ cugraph_error_code_t cugraph_node2vec(const cugraph_resource_handle_t* handle, cugraph_error_t** error); /** + * @ingroup samplingC * @brief Get the max path length from random walk result * * @param [in] result The result from random walks @@ -145,6 +145,7 @@ size_t cugraph_random_walk_result_get_max_path_length(cugraph_random_walk_result // difference at the moment is that RW results contain weights // and extract_paths results don't. But that's probably wrong. 
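These header changes replace Doxygen's block-style grouping with per-declaration tags. With @{ ... @}, every declaration in between silently joins the group, and an unbalanced @} can leak the group across the rest of the header; an explicit @ingroup on each symbol is self-contained. An illustrative sketch of the two styles (not taken from the diff):

/* Block style (removed): everything between the braces joins the group,
 * and the closing marker must be kept balanced by hand. */
/** @defgroup community Community algorithms
 *  @{
 */
/* ... declarations ... */
/** @} */

/* Tag style (added): the group is declared once; members opt in explicitly. */
/** @defgroup community Community algorithms */

/**
 * @ingroup community
 * @brief Get triangle counting counts
 */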
/** + * @ingroup samplingC * @brief Get the matrix (row major order) of vertices in the paths * * @param [in] result The result from a random walk algorithm @@ -154,6 +155,7 @@ cugraph_type_erased_device_array_view_t* cugraph_random_walk_result_get_paths( cugraph_random_walk_result_t* result); /** + * @ingroup samplingC * @brief Get the matrix (row major order) of edge weights in the paths * * @param [in] result The result from a random walk algorithm @@ -163,6 +165,7 @@ cugraph_type_erased_device_array_view_t* cugraph_random_walk_result_get_weights( cugraph_random_walk_result_t* result); /** + * @ingroup samplingC * @brief If the random walk result is compressed, get the path sizes * @deprecated This call will no longer be relevant once the new node2vec are called * @@ -173,6 +176,7 @@ cugraph_type_erased_device_array_view_t* cugraph_random_walk_result_get_path_siz cugraph_random_walk_result_t* result); /** + * @ingroup samplingC * @brief Free random walks result * * @param [in] result The result from random walks @@ -220,6 +224,7 @@ typedef enum cugraph_compression_type_t { } cugraph_compression_type_t; /** + * @ingroup samplingC * @brief Create sampling options object * * All sampling options set to FALSE @@ -232,6 +237,7 @@ cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t* cugraph_error_t** error); /** + * @ingroup samplingC * @brief Set flag to renumber results * * @param options - opaque pointer to the sampling options @@ -240,6 +246,7 @@ cugraph_error_code_t cugraph_sampling_options_create(cugraph_sampling_options_t* void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options, bool_t value); /** + * @ingroup samplingC * @brief Set whether to compress per-hop (True) or globally (False) * * @param options - opaque pointer to the sampling options @@ -248,6 +255,7 @@ void cugraph_sampling_set_renumber_results(cugraph_sampling_options_t* options, void cugraph_sampling_set_compress_per_hop(cugraph_sampling_options_t* options, bool_t value); /** + * @ingroup samplingC * @brief Set flag to sample with_replacement * * @param options - opaque pointer to the sampling options @@ -256,6 +264,7 @@ void cugraph_sampling_set_compress_per_hop(cugraph_sampling_options_t* options, void cugraph_sampling_set_with_replacement(cugraph_sampling_options_t* options, bool_t value); /** + * @ingroup samplingC * @brief Set flag to sample return_hops * * @param options - opaque pointer to the sampling options @@ -264,6 +273,7 @@ void cugraph_sampling_set_with_replacement(cugraph_sampling_options_t* options, void cugraph_sampling_set_return_hops(cugraph_sampling_options_t* options, bool_t value); /** + * @ingroup samplingC * @brief Set compression type * * @param options - opaque pointer to the sampling options @@ -273,6 +283,7 @@ void cugraph_sampling_set_compression_type(cugraph_sampling_options_t* options, cugraph_compression_type_t value); /** + * @ingroup samplingC * @brief Set prior sources behavior * * @param options - opaque pointer to the sampling options @@ -282,6 +293,7 @@ void cugraph_sampling_set_prior_sources_behavior(cugraph_sampling_options_t* opt cugraph_prior_sources_behavior_t value); /** + * @ingroup samplingC * @brief Set flag to sample dedupe_sources prior to sampling * * @param options - opaque pointer to the sampling options @@ -290,6 +302,7 @@ void cugraph_sampling_set_prior_sources_behavior(cugraph_sampling_options_t* opt void cugraph_sampling_set_dedupe_sources(cugraph_sampling_options_t* options, bool_t value); /** + * @ingroup 
samplingC * @brief Free sampling options object * * @param [in] options Opaque pointer to sampling object @@ -369,6 +382,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_destinations( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the major vertices from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -378,6 +392,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_majors( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the minor vertices from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -387,6 +402,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_minors( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the major offsets from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -396,6 +412,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_major_offsets const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the start labels from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -405,6 +422,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_start_labels( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the edge_id from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -414,6 +432,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_id( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the edge_type from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -423,6 +442,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_type( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the edge_weight from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -432,6 +452,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_edge_weight( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the hop from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -441,6 +462,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_hop( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the label-hop offsets from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -450,6 +472,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_label_hop_off const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the index from the sampling algorithm result * * @param [in] result The result from a sampling algorithm @@ -469,6 +492,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_offsets( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the renumber map * * @param [in] result The result from a sampling algorithm @@ -478,6 +502,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map( const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Get the renumber map offsets * * @param [in] result The result from a sampling algorithm @@ -487,6 +512,7 @@ 
cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map_ const cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Free a sampling result * * @param [in] result The result from a sampling algorithm @@ -494,6 +520,7 @@ cugraph_type_erased_device_array_view_t* cugraph_sample_result_get_renumber_map_ void cugraph_sample_result_free(cugraph_sample_result_t* result); /** + * @ingroup samplingC * @brief Create a sampling result (testing API) * * @param [in] handle Handle for accessing resources @@ -524,6 +551,7 @@ cugraph_error_code_t cugraph_test_sample_result_create( cugraph_error_t** error); /** + * @ingroup samplingC * @brief Create a sampling result (testing API) * * @param [in] handle Handle for accessing resources @@ -554,6 +582,7 @@ cugraph_error_code_t cugraph_test_uniform_neighborhood_sample_result_create( cugraph_error_t** error); /** + * @ingroup samplingC * @brief Select random vertices from the graph * * @param [in] handle Handle for accessing resources @@ -576,7 +605,3 @@ cugraph_error_code_t cugraph_select_random_vertices(const cugraph_resource_handl #ifdef __cplusplus } #endif - -/** - * @} - */ diff --git a/cpp/include/cugraph_c/traversal_algorithms.h b/cpp/include/cugraph_c/traversal_algorithms.h index 8959366ac17..e25fa167e43 100644 --- a/cpp/include/cugraph_c/traversal_algorithms.h +++ b/cpp/include/cugraph_c/traversal_algorithms.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,6 @@ /** @defgroup traversal Traversal Algorithms * @ingroup c_api - * @{ */ #ifdef __cplusplus @@ -40,6 +39,7 @@ typedef struct { } cugraph_paths_result_t; /** + * @ingroup traversal * @brief Get the vertex ids from the paths result * * @param [in] result The result from bfs or sssp @@ -49,6 +49,7 @@ cugraph_type_erased_device_array_view_t* cugraph_paths_result_get_vertices( cugraph_paths_result_t* result); /** + * @ingroup traversal * @brief Get the distances from the paths result * * @param [in] result The result from bfs or sssp @@ -58,6 +59,7 @@ cugraph_type_erased_device_array_view_t* cugraph_paths_result_get_distances( cugraph_paths_result_t* result); /** + * @ingroup traversal * @brief Get the predecessors from the paths result * * @param [in] result The result from bfs or sssp @@ -69,6 +71,7 @@ cugraph_type_erased_device_array_view_t* cugraph_paths_result_get_predecessors( cugraph_paths_result_t* result); /** + * @ingroup traversal * @brief Free paths result * * @param [in] result The result from bfs or sssp @@ -188,6 +191,7 @@ cugraph_error_code_t cugraph_extract_paths( size_t cugraph_extract_paths_result_get_max_path_length(cugraph_extract_paths_result_t* result); /** + * @ingroup traversal * @brief Get the matrix (row major order) of paths * * @param [in] result The result from extract_paths @@ -197,6 +201,7 @@ cugraph_type_erased_device_array_view_t* cugraph_extract_paths_result_get_paths( cugraph_extract_paths_result_t* result); /** + * @ingroup traversal * @brief Free extract_paths result * * @param [in] result The result from extract_paths @@ -206,7 +211,3 @@ void cugraph_extract_paths_result_free(cugraph_extract_paths_result_t* result); #ifdef __cplusplus } #endif - -/** - * @} - */ diff --git a/cpp/libcugraph_etl/CMakeLists.txt b/cpp/libcugraph_etl/CMakeLists.txt index ac0cb6959e8..8874c75896c 100644 --- 
a/cpp/libcugraph_etl/CMakeLists.txt +++ b/cpp/libcugraph_etl/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUGRAPH_ETL) -project(CUGRAPH_ETL VERSION 23.12.00 LANGUAGES C CXX CUDA) +project(CUGRAPH_ETL VERSION 24.02.00 LANGUAGES C CXX CUDA) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.0) diff --git a/cpp/src/c_api/abstract_functor.hpp b/cpp/src/c_api/abstract_functor.hpp index 7bff5b37380..72b433aa9af 100644 --- a/cpp/src/c_api/abstract_functor.hpp +++ b/cpp/src/c_api/abstract_functor.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,8 +32,14 @@ struct abstract_functor { void unsupported() { - error_code_ = CUGRAPH_UNSUPPORTED_TYPE_COMBINATION; - error_->error_message_ = "Type Dispatcher executing unsupported combination of types"; + mark_error(CUGRAPH_UNSUPPORTED_TYPE_COMBINATION, + "Type Dispatcher executing unsupported combination of types"); + } + + void mark_error(cugraph_error_code_t error_code, std::string const& error_message) + { + error_code_ = error_code; + error_->error_message_ = error_message; } }; diff --git a/cpp/src/c_api/bfs.cpp b/cpp/src/c_api/bfs.cpp index ae7667375d2..32841b2dd3c 100644 --- a/cpp/src/c_api/bfs.cpp +++ b/cpp/src/c_api/bfs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -113,6 +113,21 @@ struct bfs_functor : public abstract_functor { graph_view.local_vertex_partition_range_last(), do_expensive_check_); + size_t invalid_count = cugraph::detail::count_values( + handle_, + raft::device_span{sources.data(), sources.size()}, + cugraph::invalid_vertex_id::value); + + if constexpr (multi_gpu) { + invalid_count = cugraph::host_scalar_allreduce( + handle_.get_comms(), invalid_count, raft::comms::op_t::SUM, handle_.get_stream()); + } + + if (invalid_count != 0) { + mark_error(CUGRAPH_INVALID_INPUT, "Found invalid vertex in the input sources"); + return; + } + cugraph::bfs( handle_, graph_view, diff --git a/cpp/src/c_api/capi_helper.cu b/cpp/src/c_api/capi_helper.cu index 0ee49f87265..f08af4137db 100644 --- a/cpp/src/c_api/capi_helper.cu +++ b/cpp/src/c_api/capi_helper.cu @@ -74,6 +74,104 @@ template void sort_by_key(raft::handle_t const& handle, raft::device_span keys, raft::device_span values); +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights) +{ + rmm::device_uvector sort_indices(edge_srcs.size(), handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + sort_indices.begin(), + sort_indices.end(), + [offset_lasts = raft::device_span(offsets.begin() + 1, offsets.end()), + source_indices = raft::device_span(source_indices.data(), + source_indices.size())] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offset_lasts.begin(), + thrust::upper_bound(thrust::seq, offset_lasts.begin(), offset_lasts.end(), i))); + return source_indices[idx]; + }); + 
source_indices.resize(0, handle.get_stream()); + source_indices.shrink_to_fit(handle.get_stream()); + + auto triplet_first = + thrust::make_zip_iterator(sort_indices.begin(), edge_srcs.begin(), edge_dsts.begin()); + if (edge_weights) { + thrust::sort_by_key(handle.get_thrust_policy(), + triplet_first, + triplet_first + sort_indices.size(), + (*edge_weights).begin()); + } else { + thrust::sort(handle.get_thrust_policy(), triplet_first, triplet_first + sort_indices.size()); + } + + thrust::tabulate( + handle.get_thrust_policy(), + offsets.begin() + 1, + offsets.end(), + [sort_indices = raft::device_span(sort_indices.data(), + sort_indices.size())] __device__(size_t i) { + return static_cast(thrust::distance( + sort_indices.begin(), + thrust::upper_bound(thrust::seq, sort_indices.begin(), sort_indices.end(), i))); + }); + + return std::make_tuple( + std::move(offsets), std::move(edge_srcs), std::move(edge_dsts), std::move(edge_weights)); +} + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + +template std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + } // namespace detail } // namespace c_api } // namespace cugraph diff --git a/cpp/src/c_api/capi_helper.hpp b/cpp/src/c_api/capi_helper.hpp index ce08e8d90d3..56401606477 100644 --- a/cpp/src/c_api/capi_helper.hpp +++ b/cpp/src/c_api/capi_helper.hpp @@ -36,6 +36,18 @@ void sort_by_key(raft::handle_t const& handle, raft::device_span keys, raft::device_span values); +template +std::tuple, + rmm::device_uvector, + rmm::device_uvector, + std::optional>> +reorder_extracted_egonets(raft::handle_t const& handle, + rmm::device_uvector&& source_indices, + rmm::device_uvector&& offsets, + rmm::device_uvector&& edge_srcs, + rmm::device_uvector&& edge_dsts, + std::optional>&& edge_weights); + } // namespace detail } // namespace c_api } // namespace cugraph diff --git a/cpp/src/c_api/extract_ego.cpp b/cpp/src/c_api/extract_ego.cpp index 931d58b5185..cbe07af2e77 100644 --- a/cpp/src/c_api/extract_ego.cpp +++ b/cpp/src/c_api/extract_ego.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -26,7 +27,10 @@ #include #include #include +#include +#include +#include #include namespace { @@ -91,9 +95,22 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor { source_vertices.size(), handle_.get_stream()); + std::optional> source_indices{std::nullopt}; + if constexpr 
(multi_gpu) { - source_vertices = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle_, std::move(source_vertices)); + auto displacements = cugraph::host_scalar_allgather( + handle_.get_comms(), source_vertices.size(), handle_.get_stream()); + std::exclusive_scan( + displacements.begin(), displacements.end(), displacements.begin(), size_t{0}); + source_indices = rmm::device_uvector(source_vertices.size(), handle_.get_stream()); + cugraph::detail::sequence_fill(handle_.get_stream(), + (*source_indices).data(), + (*source_indices).size(), + displacements[handle_.get_comms().get_rank()]); + + std::tie(source_vertices, source_indices) = + cugraph::detail::shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + handle_, std::move(source_vertices), std::move(*source_indices)); } cugraph::renumber_ext_vertices( @@ -130,6 +147,31 @@ struct extract_ego_functor : public cugraph::c_api::abstract_functor { graph_view.vertex_partition_range_lasts(), do_expensive_check_); + if constexpr (multi_gpu) { + auto recvcounts = cugraph::host_scalar_allgather( + handle_.get_comms(), (*source_indices).size(), handle_.get_stream()); + std::vector displacements(recvcounts.size()); + std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); + rmm::device_uvector allgathered_indices(displacements.back() + recvcounts.back(), + handle_.get_stream()); + cugraph::device_allgatherv(handle_.get_comms(), + (*source_indices).begin(), + allgathered_indices.begin(), + recvcounts, + displacements, + handle_.get_stream()); + source_indices = std::move(allgathered_indices); + + std::tie(edge_offsets, src, dst, wgt) = + cugraph::c_api::detail::reorder_extracted_egonets( + handle_, + std::move(*source_indices), + std::move(edge_offsets), + std::move(src), + std::move(dst), + std::move(wgt)); + } + result_ = new cugraph::c_api::cugraph_induced_subgraph_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(src, graph_->vertex_type_), new cugraph::c_api::cugraph_type_erased_device_array_t(dst, graph_->vertex_type_), diff --git a/cpp/src/c_api/graph_mg.cpp b/cpp/src/c_api/graph_mg.cpp index 326022a3fa9..57a589caf02 100644 --- a/cpp/src/c_api/graph_mg.cpp +++ b/cpp/src/c_api/graph_mg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -217,7 +217,10 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor { std::move(edgelist_dsts), std::move(edgelist_weights), std::move(edgelist_edge_ids), - std::move(edgelist_edge_types)); + std::move(edgelist_edge_types), + properties_->is_symmetric + ? true /* keep minimum weight edges to maintain symmetry */ + : false); } std::tie(*graph, new_edge_weights, new_edge_ids, new_edge_types, new_number_map) = diff --git a/cpp/src/c_api/graph_sg.cpp b/cpp/src/c_api/graph_sg.cpp index 7793458b53a..6745be01f95 100644 --- a/cpp/src/c_api/graph_sg.cpp +++ b/cpp/src/c_api/graph_sg.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
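The index bookkeeping in the extract_ego changes above relies on a small binary-search idiom: given an offsets array, the segment owning flattened position i is the first exclusive segment end strictly greater than i. A standalone sketch of the mapping that reorder_extracted_egonets builds with thrust::upper_bound (not part of the diff):

#include <thrust/binary_search.h>
#include <thrust/distance.h>
#include <thrust/execution_policy.h>

// offset_lasts points at offsets.begin() + 1, i.e. the exclusive end of each
// segment; num_segments = offsets.size() - 1.
__host__ __device__ inline size_t segment_of(size_t i,
                                             size_t const* offset_lasts,
                                             size_t num_segments)
{
  return static_cast<size_t>(thrust::distance(
    offset_lasts,
    thrust::upper_bound(thrust::seq, offset_lasts, offset_lasts + num_segments, i)));
}
// Example: offsets = {0, 3, 5} -> positions 0..2 map to segment 0, 3..4 to segment 1.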
@@ -200,7 +200,10 @@ struct create_graph_functor : public cugraph::c_api::abstract_functor { std::move(edgelist_dsts), std::move(edgelist_weights), std::move(edgelist_edge_ids), - std::move(edgelist_edge_types)); + std::move(edgelist_edge_types), + properties_->is_symmetric + ? true /* keep minimum weight edges to maintain symmetry */ + : false); } std::tie(*graph, new_edge_weights, new_edge_ids, new_edge_types, new_number_map) = diff --git a/cpp/src/c_api/louvain.cpp b/cpp/src/c_api/louvain.cpp index 0e48b29388a..a131ee6a3ad 100644 --- a/cpp/src/c_api/louvain.cpp +++ b/cpp/src/c_api/louvain.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -95,18 +95,19 @@ struct louvain_functor : public cugraph::c_api::abstract_functor { // could add support in Louvain for std::nullopt as the edge weights behaving // as desired and only instantiating a real edge_property_view_t for the // coarsened graphs. - auto [level, modularity] = - cugraph::louvain(handle_, - graph_view, - (edge_weights != nullptr) - ? std::make_optional(edge_weights->view()) - : std::make_optional(cugraph::c_api::create_constant_edge_property( - handle_, graph_view, weight_t{1}) - .view()), - clusters.data(), - max_level_, - static_cast(threshold_), - static_cast(resolution_)); + auto [level, modularity] = cugraph::louvain( + handle_, + std::optional>{std::nullopt}, + graph_view, + (edge_weights != nullptr) + ? std::make_optional(edge_weights->view()) + : std::make_optional( + cugraph::c_api::create_constant_edge_property(handle_, graph_view, weight_t{1}) + .view()), + clusters.data(), + max_level_, + static_cast(threshold_), + static_cast(resolution_)); rmm::device_uvector vertices(graph_view.local_vertex_partition_range_size(), handle_.get_stream()); diff --git a/cpp/src/centrality/eigenvector_centrality_impl.cuh b/cpp/src/centrality/eigenvector_centrality_impl.cuh index 8d1bea4004d..2129dca6985 100644 --- a/cpp/src/centrality/eigenvector_centrality_impl.cuh +++ b/cpp/src/centrality/eigenvector_centrality_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
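The louvain functor above passes std::nullopt for the new rng_state parameter, so the C API keeps its previous deterministic behavior; the ECG implementation added later in this patch supplies a real RngState to decorrelate its ensemble runs. The two call styles side by side (sketch, not part of the diff; float weights, int32_t vertices, and a pre-built graph_view/edge_weight_view/clustering buffer are assumed):

#include <functional>
#include <optional>
#include <raft/random/rng_state.hpp>

// Randomized ordering, as the new ECG code uses for its single-level passes:
raft::random::RngState rng_state{7};
auto [rand_levels, rand_modularity] = cugraph::louvain(
  handle, std::make_optional(std::ref(rng_state)), graph_view,
  std::make_optional(edge_weight_view), clustering.data(),
  size_t{1} /* max_level */, 1e-7f, 1.0f);

// Deterministic, matching the pre-change behavior (what the C API passes):
auto [levels, modularity] = cugraph::louvain(
  handle, std::optional<std::reference_wrapper<raft::random::RngState>>{std::nullopt},
  graph_view, std::make_optional(edge_weight_view), clustering.data(),
  size_t{100}, 1e-7f, 1.0f);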
@@ -117,7 +117,7 @@ rmm::device_uvector<weight_t> eigenvector_centrality(
     edge_src_centralities.view(),
     edge_dst_dummy_property_t{}.view(),
     edge_dummy_property_t{}.view(),
-    [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val * 1.0; },
+    [] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { return src_val; },
     weight_t{0},
     reduce_op::plus<weight_t>{},
     centralities.begin());
diff --git a/cpp/src/community/detail/mis_impl.cuh b/cpp/src/community/detail/mis_impl.cuh
index bcd71af5a08..2659a982183 100644
--- a/cpp/src/community/detail/mis_impl.cuh
+++ b/cpp/src/community/detail/mis_impl.cuh
@@ -37,6 +37,8 @@
 #include
 #include
+#include <cuda/functional>
+
 #include
 
 namespace cugraph {
@@ -78,13 +80,13 @@ rmm::device_uvector<vertex_t> maximal_independent_set(
   thrust::copy(handle.get_thrust_policy(), vertex_begin, vertex_end, ranks.begin());
 
   // Set ranks of zero out-degree vertices to std::numeric_limits<vertex_t>::lowest()
-  thrust::transform_if(
-    handle.get_thrust_policy(),
-    out_degrees.begin(),
-    out_degrees.end(),
-    ranks.begin(),
-    [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); },
-    [] __device__(auto deg) { return deg == 0; });
+  thrust::transform_if(handle.get_thrust_policy(),
+                       out_degrees.begin(),
+                       out_degrees.end(),
+                       ranks.begin(),
+                       cuda::proclaim_return_type<vertex_t>(
+                         [] __device__(auto) { return std::numeric_limits<vertex_t>::lowest(); }),
+                       [] __device__(auto deg) { return deg == 0; });
 
   out_degrees.resize(0, handle.get_stream());
   out_degrees.shrink_to_fit(handle.get_stream());
diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh
index ebaae498d04..eb874657f01 100644
--- a/cpp/src/community/detail/refine_impl.cuh
+++ b/cpp/src/community/detail/refine_impl.cuh
@@ -46,6 +46,8 @@
 #include
 #include
+#include <cuda/functional>
+
 CUCO_DECLARE_BITWISE_COMPARABLE(float)
 CUCO_DECLARE_BITWISE_COMPARABLE(double)
 // FIXME: a temporary workaround for a compiler error, should be deleted once cuco gets patched.
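The next hunks wrap device lambdas in cuda::proclaim_return_type, matching the shuffle_comm.cuh and misc_utils.cuh changes earlier in this patch: when host code builds a thrust iterator or algorithm call over an extended __device__ lambda, the lambda's result type cannot be deduced on the host, so libcu++ lets the caller declare it explicitly. A reduced sketch of the idiom (requires nvcc --extended-lambda; not part of the diff):

#include <cuda/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

auto make_chunk_offset_iterator(size_t chunk_size)
{
  // Without proclaim_return_type, dereferencing this iterator in host code
  // would fail to deduce the __device__ lambda's return type.
  return thrust::make_transform_iterator(
    thrust::make_counting_iterator(size_t{1}),
    cuda::proclaim_return_type<size_t>(
      [chunk_size] __device__(auto i) { return i * chunk_size; }));
}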
@@ -213,16 +215,17 @@ refine_clustering( : detail::edge_minor_property_view_t( louvain_assignment_of_vertices.data(), vertex_t{0}), *edge_weight_view, - [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) { - weight_t weighted_cut_contribution{0}; + cuda::proclaim_return_type( + [] __device__(auto src, auto dst, auto src_cluster, auto dst_cluster, auto wt) { + weight_t weighted_cut_contribution{0}; - if (src == dst) // self loop - weighted_cut_contribution = 0; - else if (src_cluster == dst_cluster) - weighted_cut_contribution = wt; + if (src == dst) // self loop + weighted_cut_contribution = 0; + else if (src_cluster == dst_cluster) + weighted_cut_contribution = wt; - return weighted_cut_contribution; - }, + return weighted_cut_contribution; + }), weight_t{0}, cugraph::reduce_op::plus{}, weighted_cut_of_vertices_to_louvain.begin()); @@ -243,13 +246,14 @@ refine_clustering( wcut_deg_and_cluster_vol_triple_begin, wcut_deg_and_cluster_vol_triple_end, singleton_and_connected_flags.begin(), - [resolution, total_edge_weight] __device__(auto wcut_wdeg_and_louvain_volume) { + cuda::proclaim_return_type([resolution, total_edge_weight] __device__( + auto wcut_wdeg_and_louvain_volume) { auto wcut = thrust::get<0>(wcut_wdeg_and_louvain_volume); auto wdeg = thrust::get<1>(wcut_wdeg_and_louvain_volume); auto louvain_volume = thrust::get<2>(wcut_wdeg_and_louvain_volume); - return wcut > - (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight); - }); + return static_cast( + wcut > (resolution * wdeg * (louvain_volume - wdeg) / total_edge_weight)); + })); edge_src_property_t src_louvain_cluster_weight_cache(handle); edge_src_property_t src_cut_to_louvain_cache(handle); @@ -718,11 +722,12 @@ refine_clustering( vertices_in_mis.begin(), vertices_in_mis.end(), dst_vertices.begin(), - [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()), - v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { - auto dst = *(dst_first + v - v_first); - return dst; - }); + cuda::proclaim_return_type( + [dst_first = thrust::get<1>(gain_and_dst_first.get_iterator_tuple()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto dst = *(dst_first + v - v_first); + return dst; + })); cugraph::resize_dataframe_buffer(gain_and_dst_output_pairs, 0, handle.get_stream()); cugraph::shrink_to_fit_dataframe_buffer(gain_and_dst_output_pairs, handle.get_stream()); diff --git a/cpp/src/community/ecg_impl.cuh b/cpp/src/community/ecg_impl.cuh new file mode 100644 index 00000000000..f885952dfe6 --- /dev/null +++ b/cpp/src/community/ecg_impl.cuh @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +template +std::tuple, size_t, weight_t> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + weight_t min_weight, + size_t ensemble_size, + size_t max_level, + weight_t threshold, + weight_t resolution) +{ + using graph_view_t = cugraph::graph_view_t; + + CUGRAPH_EXPECTS(min_weight >= weight_t{0.0}, + "Invalid input arguments: min_weight must be positive"); + CUGRAPH_EXPECTS(ensemble_size >= 1, + "Invalid input arguments: ensemble_size must be a non-zero integer"); + CUGRAPH_EXPECTS( + threshold > 0.0 && threshold <= 1.0, + "Invalid input arguments: threshold must be a positive number in range (0.0, 1.0]"); + CUGRAPH_EXPECTS( + resolution > 0.0 && resolution <= 1.0, + "Invalid input arguments: resolution must be a positive number in range (0.0, 1.0]"); + + edge_src_property_t src_cluster_assignments(handle, graph_view); + edge_dst_property_t dst_cluster_assignments(handle, graph_view); + edge_property_t modified_edge_weights(handle, graph_view); + + cugraph::fill_edge_property(handle, graph_view, weight_t{0}, modified_edge_weights); + + weight_t modularity = -1.0; + rmm::device_uvector cluster_assignments(graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + + for (size_t i = 0; i < ensemble_size; i++) { + std::tie(std::ignore, modularity) = cugraph::louvain( + handle, + std::make_optional(std::reference_wrapper(rng_state)), + graph_view, + edge_weight_view, + cluster_assignments.data(), + size_t{1}, + threshold, + resolution); + + cugraph::update_edge_src_property( + handle, graph_view, cluster_assignments.begin(), src_cluster_assignments); + cugraph::update_edge_dst_property( + handle, graph_view, cluster_assignments.begin(), dst_cluster_assignments); + + cugraph::transform_e( + handle, + graph_view, + src_cluster_assignments.view(), + dst_cluster_assignments.view(), + modified_edge_weights.view(), + [] __device__(auto, auto, auto src_property, auto dst_property, auto edge_property) { + return edge_property + (src_property == dst_property); + }, + modified_edge_weights.mutable_view()); + } + + cugraph::transform_e( + handle, + graph_view, + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + view_concat(*edge_weight_view, modified_edge_weights.view()), + [min_weight, ensemble_size = static_cast(ensemble_size)] __device__( + auto, auto, thrust::nullopt_t, thrust::nullopt_t, auto edge_properties) { + auto e_weight = thrust::get<0>(edge_properties); + auto e_frequency = thrust::get<1>(edge_properties); + return min_weight + (e_weight - min_weight) * e_frequency / ensemble_size; + }, + modified_edge_weights.mutable_view()); + + std::tie(max_level, modularity) = + cugraph::louvain(handle, + std::make_optional(std::reference_wrapper(rng_state)), + graph_view, + std::make_optional(modified_edge_weights.view()), + cluster_assignments.data(), + max_level, + threshold, + resolution); + + // Compute final modularity using original edge weights + weight_t total_edge_weight = + cugraph::compute_total_edge_weight(handle, graph_view, *edge_weight_view); + + if constexpr (multi_gpu) { + cugraph::update_edge_src_property( + handle, graph_view, cluster_assignments.begin(), src_cluster_assignments); + cugraph::update_edge_dst_property( + handle, graph_view, cluster_assignments.begin(), 
dst_cluster_assignments); + } + + auto [cluster_keys, cluster_weights] = cugraph::detail::compute_cluster_keys_and_values( + handle, graph_view, edge_weight_view, cluster_assignments, src_cluster_assignments); + + modularity = detail::compute_modularity(handle, + graph_view, + edge_weight_view, + src_cluster_assignments, + dst_cluster_assignments, + cluster_assignments, + cluster_weights, + total_edge_weight, + resolution); + + return std::make_tuple(std::move(cluster_assignments), max_level, modularity); +} + +} // namespace detail + +template +std::tuple, size_t, weight_t> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + weight_t min_weight, + size_t ensemble_size, + size_t max_level, + weight_t threshold, + weight_t resolution) +{ + return detail::ecg(handle, + rng_state, + graph_view, + edge_weight_view, + min_weight, + ensemble_size, + max_level, + threshold, + resolution); +} + +} // namespace cugraph diff --git a/cpp/src/community/ecg_mg.cu b/cpp/src/community/ecg_mg.cu new file mode 100644 index 00000000000..9c910c70739 --- /dev/null +++ b/cpp/src/community/ecg_mg.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace cugraph { +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +} // namespace cugraph diff --git a/cpp/src/community/ecg_sg.cu b/cpp/src/community/ecg_sg.cu new file mode 100644 index 00000000000..530fb035ed5 --- /dev/null +++ b/cpp/src/community/ecg_sg.cu @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +namespace cugraph { +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, float> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + float min_weight, + size_t ensemble_size, + size_t max_level, + float threshold, + float resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +template std::tuple, size_t, double> ecg( + raft::handle_t const& handle, + raft::random::RngState& rng_state, + graph_view_t const& graph_view, + std::optional> edge_weight_view, + + double min_weight, + size_t ensemble_size, + size_t max_level, + double threshold, + double resolution); + +} // namespace cugraph diff --git a/cpp/src/community/flatten_dendrogram.hpp b/cpp/src/community/flatten_dendrogram.hpp index eac20389765..a4299f17d52 100644 --- a/cpp/src/community/flatten_dendrogram.hpp +++ b/cpp/src/community/flatten_dendrogram.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
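Taking the new ECG files above as a whole: the algorithm runs `ensemble_size` single-level Louvain passes from random starting points, counts per edge how often its endpoints land in the same community, blends that frequency into the edge weights, and then runs a full Louvain on the reweighted graph. The blend is the `transform_e` device lambda in `ecg_impl.cuh`; a host-side sketch of the same formula makes the endpoints concrete (the function name is mine):

```cpp
#include <cassert>
#include <cmath>

// Mirrors the transform_e lambda in ecg_impl.cuh: an edge whose endpoints
// never co-cluster collapses to min_weight; one that co-clusters in every
// ensemble run keeps its original weight.
double ecg_reweight(double w, double freq, double ensemble_size, double min_weight) {
  return min_weight + (w - min_weight) * freq / ensemble_size;
}

int main() {
  assert(std::abs(ecg_reweight(1.0,  0.0, 16.0, 0.05) - 0.05)  < 1e-12);  // never together
  assert(std::abs(ecg_reweight(1.0, 16.0, 16.0, 0.05) - 1.0)   < 1e-12);  // always together
  assert(std::abs(ecg_reweight(1.0,  8.0, 16.0, 0.05) - 0.525) < 1e-12);  // half the runs
  return 0;
}
```

Note also that the final reported modularity is recomputed against the original `edge_weight_view` rather than the reweighted graph, so ECG scores stay comparable with plain Louvain.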
@@ -75,7 +75,7 @@ void leiden_partition_at_level(raft::handle_t const& handle, thrust::make_counting_iterator(0), thrust::make_counting_iterator((level - 1) / 2), [&handle, &dendrogram, &local_vertex_ids_v, &d_partition, local_num_verts](size_t l) { - cugraph::relabel( + cugraph::relabel( handle, std::tuple(dendrogram.get_level_ptr_nocheck(2 * l + 1), dendrogram.get_level_ptr_nocheck(2 * l + 2)), diff --git a/cpp/src/community/leiden_impl.cuh b/cpp/src/community/leiden_impl.cuh index b6e20272de9..1e2b8f2ad44 100644 --- a/cpp/src/community/leiden_impl.cuh +++ b/cpp/src/community/leiden_impl.cuh @@ -568,17 +568,17 @@ void flatten_leiden_dendrogram(raft::handle_t const& handle, leiden_partition_at_level( handle, dendrogram, clustering, dendrogram.num_levels()); - rmm::device_uvector unique_cluster_ids(graph_view.number_of_vertices(), + rmm::device_uvector unique_cluster_ids(graph_view.local_vertex_partition_range_size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), clustering, - clustering + graph_view.number_of_vertices(), + clustering + graph_view.local_vertex_partition_range_size(), unique_cluster_ids.begin()); remove_duplicates(handle, unique_cluster_ids); relabel_cluster_ids( - handle, unique_cluster_ids, clustering, graph_view.number_of_vertices()); + handle, unique_cluster_ids, clustering, graph_view.local_vertex_partition_range_size()); } } // namespace detail diff --git a/cpp/src/community/louvain_impl.cuh b/cpp/src/community/louvain_impl.cuh index 7777921a091..4919dda5a75 100644 --- a/cpp/src/community/louvain_impl.cuh +++ b/cpp/src/community/louvain_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
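The Leiden hunk above is a multi-GPU correctness fix: `clustering` holds entries only for this rank's local vertex partition, so sizing buffers with the global `number_of_vertices()` both over-allocates and copies past the valid range. A small sketch of the partition-sizing invariant the fix relies on, using the `vertex_partition_range_lasts` convention that also appears in the `mtmg/vertex_result.cu` hunk later in this diff (the helper name is mine):

```cpp
#include <cassert>
#include <vector>

// Vertices are split into contiguous per-rank ranges; range_lasts[r] is the
// exclusive end of rank r's range, so the final entry is the global count.
int local_partition_size(std::vector<int> const& range_lasts, int rank) {
  int first = (rank == 0) ? 0 : range_lasts[rank - 1];
  return range_lasts[rank] - first;
}

int main() {
  std::vector<int> range_lasts{4, 9, 12};  // 3 ranks, 12 vertices in total
  assert(local_partition_size(range_lasts, 0) == 4);
  assert(local_partition_size(range_lasts, 1) == 5);
  assert(local_partition_size(range_lasts, 2) == 3);  // local sizes sum to 12
  return 0;
}
```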
@@ -18,15 +18,18 @@ // #define TIMING +// FIXME: Only outstanding items preventing this becoming a .hpp file +#include + #include #include -#include -// FIXME: Only outstanding items preventing this becoming a .hpp file +#include #include #include #include #include +#include #include namespace cugraph { @@ -44,6 +47,7 @@ void check_clustering(graph_view_t const& gr template std::pair>, weight_t> louvain( raft::handle_t const& handle, + std::optional> rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, size_t max_level, @@ -82,11 +86,25 @@ std::pair>, weight_t> louvain( current_graph_view.local_vertex_partition_range_size(), handle.get_stream()); - detail::sequence_fill(handle.get_stream(), - dendrogram->current_level_begin(), - dendrogram->current_level_size(), - current_graph_view.local_vertex_partition_range_first()); - + if (rng_state) { + auto random_cluster_assignments = cugraph::detail::permute_range( + handle, + *rng_state, + current_graph_view.local_vertex_partition_range_first(), + current_graph_view.local_vertex_partition_range_size(), + multi_gpu); + + raft::copy(dendrogram->current_level_begin(), + random_cluster_assignments.begin(), + random_cluster_assignments.size(), + handle.get_stream()); + + } else { + detail::sequence_fill(handle.get_stream(), + dendrogram->current_level_begin(), + dendrogram->current_level_size(), + current_graph_view.local_vertex_partition_range_first()); + } // // Compute the vertex and cluster weights, these are different for each // graph in the hierarchical decomposition @@ -289,6 +307,7 @@ void flatten_dendrogram(raft::handle_t const& handle, template std::pair>, weight_t> louvain( raft::handle_t const& handle, + std::optional> rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, size_t max_level, @@ -298,7 +317,9 @@ std::pair>, weight_t> louvain( CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); CUGRAPH_EXPECTS(edge_weight_view.has_value(), "Graph must be weighted"); - return detail::louvain(handle, graph_view, edge_weight_view, max_level, threshold, resolution); + + return detail::louvain( + handle, rng_state, graph_view, edge_weight_view, max_level, threshold, resolution); } template @@ -315,6 +336,7 @@ void flatten_dendrogram(raft::handle_t const& handle, template std::pair louvain( raft::handle_t const& handle, + std::optional> rng_state, graph_view_t const& graph_view, std::optional> edge_weight_view, vertex_t* clustering, @@ -330,8 +352,8 @@ std::pair louvain( std::unique_ptr> dendrogram; weight_t modularity; - std::tie(dendrogram, modularity) = - detail::louvain(handle, graph_view, edge_weight_view, max_level, threshold, resolution); + std::tie(dendrogram, modularity) = detail::louvain( + handle, rng_state, graph_view, edge_weight_view, max_level, threshold, resolution); detail::flatten_dendrogram(handle, graph_view, *dendrogram, clustering); diff --git a/cpp/src/community/louvain_mg.cu b/cpp/src/community/louvain_mg.cu index 0be32ed049f..51fb5e3d93d 100644 --- a/cpp/src/community/louvain_mg.cu +++ b/cpp/src/community/louvain_mg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
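The hunk above is the behavioral core of the change: with no RNG state, Louvain keeps its old deterministic start (every vertex in its own cluster, labeled by its own ID via `sequence_fill`); with an RNG state, the labels come from `cugraph::detail::permute_range` (added later in this diff) as a random permutation of the same ID range, giving each ECG ensemble member a different starting point. A single-GPU Thrust sketch of the permutation technique `permute_range` uses locally, namely sorting the IDs by uniform random keys (function name is mine; assumes nvcc with `--extended-lambda`):

```cpp
#include <cuda/functional>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/transform.h>

// Sketch, single-GPU only: a uniform random permutation of [first, first + n).
thrust::device_vector<int> permute_range_sketch(int first, int n, unsigned seed) {
  thrust::device_vector<int> ids(n);
  thrust::sequence(ids.begin(), ids.end(), first);
  thrust::device_vector<float> keys(n);
  thrust::transform(thrust::counting_iterator<int>(0),
                    thrust::counting_iterator<int>(n),
                    keys.begin(),
                    cuda::proclaim_return_type<float>([seed] __device__(int i) {
                      thrust::default_random_engine rng(seed);
                      thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
                      rng.discard(i);
                      return dist(rng);
                    }));
  // Sorting by i.i.d. random keys yields a uniformly random permutation.
  thrust::sort_by_key(keys.begin(), keys.end(), ids.begin());
  return ids;
}
```

The multi-GPU version additionally shuffles the IDs to randomly chosen ranks first and rebalances surplus and deficit entries afterwards, as the `permute_range.cu` hunks below show.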
@@ -22,6 +22,7 @@ namespace cugraph { template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -29,6 +30,7 @@ template std::pair>, float> louvain( float); template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -36,6 +38,7 @@ template std::pair>, float> louvain( float); template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -43,6 +46,7 @@ template std::pair>, float> louvain( float); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -50,6 +54,7 @@ template std::pair>, double> louvain( double); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -57,6 +62,7 @@ template std::pair>, double> louvain( double); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -65,6 +71,7 @@ template std::pair>, double> louvain( template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -73,6 +80,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -81,6 +89,7 @@ template std::pair louvain( double); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -89,6 +98,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -97,6 +107,7 @@ template std::pair louvain( double); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int64_t*, @@ -105,6 +116,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int64_t*, diff --git a/cpp/src/community/louvain_sg.cu b/cpp/src/community/louvain_sg.cu index 3fc0ffab928..557c219d424 100644 --- a/cpp/src/community/louvain_sg.cu +++ b/cpp/src/community/louvain_sg.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
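The long instantiation hunks here and in `louvain_sg.cu` below are mechanical but necessary: an explicit instantiation restates a function template's full signature, so the new optional RNG-state parameter has to be threaded through every one of them. If one were left unchanged, it would no longer match the template (or would export a different symbol than callers expect) and the build would break. A minimal analogue of the pattern, with a hypothetical template standing in for `louvain`:

```cpp
#include <optional>

template <typename T>
T scale(T v, std::optional<T> factor) {
  return factor ? v * *factor : v;
}

// Explicit instantiations must spell out the complete (new) signature,
// which is why every louvain instantiation above gains the extra
// std::optional parameter in lockstep with the declaration.
template float scale<float>(float, std::optional<float>);
template double scale<double>(double, std::optional<double>);
```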
@@ -22,6 +22,7 @@ namespace cugraph { template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -29,6 +30,7 @@ template std::pair>, float> louvain( float); template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -36,6 +38,7 @@ template std::pair>, float> louvain( float); template std::pair>, float> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -43,6 +46,7 @@ template std::pair>, float> louvain( float); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -50,6 +54,7 @@ template std::pair>, double> louvain( double); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -57,6 +62,7 @@ template std::pair>, double> louvain( double); template std::pair>, double> louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, size_t, @@ -65,6 +71,7 @@ template std::pair>, double> louvain( template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -73,6 +80,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -81,6 +89,7 @@ template std::pair louvain( double); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -89,6 +98,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int32_t*, @@ -97,6 +107,7 @@ template std::pair louvain( double); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int64_t*, @@ -105,6 +116,7 @@ template std::pair louvain( float); template std::pair louvain( raft::handle_t const&, + std::optional>, graph_view_t const&, std::optional>, int64_t*, diff --git a/cpp/src/detail/collect_local_vertex_values.cu b/cpp/src/detail/collect_local_vertex_values.cu index 9d5d2cb553b..795902dfd87 100644 --- a/cpp/src/detail/collect_local_vertex_values.cu +++ b/cpp/src/detail/collect_local_vertex_values.cu @@ -19,6 +19,8 @@ #include #include +#include + namespace cugraph { namespace detail { @@ -64,7 +66,8 @@ rmm::device_uvector collect_local_vertex_values_from_ext_vertex_value_p auto vertex_iterator = thrust::make_transform_iterator( d_vertices.begin(), - [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; }); + cuda::proclaim_return_type( + [local_vertex_first] __device__(vertex_t v) { return v - local_vertex_first; })); d_local_values.resize(local_vertex_last - local_vertex_first, handle.get_stream()); thrust::fill( diff --git a/cpp/src/detail/permute_range.cu b/cpp/src/detail/permute_range.cu new file mode 100644 index 00000000000..cc77f022616 --- /dev/null +++ b/cpp/src/detail/permute_range.cu @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +template +rmm::device_uvector permute_range(raft::handle_t const& handle, + raft::random::RngState& rng_state, + vertex_t local_range_start, + vertex_t local_range_size, + bool multi_gpu, + bool do_expensive_check) +{ + if (do_expensive_check && multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + auto global_start = + cugraph::host_scalar_bcast(handle.get_comms(), local_range_start, 0, handle.get_stream()); + auto sub_range_sizes = + cugraph::host_scalar_allgather(handle.get_comms(), local_range_size, handle.get_stream()); + std::exclusive_scan( + sub_range_sizes.begin(), sub_range_sizes.end(), sub_range_sizes.begin(), global_start); + CUGRAPH_EXPECTS( + sub_range_sizes[comm_rank] == local_range_start, + "Invalid input arguments: a rage must have contiguous and non-overlapping values"); + } + rmm::device_uvector permuted_integers(local_range_size, handle.get_stream()); + + // generate as many integers as #local_range_size on each GPU + detail::sequence_fill( + handle.get_stream(), permuted_integers.begin(), permuted_integers.size(), local_range_start); + + if (multi_gpu) { + // randomly distribute integers to all GPUs + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + + std::vector tx_value_counts(comm_size, 0); + + { + rmm::device_uvector d_target_ranks(permuted_integers.size(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + d_target_ranks.data(), + d_target_ranks.size(), + vertex_t{0}, + vertex_t{comm_size}, + rng_state); + + thrust::sort_by_key(handle.get_thrust_policy(), + d_target_ranks.begin(), + d_target_ranks.end(), + permuted_integers.begin()); + + rmm::device_uvector d_reduced_ranks(comm_size, handle.get_stream()); + rmm::device_uvector d_reduced_counts(comm_size, handle.get_stream()); + + auto output_end = thrust::reduce_by_key(handle.get_thrust_policy(), + d_target_ranks.begin(), + d_target_ranks.end(), + thrust::make_constant_iterator(1), + d_reduced_ranks.begin(), + d_reduced_counts.begin(), + thrust::equal_to()); + + auto nr_output_pairs = + static_cast(thrust::distance(d_reduced_ranks.begin(), output_end.first)); + + std::vector h_reduced_ranks(comm_size); + std::vector h_reduced_counts(comm_size); + + raft::update_host( + h_reduced_ranks.data(), d_reduced_ranks.data(), nr_output_pairs, handle.get_stream()); + + raft::update_host( + h_reduced_counts.data(), d_reduced_counts.data(), nr_output_pairs, handle.get_stream()); + + for (int i = 0; i < static_cast(nr_output_pairs); i++) { + tx_value_counts[h_reduced_ranks[i]] = static_cast(h_reduced_counts[i]); + } + } + + std::tie(permuted_integers, std::ignore) = cugraph::shuffle_values( + handle.get_comms(), permuted_integers.begin(), tx_value_counts, handle.get_stream()); + } + + // permute locally + rmm::device_uvector 
fractional_random_numbers(permuted_integers.size(), + handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + fractional_random_numbers.data(), + fractional_random_numbers.size(), + float{0.0}, + float{1.0}, + rng_state); + thrust::sort_by_key(handle.get_thrust_policy(), + fractional_random_numbers.begin(), + fractional_random_numbers.end(), + permuted_integers.begin()); + + if (multi_gpu) { + // take care of deficits and extras numbers + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + + size_t nr_extras{0}; + size_t nr_deficits{0}; + if (permuted_integers.size() > static_cast(local_range_size)) { + nr_extras = permuted_integers.size() - static_cast(local_range_size); + } else { + nr_deficits = static_cast(local_range_size) - permuted_integers.size(); + } + + auto extra_cluster_ids = cugraph::detail::device_allgatherv( + handle, + comm, + raft::device_span(permuted_integers.data() + local_range_size, + nr_extras > 0 ? nr_extras : 0)); + + permuted_integers.resize(local_range_size, handle.get_stream()); + auto deficits = + cugraph::host_scalar_allgather(handle.get_comms(), nr_deficits, handle.get_stream()); + + std::exclusive_scan(deficits.begin(), deficits.end(), deficits.begin(), vertex_t{0}); + + raft::copy(permuted_integers.data() + local_range_size - nr_deficits, + extra_cluster_ids.begin() + deficits[comm_rank], + nr_deficits, + handle.get_stream()); + } + + assert(permuted_integers.size() == local_range_size); + return permuted_integers; +} + +template rmm::device_uvector permute_range(raft::handle_t const& handle, + raft::random::RngState& rng_state, + int32_t local_range_start, + int32_t local_range_size, + bool multi_gpu, + bool do_expensive_check); + +template rmm::device_uvector permute_range(raft::handle_t const& handle, + raft::random::RngState& rng_state, + int64_t local_range_start, + int64_t local_range_size, + bool multi_gpu, + bool do_expensive_check); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/detail/shuffle_vertices.cu b/cpp/src/detail/shuffle_vertices.cu index bc450ce3bbf..94729a770f7 100644 --- a/cpp/src/detail/shuffle_vertices.cu +++ b/cpp/src/detail/shuffle_vertices.cu @@ -200,6 +200,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& vertices, rmm::device_uvector&& values); +template std::tuple, rmm::device_uvector> +shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& values); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, @@ -224,6 +230,12 @@ shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( rmm::device_uvector&& vertices, rmm::device_uvector&& values); +template std::tuple, rmm::device_uvector> +shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( + raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& values); + template std::tuple, rmm::device_uvector> shuffle_ext_vertex_value_pairs_to_local_gpu_by_vertex_partitioning( raft::handle_t const& handle, diff --git a/cpp/src/detail/utility_wrappers.cu b/cpp/src/detail/utility_wrappers.cu index 2d5bf6215b1..9100ecbd5e1 100644 --- a/cpp/src/detail/utility_wrappers.cu +++ b/cpp/src/detail/utility_wrappers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,13 @@ */ #include #include +#include #include #include +#include #include #include #include @@ -227,5 +229,20 @@ template bool is_equal(raft::handle_t const& handle, raft::device_span span1, raft::device_span span2); +template +size_t count_values(raft::handle_t const& handle, + raft::device_span span, + data_t value) +{ + return thrust::count(handle.get_thrust_policy(), span.begin(), span.end(), value); +} + +template size_t count_values(raft::handle_t const& handle, + raft::device_span span, + int32_t value); +template size_t count_values(raft::handle_t const& handle, + raft::device_span span, + int64_t value); + } // namespace detail } // namespace cugraph diff --git a/cpp/src/generators/erdos_renyi_generator.cu b/cpp/src/generators/erdos_renyi_generator.cu index 6d847ae0bde..8448eeaf960 100644 --- a/cpp/src/generators/erdos_renyi_generator.cu +++ b/cpp/src/generators/erdos_renyi_generator.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,8 @@ #include #include +#include + namespace cugraph { template @@ -42,12 +44,13 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, "Implementation cannot support specified value"); auto random_iterator = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), [seed] __device__(size_t index) { + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([seed] __device__(size_t index) { thrust::default_random_engine rng(seed); thrust::uniform_real_distribution dist(0.0, 1.0); rng.discard(index); return dist(rng); - }); + })); size_t count = thrust::count_if(handle.get_thrust_policy(), random_iterator, @@ -69,13 +72,14 @@ generate_erdos_renyi_graph_edgelist_gnp(raft::handle_t const& handle, indices_v.begin(), indices_v.end(), thrust::make_zip_iterator(thrust::make_tuple(src_v.begin(), src_v.end())), - [num_vertices] __device__(size_t index) { - size_t src = index / num_vertices; - size_t dst = index % num_vertices; - - return thrust::make_tuple(static_cast(src), - static_cast(dst)); - }); + cuda::proclaim_return_type>( + [num_vertices] __device__(size_t index) { + size_t src = index / num_vertices; + size_t dst = index % num_vertices; + + return thrust::make_tuple(static_cast(src), + static_cast(dst)); + })); handle.sync_stream(); diff --git a/cpp/src/generators/simple_generators.cu b/cpp/src/generators/simple_generators.cu index 6dba63909c3..65647be5de0 100644 --- a/cpp/src/generators/simple_generators.cu +++ b/cpp/src/generators/simple_generators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
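Besides gaining `proclaim_return_type`, the Erdős-Rényi hunk above illustrates a pattern worth noting: a stateless, random-access random stream built over a counting iterator, where each element re-seeds an engine and `discard()`s up to its own index. Draw *i* is reproducible from `(seed, i)` alone, so no RNG state has to be stored or shared between threads. A condensed sketch of the per-element draw (function name is mine):

```cpp
#include <cstddef>
#include <thrust/random.h>

// The i-th uniform draw of a virtual random sequence, computable
// independently per element, exactly as in the generator above.
__host__ __device__ inline float nth_uniform(std::size_t seed, std::size_t i) {
  thrust::default_random_engine rng(seed);
  thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
  rng.discard(i);
  return dist(rng);
}
```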
@@ -27,6 +27,8 @@ #include #include +#include + #include namespace cugraph { @@ -264,23 +266,24 @@ generate_complete_graph_edgelist( auto transform_iter = thrust::make_transform_iterator( thrust::make_counting_iterator(0), - [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) { - size_t graph_index = index / (num_vertices * num_vertices); - size_t local_index = index % (num_vertices * num_vertices); - - vertex_t src = base_vertex_id + static_cast(local_index / num_vertices); - vertex_t dst = base_vertex_id + static_cast(local_index % num_vertices); - - if (src == dst) { - src = invalid_vertex; - dst = invalid_vertex; - } else { - src += (graph_index * num_vertices); - dst += (graph_index * num_vertices); - } - - return thrust::make_tuple(src, dst); - }); + cuda::proclaim_return_type>( + [base_vertex_id, num_vertices, invalid_vertex] __device__(size_t index) { + size_t graph_index = index / (num_vertices * num_vertices); + size_t local_index = index % (num_vertices * num_vertices); + + vertex_t src = base_vertex_id + static_cast(local_index / num_vertices); + vertex_t dst = base_vertex_id + static_cast(local_index % num_vertices); + + if (src == dst) { + src = invalid_vertex; + dst = invalid_vertex; + } else { + src += (graph_index * num_vertices); + dst += (graph_index * num_vertices); + } + + return thrust::make_tuple(src, dst); + })); output_iterator = thrust::copy_if(handle.get_thrust_policy(), transform_iter, diff --git a/cpp/src/link_analysis/hits_impl.cuh b/cpp/src/link_analysis/hits_impl.cuh index 674046745b1..5cdf1b9dc6a 100644 --- a/cpp/src/link_analysis/hits_impl.cuh +++ b/cpp/src/link_analysis/hits_impl.cuh @@ -80,6 +80,7 @@ std::tuple hits(raft::handle_t const& handle, if (num_vertices == 0) { return std::make_tuple(diff_sum, final_iteration_count); } CUGRAPH_EXPECTS(epsilon >= 0.0, "Invalid input argument: epsilon should be non-negative."); + auto tolerance = static_cast(graph_view.number_of_vertices()) * epsilon; // Check validity of initial guess if supplied if (has_initial_hubs_guess && do_expensive_check) { @@ -171,7 +172,7 @@ std::tuple hits(raft::handle_t const& handle, std::swap(prev_hubs, curr_hubs); iter++; - if (diff_sum < epsilon) { + if (diff_sum < tolerance) { break; } else if (iter >= max_iterations) { CUGRAPH_FAIL("HITS failed to converge."); diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index 92c70fcff20..9a76ba73f92 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -288,7 +288,7 @@ centrality_algorithm_metadata_t pagerank( edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), [alpha] __device__(vertex_t, vertex_t, auto src_val, auto, auto) { - return src_val * 1.0 * alpha; + return src_val * alpha; }, unvarying_part, reduce_op::plus{}, diff --git a/cpp/src/link_prediction/similarity_impl.cuh b/cpp/src/link_prediction/similarity_impl.cuh index 55e8f5c88d7..7ac294d7719 100644 --- a/cpp/src/link_prediction/similarity_impl.cuh +++ b/cpp/src/link_prediction/similarity_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/mtmg/vertex_result.cu b/cpp/src/mtmg/vertex_result.cu index 97fcd291c87..414f1bdfa88 100644 --- a/cpp/src/mtmg/vertex_result.cu +++ b/cpp/src/mtmg/vertex_result.cu @@ -21,21 +21,21 @@ #include +#include #include namespace cugraph { namespace mtmg { template -template +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view) { - auto this_gpu_graph_view = graph_view.get(handle); - rmm::device_uvector local_vertices(vertices.size(), handle.get_stream()); rmm::device_uvector vertex_gpu_ids(vertices.size(), handle.get_stream()); rmm::device_uvector vertex_pos(vertices.size(), handle.get_stream()); @@ -47,11 +47,11 @@ rmm::device_uvector vertex_result_view_t::gather( cugraph::detail::sequence_fill( handle.get_stream(), vertex_pos.data(), vertex_pos.size(), size_t{0}); - rmm::device_uvector d_vertex_partition_range_lasts( - this_gpu_graph_view.vertex_partition_range_lasts().size(), handle.get_stream()); + rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), + handle.get_stream()); raft::update_device(d_vertex_partition_range_lasts.data(), - this_gpu_graph_view.vertex_partition_range_lasts().data(), - this_gpu_graph_view.vertex_partition_range_lasts().size(), + vertex_partition_range_lasts.data(), + vertex_partition_range_lasts.size(), handle.get_stream()); if (renumber_map_view) { @@ -60,8 +60,8 @@ rmm::device_uvector vertex_result_view_t::gather( local_vertices.data(), local_vertices.size(), renumber_map_view->get(handle).data(), - this_gpu_graph_view.local_vertex_partition_range_first(), - this_gpu_graph_view.local_vertex_partition_range_last()); + vertex_partition_view.local_vertex_partition_range_first(), + vertex_partition_view.local_vertex_partition_range_last()); } auto const major_comm_size = @@ -89,13 +89,14 @@ rmm::device_uvector vertex_result_view_t::gather( auto& wrapped = this->get(handle); - auto vertex_partition = vertex_partition_device_view_t( - this_gpu_graph_view.local_vertex_partition_view()); + auto vertex_partition = + vertex_partition_device_view_t(vertex_partition_view); - auto iter = - thrust::make_transform_iterator(local_vertices.begin(), [vertex_partition] __device__(auto v) { + auto iter = thrust::make_transform_iterator( + local_vertices.begin(), + cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); thrust::gather(handle.get_thrust_policy(), iter, @@ -112,7 +113,7 @@ rmm::device_uvector vertex_result_view_t::gather( vertex_gpu_ids.begin(), vertex_gpu_ids.end(), thrust::make_zip_iterator(local_vertices.begin(), vertex_pos.begin(), tmp_result.begin()), - [] __device__(int gpu) { return gpu; }, + thrust::identity{}, handle.get_stream()); // @@ -130,37 +131,85 @@ rmm::device_uvector vertex_result_view_t::gather( template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, 
std::optional>& renumber_map_view); +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); -template rmm::device_uvector vertex_result_view_t::gather( +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( + handle_t const& handle, + raft::device_span vertices, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, + std::optional>& renumber_map_view); + +template rmm::device_uvector vertex_result_view_t::gather( handle_t const& handle, raft::device_span vertices, - cugraph::mtmg::graph_view_t const& graph_view, + std::vector const& vertex_partition_range_lasts, + vertex_partition_view_t vertex_partition_view, std::optional>& renumber_map_view); } // namespace mtmg diff --git a/cpp/src/prims/count_if_e.cuh b/cpp/src/prims/count_if_e.cuh index f6e4bc9bead..9cff4f5eceb 100644 --- a/cpp/src/prims/count_if_e.cuh +++ b/cpp/src/prims/count_if_e.cuh @@ -74,8 +74,6 @@ typename GraphViewType::edge_type count_if_e(raft::handle_t const& handle, using vertex_t = typename 
GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/detail/nbr_intersection.cuh b/cpp/src/prims/detail/nbr_intersection.cuh index cefc1836fa6..8261ec747f9 100644 --- a/cpp/src/prims/detail/nbr_intersection.cuh +++ b/cpp/src/prims/detail/nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -63,35 +64,6 @@ namespace cugraph { namespace detail { -// check vertices in the pair are valid and first element of the pair is within the local vertex -// partition range -template -struct is_invalid_input_vertex_pair_t { - vertex_t num_vertices{}; - raft::device_span edge_partition_major_range_firsts{}; - raft::device_span edge_partition_major_range_lasts{}; - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - - __device__ bool operator()(thrust::tuple pair) const - { - auto major = thrust::get<0>(pair); - auto minor = thrust::get<1>(pair); - if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { - return true; - } - auto it = thrust::upper_bound(thrust::seq, - edge_partition_major_range_lasts.begin(), - edge_partition_major_range_lasts.end(), - major); - if (it == edge_partition_major_range_lasts.end()) { return true; } - auto edge_partition_idx = - static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); - if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } - return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); - } -}; - // group index determined by major_comm_rank (primary key) and local edge partition index (secondary // key) template @@ -154,24 +126,11 @@ struct update_rx_major_local_degree_t { auto major = rx_majors[rx_group_firsts[major_comm_rank * minor_comm_size + local_edge_partition_idx] + offset_in_local_edge_partition]; - vertex_t major_idx{0}; - edge_t local_degree{0}; - if (multi_gpu && (edge_partition.major_hypersparse_first() && - (major >= *(edge_partition.major_hypersparse_first())))) { - auto major_hypersparse_idx = edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - local_degree = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + auto local_degree = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree = static_cast( count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree)); } @@ -325,29 +284,11 @@ struct pick_min_degree_t { edge_t local_degree0{0}; vertex_t major0 = thrust::get<0>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major0 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major0); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major0); - local_degree0 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major0); + local_degree0 = major_idx ? edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree0 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree0 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree0); } @@ -360,29 +301,11 @@ struct pick_min_degree_t { edge_t local_degree1{0}; vertex_t major1 = thrust::get<1>(pair); if constexpr (std::is_same_v) { - vertex_t major_idx{0}; - if constexpr (multi_gpu) { - if (edge_partition.major_hypersparse_first() && - (major1 >= *(edge_partition.major_hypersparse_first()))) { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major1); - if (major_hypersparse_idx) { - major_idx = - (*(edge_partition.major_hypersparse_first()) - edge_partition.major_range_first()) + - *major_hypersparse_idx; - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major1); - local_degree1 = edge_partition.local_degree(major_idx); - } + auto major_idx = edge_partition.major_idx_from_major_nocheck(major1); + local_degree1 = major_idx ? 
edge_partition.local_degree(*major_idx) : edge_t{0}; if (edge_partition_e_mask && (local_degree1 > edge_t{0})) { - auto local_offset = edge_partition.local_offset(major_idx); + auto local_offset = edge_partition.local_offset(*major_idx); local_degree1 = count_set_bits((*edge_partition_e_mask).value_first(), local_offset, local_degree1); } @@ -699,77 +622,6 @@ struct gatherv_indices_t { } }; -template -size_t count_invalid_vertex_pairs(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexPairIterator vertex_pair_first, - VertexPairIterator vertex_pair_last) -{ - using vertex_t = typename GraphViewType::vertex_type; - - std::vector h_edge_partition_major_range_firsts( - graph_view.number_of_local_edge_partitions()); - std::vector h_edge_partition_major_range_lasts( - h_edge_partition_major_range_firsts.size()); - vertex_t edge_partition_minor_range_first{}; - vertex_t edge_partition_minor_range_last{}; - if constexpr (GraphViewType::is_multi_gpu) { - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { - if constexpr (GraphViewType::is_storage_transposed) { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); - } else { - h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); - h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); - } - } - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); - } else { - edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); - edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); - } - } else { - h_edge_partition_major_range_firsts[0] = vertex_t{0}; - h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); - edge_partition_minor_range_first = vertex_t{0}; - edge_partition_minor_range_last = graph_view.number_of_vertices(); - } - rmm::device_uvector d_edge_partition_major_range_firsts( - h_edge_partition_major_range_firsts.size(), handle.get_stream()); - rmm::device_uvector d_edge_partition_major_range_lasts( - h_edge_partition_major_range_lasts.size(), handle.get_stream()); - raft::update_device(d_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.data(), - h_edge_partition_major_range_firsts.size(), - handle.get_stream()); - raft::update_device(d_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.data(), - h_edge_partition_major_range_lasts.size(), - handle.get_stream()); - - auto num_invalid_pairs = thrust::count_if( - handle.get_thrust_policy(), - vertex_pair_first, - vertex_pair_last, - is_invalid_input_vertex_pair_t{ - graph_view.number_of_vertices(), - raft::device_span(d_edge_partition_major_range_firsts.begin(), - d_edge_partition_major_range_firsts.end()), - raft::device_span(d_edge_partition_major_range_lasts.begin(), - d_edge_partition_major_range_lasts.end()), - edge_partition_minor_range_first, - edge_partition_minor_range_last}); - if constexpr (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - num_invalid_pairs = - host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream()); - } - - return num_invalid_pairs; -} - // In multi-GPU, the first 
element of every vertex pair in [vertex_pair_first, vertex_pair) should // be within the valid edge partition major range assigned to this process and the second element // should be within the valid edge partition minor range assigned to this process. diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh new file mode 100644 index 00000000000..2785ba38dfd --- /dev/null +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cugraph { + +namespace detail { + +template +struct call_e_op_t { + edge_partition_device_view_t const& edge_partition{}; + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input{}; + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input{}; + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input{}; + EdgeOp const& e_op{}; + typename GraphViewType::vertex_type major{}; + typename GraphViewType::vertex_type major_offset{}; + typename GraphViewType::vertex_type const* indices{nullptr}; + typename GraphViewType::edge_type edge_offset{}; + + __device__ auto operator()(typename GraphViewType::edge_type i) const + { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } +}; + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/fill_edge_property.cuh b/cpp/src/prims/fill_edge_property.cuh index d446944b65b..e6875576044 100644 --- a/cpp/src/prims/fill_edge_property.cuh +++ b/cpp/src/prims/fill_edge_property.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include #include @@ -23,6 +24,7 @@ #include #include +#include #include @@ -38,21 +40,78 @@ void fill_edge_property(raft::handle_t const& handle, { static_assert(std::is_same_v); + using edge_t = typename GraphViewType::edge_type; + + auto edge_mask_view = graph_view.edge_mask_view(); + auto value_firsts = edge_property_output.value_firsts(); auto edge_counts = edge_property_output.edge_counts(); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + if constexpr (cugraph::has_packed_bool_element< std::remove_reference_t, T>()) { static_assert(std::is_arithmetic_v, "unimplemented for thrust::tuple types."); auto packed_input = input ? packed_bool_full_mask() : packed_bool_empty_mask(); - thrust::fill_n(handle.get_thrust_policy(), - value_firsts[i], - packed_bool_size(static_cast(edge_counts[i])), - packed_input); + auto rem = edge_counts[i] % packed_bools_per_word(); + if (edge_partition_e_mask) { + auto input_first = + thrust::make_zip_iterator(value_firsts[i], (*edge_partition_e_mask).value_first()); + thrust::transform(handle.get_thrust_policy(), + input_first, + input_first + packed_bool_size(static_cast(edge_counts[i] - rem)), + value_firsts[i], + [packed_input] __device__(thrust::tuple pair) { + auto old_value = thrust::get<0>(pair); + auto mask = thrust::get<1>(pair); + return (old_value & ~mask) | (packed_input & mask); + }); + if (rem > 0) { + thrust::transform( + handle.get_thrust_policy(), + input_first + packed_bool_size(static_cast(edge_counts[i] - rem)), + input_first + packed_bool_size(static_cast(edge_counts[i])), + value_firsts[i] + packed_bool_size(static_cast(edge_counts[i] - rem)), + [packed_input, rem] __device__(thrust::tuple pair) { + auto old_value = thrust::get<0>(pair); + auto mask = thrust::get<1>(pair); + return ((old_value & ~mask) | (packed_input & mask)) & packed_bool_partial_mask(rem); + }); + } + } else { + thrust::fill_n(handle.get_thrust_policy(), + value_firsts[i], + packed_bool_size(static_cast(edge_counts[i] - rem)), + packed_input); + if (rem > 0) { + thrust::fill_n( + handle.get_thrust_policy(), + value_firsts[i] + packed_bool_size(static_cast(edge_counts[i] - rem)), + 1, + packed_input & packed_bool_partial_mask(rem)); + } + } } else { - thrust::fill_n( - handle.get_thrust_policy(), value_firsts[i], static_cast(edge_counts[i]), input); + if (edge_partition_e_mask) { + thrust::transform_if(handle.get_thrust_policy(), + thrust::make_constant_iterator(input), + thrust::make_constant_iterator(input) + edge_counts[i], + thrust::make_counting_iterator(edge_t{0}), + value_firsts[i], + thrust::identity{}, + [edge_partition_e_mask = *edge_partition_e_mask] __device__(edge_t i) { + return edge_partition_e_mask.get(i); + }); + } else { + thrust::fill_n( + handle.get_thrust_policy(), value_firsts[i], static_cast(edge_counts[i]), input); + } } } } @@ -79,8 +138,6 @@ void fill_edge_property(raft::handle_t const& handle, edge_property_t& edge_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index 201c08325d7..469bfcb4e47 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
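(For reference: the masked packed-bool path in the fill_edge_property change above updates each 32-bit word with a read-modify-write blend, so bits whose mask bit is set take the fill value while masked-off bits keep their previous contents; the patch applies the same blend to the final partial word and additionally clears the bits past edge_counts[i] with packed_bool_partial_mask(rem). A minimal host-side C++ sketch of that blend follows; the helper name masked_fill_word and the sample bit patterns are illustrative assumptions, not cuGraph API.)

#include <cassert>
#include <cstdint>

// Blend `input` into `old_value`, touching only the bits set in `mask`;
// mirrors the `(old_value & ~mask) | (packed_input & mask)` expression above.
uint32_t masked_fill_word(uint32_t old_value, uint32_t mask, bool input)
{
  uint32_t packed_input = input ? 0xffffffffu : 0x00000000u;  // full vs. empty fill word
  return (old_value & ~mask) | (packed_input & mask);
}

int main()
{
  assert(masked_fill_word(0x0u, 0x5u, true) == 0x5u);   // only the masked bits become 1
  assert(masked_fill_word(0xfu, 0x5u, false) == 0xau);  // only the masked bits are cleared
  return 0;
}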
@@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh index 5fee97790f1..4c5c43c7d1e 100644 --- a/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh +++ b/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh @@ -44,6 +44,8 @@ #include #include +#include + #include #include @@ -596,8 +598,9 @@ rmm::device_uvector get_sampling_index_without_replacement( multiplier_t{high_partition_over_sampling_K}), thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( - size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }), + cuda::proclaim_return_type( + [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })), handle.get_stream()); if (tmp_storage_bytes > d_tmp_storage.size()) { d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, handle.get_stream()); @@ -615,8 +618,9 @@ rmm::device_uvector get_sampling_index_without_replacement( multiplier_t{high_partition_over_sampling_K}), thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( - size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; }), + cuda::proclaim_return_type( + [high_partition_over_sampling_K, unique_counts = unique_counts.data()] __device__( + size_t i) { return i * high_partition_over_sampling_K + unique_counts[i]; })), handle.get_stream()); // copy the neighbor indices back to sample_nbr_indices diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 0b6c6a554bb..24b4f0857b1 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
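(For reference: the cuda::proclaim_return_type wrappers introduced above, and repeated throughout the prims below, are needed because the return type of an extended __device__ lambda cannot be queried from host code, so thrust::make_transform_iterator cannot otherwise deduce the iterator's value_type. A minimal sketch of the pattern, assuming a .cu translation unit compiled by nvcc with --extended-lambda; the function and parameter names are illustrative only.)

#include <cstddef>

#include <cuda/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

void offset_iterator_example(std::size_t K, std::size_t const* counts)
{
  // proclaim_return_type<std::size_t> declares the lambda's return type to
  // host code, letting transform_iterator deduce its value_type without
  // invoking the __device__ body on the host.
  auto offset_first = thrust::make_transform_iterator(
    thrust::make_counting_iterator(std::size_t{0}),
    cuda::proclaim_return_type<std::size_t>(
      [K, counts] __device__(std::size_t i) { return i * K + counts[i]; }));
  (void)offset_first;  // offset_first[i] yields i * K + counts[i] when read on the device
}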
@@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -51,6 +52,8 @@ #include #include +#include + #include #include #include @@ -61,11 +64,84 @@ namespace detail { int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + result_t identity_element{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + auto e_op_result = transform_op(i); + if (e_op_result != identity_element) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + size_t output_idx /* relevant only when update_major == true */, + result_t identity_element, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + *(result_value_output + output_idx) = + thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } else { + thrust::for_each( + thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, identity_element, indices, transform_op, result_value_output}); + } +} + template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -102,6 +180,7 @@ __global__ void per_v_transform_reduce_e_hypersparse( while (idx < static_cast(dcs_nzd_vertex_count)) { auto major = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); auto major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region vertex_t const* indices{nullptr}; @@ -109,60 +188,50 @@ edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_idx)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &e_op, - major, - indices, - edge_offset] __device__(auto i) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? 
major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - }; - if constexpr (update_major) { - *(result_value_output + (major - *(edge_partition.major_hypersparse_first()))) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - }); - } + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + major - *(edge_partition).major_hypersparse_first(), + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -173,6 +242,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -207,71 +279,57 @@ __global__ void per_v_transform_reduce_e_low_degree( auto idx = static_cast(tid); while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(static_cast(major_offset)); - auto transform_op = [&edge_partition, - &edge_partition_src_value_input, - &edge_partition_dst_value_input, - &edge_partition_e_value_input, - &e_op, - major_offset, - indices, - edge_offset] __device__(auto i) { - auto minor = indices[i]; - auto minor_offset = 
edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - }; - if constexpr (update_major) { - *(result_value_output + idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + auto transform_op = + [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_e_op(i); + } else { + return identity_element; + } + }; + + update_result_value_output(edge_partition, + indices, + local_degree, + transform_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } else { - if constexpr (GraphViewType::is_multi_gpu) { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - }); - } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - [&edge_partition, indices, &result_value_output, &transform_op] __device__(auto i) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - }); - } + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + idx, + identity_element, + result_value_output); } idx += gridDim.x * blockDim.x; } @@ -282,6 +340,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -325,41 +385,61 @@ __global__ void per_v_transform_reduce_e_mid_degree( raft::warp_size()]; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + 
edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = lane_id == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (edge_partition_e_mask) { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } + if constexpr (update_major) { reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) .Reduce(reduced_e_op_result, reduce_op); @@ -375,6 +455,7 @@ template edge_partition_e_mask, ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, @@ -414,41 +496,61 @@ __global__ void per_v_transform_reduce_e_high_degree( typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = major_start_offset + idx; + auto major_offset = static_cast(major_start_offset + idx); + auto major = edge_partition.major_from_major_offset_nocheck(major_offset); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + 
edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + major, + major_offset, + indices, + edge_offset}; + [[maybe_unused]] auto reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (edge_partition_e_mask) { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } + if constexpr (update_major) { reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } @@ -591,10 +693,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, value_size = sizeof(T); } - auto avg_vertex_degree = graph_view.number_of_vertices() > 0 - ? (static_cast(graph_view.number_of_edges()) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; + auto avg_vertex_degree = + graph_view.number_of_vertices() > 0 + ? 
(static_cast(graph_view.compute_number_of_edges(handle)) / + static_cast(graph_view.number_of_vertices())) + : double{0.0}; num_streams = std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / @@ -653,10 +756,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; auto major_init = ReduceOp::identity_element; if constexpr (update_major) { @@ -734,9 +845,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -758,9 +871,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { @@ -781,6 +896,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, segment_output_buffer, e_op, major_init, @@ -803,6 +919,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, @@ -822,9 +939,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, output_buffer, e_op, major_init, + ReduceOp::identity_element, reduce_op); } } @@ -940,16 +1059,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_init); auto value_first = thrust::make_transform_iterator( view.value_first(), - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); }); - thrust::scatter( - handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { return key - key_first; }), - tx_buffer_first); + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); device_reduce(major_comm, tx_buffer_first, vertex_value_output_first, @@ -1050,8 +1172,6 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, 
VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -1131,8 +1251,6 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/transform_e.cuh b/cpp/src/prims/transform_e.cuh index edacdc8a970..93a2d040b60 100644 --- a/cpp/src/prims/transform_e.cuh +++ b/cpp/src/prims/transform_e.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,12 @@ #pragma once #include +#include #include #include #include #include +#include #include #include @@ -44,6 +46,7 @@ template __global__ void transform_e_packed_bool( @@ -53,6 +56,7 @@ __global__ void transform_e_packed_bool( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, EdgePartitionEdgeValueOutputWrapper edge_partition_e_value_output, EdgeOp e_op) { @@ -68,11 +72,14 @@ __global__ void transform_e_packed_bool( auto num_edges = edge_partition.number_of_edges(); while (idx < static_cast(packed_bool_size(num_edges))) { + auto edge_mask = packed_bool_full_mask(); + if (edge_partition_e_mask) { edge_mask = *((*edge_partition_e_mask).value_first() + idx); } + auto local_edge_idx = idx * static_cast(packed_bools_per_word()) + static_cast(lane_id); - uint32_t mask{0}; int predicate{0}; - if (local_edge_idx < num_edges) { + + if ((local_edge_idx < num_edges) && (edge_mask & packed_bool_mask(lane_id))) { auto major_idx = edge_partition.major_idx_from_local_edge_idx_nocheck(local_edge_idx); auto major = edge_partition.major_from_major_idx_nocheck(major_idx); auto major_offset = edge_partition.major_offset_from_major_nocheck(major); @@ -91,8 +98,15 @@ __global__ void transform_e_packed_bool( ? int{1} : int{0}; } - mask = __ballot_sync(uint32_t{0xffffffff}, predicate); - if (lane_id == 0) { *(edge_partition_e_value_output.value_first() + idx) = mask; } + uint32_t new_val = __ballot_sync(uint32_t{0xffffffff}, predicate); + if (lane_id == 0) { + if (edge_mask == packed_bool_full_mask()) { + *(edge_partition_e_value_output.value_first() + idx) = new_val; + } else { + auto old_val = *(edge_partition_e_value_output.value_first() + idx); + *(edge_partition_e_value_output.value_first() + idx) = (old_val & ~edge_mask) | new_val; + } + } idx += static_cast(gridDim.x * (blockDim.x / raft::warp_size())); } @@ -178,12 +192,18 @@ void transform_e(raft::handle_t const& handle, typename EdgeValueOutputWrapper::value_iterator, typename EdgeValueOutputWrapper::value_type>; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); + auto edge_mask_view = graph_view.edge_mask_view(); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -214,35 +234,40 @@ void transform_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, edge_partition_e_value_output, e_op); } } else { - thrust::transform( + thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(num_edges), - edge_partition_e_value_output.value_first(), [e_op, edge_partition, edge_partition_src_value_input, edge_partition_dst_value_input, - edge_partition_e_value_input] __device__(edge_t i) { - auto major_idx = edge_partition.major_idx_from_local_edge_idx_nocheck(i); - auto major = edge_partition.major_from_major_idx_nocheck(major_idx); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = *(edge_partition.indices() + i); - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; - auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(i)); + edge_partition_e_value_input, + edge_partition_e_mask, + edge_partition_e_value_output] __device__(edge_t i) { + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(i)) { + auto major_idx = edge_partition.major_idx_from_local_edge_idx_nocheck(i); + auto major = edge_partition.major_from_major_idx_nocheck(major_idx); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto minor = *(edge_partition.indices() + i); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed ? minor_offset : major_offset; + auto dst_offset = GraphViewType::is_storage_transposed ? major_offset : minor_offset; + auto e_op_result = e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(i)); + edge_partition_e_value_output.set(i, e_op_result); + } }); } } @@ -336,14 +361,12 @@ void transform_e(raft::handle_t const& handle, typename EdgeValueOutputWrapper::value_iterator, typename EdgeValueOutputWrapper::value_type>; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - auto major_first = GraphViewType::is_storage_transposed ? edge_list.dst_begin() : edge_list.src_begin(); auto minor_first = GraphViewType::is_storage_transposed ? 
edge_list.src_begin() : edge_list.dst_begin(); - auto edge_first = thrust::make_zip_iterator(thrust::make_tuple(major_first, minor_first)); + auto edge_first = thrust::make_zip_iterator(major_first, minor_first); if (do_expensive_check) { CUGRAPH_EXPECTS( @@ -382,10 +405,18 @@ void transform_e(raft::handle_t const& handle, edge_partition_offsets.back() = edge_list.size(); } + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; if (do_expensive_check) { CUGRAPH_EXPECTS( @@ -393,31 +424,30 @@ void transform_e(raft::handle_t const& handle, handle.get_thrust_policy(), edge_first + edge_partition_offsets[i], edge_first + edge_partition_offsets[i + 1], - [edge_partition] __device__(thrust::tuple edge) { - auto major = thrust::get<0>(edge); - auto minor = thrust::get<1>(edge); - vertex_t major_idx{}; - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - if (major_hypersparse_first) { - if (major < *major_hypersparse_first) { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } else { - auto major_hypersparse_idx = - edge_partition.major_hypersparse_idx_from_major_nocheck(major); - if (!major_hypersparse_idx) { return true; } - major_idx = - edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } - } else { - major_idx = edge_partition.major_offset_from_major_nocheck(major); - } + [edge_partition, + edge_partition_e_mask] __device__(thrust::tuple edge) { + auto major = thrust::get<0>(edge); + auto minor = thrust::get<1>(edge); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (!major_idx) { return true; } vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); - auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); - return *it != minor; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto lower_it = + thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + if (*lower_it != minor) { return true; } + if (edge_partition_e_mask) { + auto upper_it = + thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor); + if (detail::count_set_bits((*edge_partition_e_mask).value_first(), + edge_offset + thrust::distance(indices, lower_it), + thrust::distance(lower_it, upper_it)) == 0) { + return true; + } + } + return false; }) == 0, "Invalid input arguments: edge_list contains edges that do not exist in the input graph."); } @@ -446,30 +476,23 @@ void transform_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, edge_partition_e_value_output] __device__(thrust::tuple edge) { auto major = thrust::get<0>(edge); auto minor = thrust::get<1>(edge); - auto major_hypersparse_first = edge_partition.major_hypersparse_first(); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t major_idx{major_offset}; - - if ((major_hypersparse_first) && (major >= *major_hypersparse_first)) { - auto major_hypersparse_idx = - 
edge_partition.major_hypersparse_idx_from_major_nocheck(major); - assert(major_hypersparse_idx); - major_idx = edge_partition.major_offset_from_major_nocheck(*major_hypersparse_first) + - *major_hypersparse_idx; - } + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + assert(major_idx); auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_idx); + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(*major_idx); auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); - auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, lower_it, indices + local_degree, minor); auto src = GraphViewType::is_storage_transposed ? minor : major; auto dst = GraphViewType::is_storage_transposed ? major : minor; @@ -478,14 +501,17 @@ void transform_e(raft::handle_t const& handle, for (auto it = lower_it; it != upper_it; ++it) { assert(*it == minor); - auto e_op_result = - e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it))); - edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it), - e_op_result); + if (!edge_partition_e_mask || + ((*edge_partition_e_mask).get(edge_offset + thrust::distance(indices, it)))) { + auto e_op_result = + e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + thrust::distance(indices, it))); + edge_partition_e_value_output.set(edge_offset + thrust::distance(indices, it), + e_op_result); + } } }); } diff --git a/cpp/src/prims/transform_reduce_e.cuh b/cpp/src/prims/transform_reduce_e.cuh index 9c23f3fca18..483ab64dcd9 100644 --- a/cpp/src/prims/transform_reduce_e.cuh +++ b/cpp/src/prims/transform_reduce_e.cuh @@ -56,6 +56,7 @@ template __global__ void transform_reduce_e_hypersparse( @@ -65,6 +66,7 @@ __global__ void transform_reduce_e_hypersparse( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -101,24 +103,31 @@ __global__ void transform_reduce_e_hypersparse( &edge_partition_src_value_input, &edge_partition_dst_value_input, &edge_partition_e_value_input, + &edge_partition_e_mask, &e_op, major, indices, edge_offset] __device__(auto i) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed ? minor : major; - auto dst = GraphViewType::is_storage_transposed ? major : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? 
static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed ? minor : major; + auto dst = GraphViewType::is_storage_transposed ? major : minor; + auto src_offset = GraphViewType::is_storage_transposed + ? minor_offset + : static_cast(major_offset); + auto dst_offset = GraphViewType::is_storage_transposed + ? static_cast(major_offset) + : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } else { + return e_op_result_t{}; + } }, e_op_result_t{}, edge_property_add); @@ -135,6 +144,7 @@ template __global__ void transform_reduce_e_low_degree( @@ -146,6 +156,7 @@ __global__ void transform_reduce_e_low_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -177,27 +188,34 @@ __global__ void transform_reduce_e_low_degree( &edge_partition_src_value_input, &edge_partition_dst_value_input, &edge_partition_e_value_input, + &edge_partition_e_mask, &e_op, major_offset, indices, edge_offset] __device__(auto i) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - return e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = GraphViewType::is_storage_transposed + ? minor_offset + : static_cast(major_offset); + auto dst_offset = GraphViewType::is_storage_transposed + ? 
static_cast(major_offset) + : minor_offset; + return e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + } else { + return e_op_result_t{}; + } }, e_op_result_t{}, edge_property_add); @@ -214,6 +232,7 @@ template __global__ void transform_reduce_e_mid_degree( @@ -225,6 +244,7 @@ __global__ void transform_reduce_e_mid_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -250,24 +270,26 @@ __global__ void transform_reduce_e_mid_degree( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = + GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); + auto dst_offset = + GraphViewType::is_storage_transposed ? 
static_cast(major_offset) : minor_offset; + auto e_op_result = e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + } } idx += gridDim.x * (blockDim.x / raft::warp_size()); } @@ -280,6 +302,7 @@ template __global__ void transform_reduce_e_high_degree( @@ -291,6 +314,7 @@ __global__ void transform_reduce_e_high_degree( EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -313,24 +337,26 @@ __global__ void transform_reduce_e_high_degree( edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - auto src = GraphViewType::is_storage_transposed - ? minor - : edge_partition.major_from_major_offset_nocheck(major_offset); - auto dst = GraphViewType::is_storage_transposed - ? edge_partition.major_from_major_offset_nocheck(major_offset) - : minor; - auto src_offset = - GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); - auto dst_offset = - GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; - auto e_op_result = e_op(src, - dst, - edge_partition_src_value_input.get(src_offset), - edge_partition_dst_value_input.get(dst_offset), - edge_partition_e_value_input.get(edge_offset + i)); - e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + if (!edge_partition_e_mask || (*edge_partition_e_mask).get(edge_offset + i)) { + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + auto src = GraphViewType::is_storage_transposed + ? minor + : edge_partition.major_from_major_offset_nocheck(major_offset); + auto dst = GraphViewType::is_storage_transposed + ? edge_partition.major_from_major_offset_nocheck(major_offset) + : minor; + auto src_offset = + GraphViewType::is_storage_transposed ? minor_offset : static_cast(major_offset); + auto dst_offset = + GraphViewType::is_storage_transposed ? static_cast(major_offset) : minor_offset; + auto e_op_result = e_op(src, + dst, + edge_partition_src_value_input.get(src_offset), + edge_partition_dst_value_input.get(dst_offset), + edge_partition_e_value_input.get(edge_offset + i)); + e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); + } } idx += gridDim.x; } @@ -417,8 +443,6 @@ T transform_reduce_e(raft::handle_t const& handle, typename EdgeValueInputWrapper::value_iterator, typename EdgeValueInputWrapper::value_type>>; - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -431,10 +455,18 @@ T transform_reduce_e(raft::handle_t const& handle, get_dataframe_buffer_begin(result_buffer) + 1, T{}); + auto edge_mask_view = graph_view.edge_mask_view(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -467,6 +499,7 @@ T transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -482,6 +515,7 @@ T transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -497,6 +531,7 @@ T transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -510,6 +545,7 @@ T transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -527,6 +563,7 @@ T transform_reduce_e(raft::handle_t const& handle, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, + edge_partition_e_mask, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -601,8 +638,6 @@ auto transform_reduce_e(raft::handle_t const& handle, edge_op_result_type::type; static_assert(!std::is_same_v); - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 745f1a8fd8e..18e722d62cc 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,7 @@ template -struct call_e_op_t { +struct transform_reduce_v_frontier_call_e_op_t { EdgeOp e_op{}; __device__ thrust::optional< @@ -331,13 +331,13 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, // 1. 
fill the buffer - detail::call_e_op_t + detail::transform_reduce_v_frontier_call_e_op_t e_op_wrapper{e_op}; auto [key_buffer, payload_buffer] = diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 2d72a075ca5..0c7058cccb4 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -44,6 +44,8 @@ #include #include +#include + #include #include #include @@ -181,13 +183,14 @@ void update_edge_major_property(raft::handle_t const& handle, handle.get_stream()); auto bool_first = thrust::make_transform_iterator( (*edge_partition_keys)[i].begin(), - [rx_value_first, - v_first = graph_view.vertex_partition_range_first( - major_range_vertex_partition_id)] __device__(auto v) { - auto v_offset = v - v_first; - return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & - packed_bool_mask(v_offset)); - }); + cuda::proclaim_return_type( + [rx_value_first, + v_first = graph_view.vertex_partition_range_first( + major_range_vertex_partition_id)] __device__(auto v) { + auto v_offset = v - v_first; + return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & + packed_bool_mask(v_offset)); + })); pack_bools(handle, bool_first, bool_first + (*edge_partition_keys)[i].size(), @@ -202,8 +205,9 @@ void update_edge_major_property(raft::handle_t const& handle, auto v_offset_first = thrust::make_transform_iterator( (*edge_partition_keys)[i].begin(), - [v_first = graph_view.vertex_partition_range_first( - major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }); + cuda::proclaim_return_type( + [v_first = graph_view.vertex_partition_range_first( + major_range_vertex_partition_id)] __device__(auto v) { return v - v_first; })); thrust::gather(handle.get_thrust_policy(), v_offset_first, v_offset_first + (*edge_partition_keys)[i].size(), @@ -312,21 +316,24 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (packed_bool) { auto bool_first = thrust::make_transform_iterator( - vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) { + vertex_first, + cuda::proclaim_return_type([vertex_property_input_first, + vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); return static_cast( *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_bools(handle, bool_first, bool_first + thrust::distance(vertex_first, vertex_last), rx_value_first); } else { - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + auto map_first = thrust::make_transform_iterator( + vertex_first, + cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) thrust::gather(handle.get_thrust_policy(), @@ -391,9 +398,10 @@ void update_edge_major_property(raft::handle_t const& handle, }); } else { auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [edge_partition] __device__(auto v) { + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { return edge_partition.major_offset_from_major_nocheck(v); - }); + })); // FIXME: this scatter 
is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), @@ -471,7 +479,8 @@ void update_edge_minor_property(raft::handle_t const& handle, bcast_size *= sizeof(typename EdgeMinorPropertyOutputWrapper::value_type); } auto num_concurrent_bcasts = - (static_cast(graph_view.number_of_edges() / comm_size) * sizeof(vertex_t)) / + (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * + sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); @@ -593,13 +602,14 @@ void update_edge_minor_property(raft::handle_t const& handle, auto bool_first = thrust::make_transform_iterator( (*edge_partition_keys).begin() + key_offsets[j], - [rx_value_first, - v_first = graph_view.vertex_partition_range_first( - minor_range_vertex_partition_id)] __device__(auto v) { - auto v_offset = v - v_first; - return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & - packed_bool_mask(v_offset)); - }); + cuda::proclaim_return_type( + [rx_value_first, + v_first = graph_view.vertex_partition_range_first( + minor_range_vertex_partition_id)] __device__(auto v) { + auto v_offset = v - v_first; + return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & + packed_bool_mask(v_offset)); + })); pack_unaligned_bools( handle, bool_first, @@ -611,10 +621,10 @@ void update_edge_minor_property(raft::handle_t const& handle, std::get>(key_offsets_or_rx_displacements); auto bool_first = thrust::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - [rx_value_first] __device__(vertex_t v_offset) { + cuda::proclaim_return_type([rx_value_first] __device__(vertex_t v_offset) { return static_cast(*(rx_value_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_unaligned_bools( handle, bool_first, @@ -630,8 +640,9 @@ void update_edge_minor_property(raft::handle_t const& handle, auto v_offset_first = thrust::make_transform_iterator( (*edge_partition_keys).begin() + key_offsets[j], - [v_first = graph_view.vertex_partition_range_first( - minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; }); + cuda::proclaim_return_type( + [v_first = graph_view.vertex_partition_range_first( + minor_range_vertex_partition_id)] __device__(auto v) { return v - v_first; })); thrust::gather(handle.get_thrust_policy(), v_offset_first, v_offset_first + (key_offsets[j + 1] - key_offsets[j]), @@ -718,21 +729,24 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (packed_bool) { auto bool_first = thrust::make_transform_iterator( - vertex_first, [vertex_property_input_first, vertex_partition] __device__(auto v) { + vertex_first, + cuda::proclaim_return_type([vertex_property_input_first, + vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); return static_cast( *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); - }); + })); pack_bools(handle, bool_first, bool_first + thrust::distance(vertex_first, vertex_last), rx_value_first); } else { - auto map_first = - thrust::make_transform_iterator(vertex_first, [vertex_partition] __device__(auto v) { + auto map_first = thrust::make_transform_iterator( + vertex_first, + 
cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - }); + })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) thrust::gather(handle.get_thrust_policy(), @@ -799,9 +813,10 @@ void update_edge_minor_property(raft::handle_t const& handle, }); } else { auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), [edge_partition] __device__(auto v) { + rx_vertices.begin(), + cuda::proclaim_return_type([edge_partition] __device__(auto v) { return edge_partition.minor_offset_from_minor_nocheck(v); - }); + })); // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), @@ -866,8 +881,6 @@ void update_edge_src_property( edge_src_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -917,8 +930,6 @@ void update_edge_src_property( edge_src_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), @@ -985,8 +996,6 @@ void update_edge_dst_property( edge_dst_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { // currently, nothing to do } @@ -1036,8 +1045,6 @@ void update_edge_dst_property( edge_dst_property_output, bool do_expensive_check = false) { - CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); - if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh index 6a7334e9f1a..5a9ded02009 100644 --- a/cpp/src/sampling/random_walks.cuh +++ b/cpp/src/sampling/random_walks.cuh @@ -52,6 +52,8 @@ #include #include +#include + #include #include // FIXME: requirement for temporary std::getenv() #include @@ -378,7 +380,8 @@ struct random_walker_t { // scatter d_src_init_v to coalesced vertex vector: // - auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; }; + auto dlambda = cuda::proclaim_return_type( + [stride = max_depth_] __device__(auto indx) { return indx * stride; }); // use the transform iterator as map: // @@ -539,10 +542,11 @@ struct random_walker_t { // delta = ptr_d_sizes[indx] - 1 // - auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { - auto delta = ptr_d_sizes[indx] - 1; - return ptr_d_coalesced[indx * stride + delta]; - }; + auto dlambda = cuda::proclaim_return_type( + [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - 1; + return ptr_d_coalesced[indx * stride + delta]; + }); // use the transform iterator as map: // @@ -587,10 +591,11 @@ struct random_walker_t { { index_t const* ptr_d_sizes = original::raw_const_ptr(d_sizes); - auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) { - auto delta = ptr_d_sizes[indx] - adjust - 1; - return indx * stride + delta; - }; + auto dlambda = + cuda::proclaim_return_type([stride, adjust, ptr_d_sizes] __device__(auto indx) { + auto delta = ptr_d_sizes[indx] - adjust - 1; + return indx * stride + delta; + }); // use 
the transform iterator as map: // diff --git a/cpp/src/sampling/sampling_post_processing_impl.cuh b/cpp/src/sampling/sampling_post_processing_impl.cuh index 77d4f2d865f..852d82e78ab 100644 --- a/cpp/src/sampling/sampling_post_processing_impl.cuh +++ b/cpp/src/sampling/sampling_post_processing_impl.cuh @@ -40,6 +40,8 @@ #include #include +#include + #include namespace cugraph { @@ -1229,10 +1231,12 @@ renumber_and_compress_sampled_edgelist( auto pair_first = thrust::make_zip_iterator((*compressed_label_indices).begin(), (*compressed_hops).begin()); auto value_pair_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), [num_hops] __device__(size_t i) { - return thrust::make_tuple(static_cast(i / num_hops), - static_cast(i % num_hops)); - }); + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type>( + [num_hops] __device__(size_t i) { + return thrust::make_tuple(static_cast(i / num_hops), + static_cast(i % num_hops)); + })); thrust::upper_bound(handle.get_thrust_policy(), pair_first, pair_first + (*compressed_label_indices).size(), diff --git a/cpp/src/structure/detail/structure_utils.cuh b/cpp/src/structure/detail/structure_utils.cuh index c49b62e4543..f0f729bce18 100644 --- a/cpp/src/structure/detail/structure_utils.cuh +++ b/cpp/src/structure/detail/structure_utils.cuh @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -524,34 +525,21 @@ std::tuple> mark_entries(raft::handle_t co return word; }); - size_t bit_count = thrust::transform_reduce( - handle.get_thrust_policy(), - marked_entries.begin(), - marked_entries.end(), - [] __device__(auto word) { return __popc(word); }, - size_t{0}, - thrust::plus()); + size_t bit_count = detail::count_set_bits(handle, marked_entries.begin(), num_entries); return std::make_tuple(bit_count, std::move(marked_entries)); } template -rmm::device_uvector remove_flagged_elements(raft::handle_t const& handle, - rmm::device_uvector&& vector, - raft::device_span remove_flags, - size_t remove_count) +rmm::device_uvector keep_flagged_elements(raft::handle_t const& handle, + rmm::device_uvector&& vector, + raft::device_span keep_flags, + size_t keep_count) { - rmm::device_uvector result(vector.size() - remove_count, handle.get_stream()); - - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(vector.size()), - thrust::make_transform_output_iterator(result.begin(), - indirection_t{vector.data()}), - [remove_flags] __device__(size_t i) { - return !(remove_flags[cugraph::packed_bool_offset(i)] & cugraph::packed_bool_mask(i)); - }); + rmm::device_uvector result(keep_count, handle.get_stream()); + + detail::copy_if_mask_set( + handle, vector.begin(), vector.end(), keep_flags.begin(), result.begin()); return result; } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 75862266789..6568b5e3b9e 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -133,8 +133,7 @@ update_local_sorted_unique_edge_majors_minors( graph_meta_t const& meta, std::vector> const& edge_partition_offsets, std::vector> const& edge_partition_indices, - std::optional>> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts) + std::optional>> const& edge_partition_dcs_nzd_vertices) { auto& comm = handle.get_comms(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -341,8 +340,7 @@ 
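// Illustrative sketch, not part of the patch: what the new
// detail::count_set_bits(handle, flags, num_bits) call in mark_entries()
// computes, written as the thrust::transform_reduce it replaces (a popcount
// per 32-bit word, summed). The real helper additionally masks the trailing
// partial word when num_bits is not a multiple of 32; this naive version
// assumes it is.
#include <cuda/functional>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <cstddef>
#include <cstdint>

size_t count_set_bits_naive(thrust::device_vector<uint32_t> const& words)
{
  return thrust::transform_reduce(
    thrust::device,
    words.begin(),
    words.end(),
    cuda::proclaim_return_type<size_t>(
      [] __device__(uint32_t w) { return static_cast<size_t>(__popc(w)); }),
    size_t{0},
    thrust::plus<size_t>());
}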
update_local_sorted_unique_edge_majors_minors( if (use_dcs) { thrust::copy(handle.get_thrust_policy(), (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].begin() + - (*edge_partition_dcs_nzd_vertex_counts)[i], + (*edge_partition_dcs_nzd_vertices)[i].end(), unique_edge_majors.begin() + cur_size); } @@ -390,7 +388,7 @@ graph_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), partition_(meta.partition) { CUGRAPH_EXPECTS( @@ -408,14 +406,6 @@ graph_t((*edge_partition_dcs_nzd_vertices_).size()); - for (size_t i = 0; i < (*edge_partition_dcs_nzd_vertex_counts_).size(); ++i) { - (*edge_partition_dcs_nzd_vertex_counts_)[i] = - static_cast((*edge_partition_dcs_nzd_vertices_)[i].size()); - } - } // update local sorted unique edge sources/destinations (only if key, value pair will be used) @@ -432,8 +422,7 @@ graph_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, meta.number_of_vertices, static_cast(indices.size()), meta.properties), + meta.number_of_vertices, static_cast(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), segment_offsets_(meta.segment_offsets) diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 64a8a3212b3..7928c61cf7b 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -51,6 +52,8 @@ #include #include +#include + #include #include #include @@ -70,44 +73,15 @@ struct out_of_range_t { __device__ bool operator()(vertex_t v) const { return (v < min) || (v >= max); } }; -template -std::vector update_edge_partition_edge_counts( - std::vector const& edge_partition_offsets, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - std::vector const& edge_partition_segment_offsets, - cudaStream_t stream) -{ - std::vector edge_partition_edge_counts(partition.number_of_local_edge_partitions(), 0); - auto use_dcs = edge_partition_dcs_nzd_vertex_counts.has_value(); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = partition.local_edge_partition_major_range(i); - auto segment_offset_size_per_partition = - edge_partition_segment_offsets.size() / edge_partition_offsets.size(); - raft::update_host( - &(edge_partition_edge_counts[i]), - edge_partition_offsets[i] + - (use_dcs - ? 
(edge_partition_segment_offsets[segment_offset_size_per_partition * i + - detail::num_sparse_segments_per_vertex_partition] + - (*edge_partition_dcs_nzd_vertex_counts)[i]) - : (major_range_last - major_range_first)), - 1, - stream); - } - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - return edge_partition_edge_counts; -} - // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as // major indices) or in-degrees (otherwise) template rmm::device_uvector compute_major_degrees( raft::handle_t const& handle, - std::vector const& edge_partition_offsets, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, - std::optional> const& edge_partition_masks, + std::vector> const& edge_partition_offsets, + std::optional>> const& + edge_partition_dcs_nzd_vertices, + std::optional>> const& edge_partition_masks, partition_t const& partition, std::vector const& edge_partition_segment_offsets) { @@ -159,39 +133,39 @@ rmm::device_uvector compute_major_degrees( thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(major_hypersparse_first - major_range_first), local_degrees.begin(), - [offsets, masks] __device__(auto i) { + cuda::proclaim_return_type([offsets, masks] __device__(auto i) { auto local_degree = offsets[i + 1] - offsets[i]; if (masks) { local_degree = static_cast( - detail::count_set_bits(*masks, offsets[i], local_degree)); + detail::count_set_bits((*masks).begin(), offsets[i], local_degree)); } return local_degree; - }); + })); if (use_dcs) { - auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i]; - auto dcs_nzd_vertex_count = (*edge_partition_dcs_nzd_vertex_counts)[i]; + auto dcs_nzd_vertices = (*edge_partition_dcs_nzd_vertices)[i]; thrust::fill(execution_policy, local_degrees.begin() + (major_hypersparse_first - major_range_first), local_degrees.begin() + (major_range_last - major_range_first), edge_t{0}); - thrust::for_each(execution_policy, - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(dcs_nzd_vertex_count), - [offsets, - dcs_nzd_vertices, - masks, - major_range_first, - major_hypersparse_first, - local_degrees = local_degrees.data()] __device__(auto i) { - auto major_idx = (major_hypersparse_first - major_range_first) + i; - auto local_degree = offsets[major_idx + 1] - offsets[major_idx]; - if (masks) { - local_degree = static_cast( - detail::count_set_bits(*masks, offsets[major_idx], local_degree)); - } - auto v = dcs_nzd_vertices[i]; - local_degrees[v - major_range_first] = local_degree; - }); + thrust::for_each( + execution_policy, + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(dcs_nzd_vertices.size())), + [offsets, + dcs_nzd_vertices, + masks, + major_range_first, + major_hypersparse_first, + local_degrees = local_degrees.data()] __device__(auto i) { + auto major_idx = (major_hypersparse_first - major_range_first) + i; + auto local_degree = offsets[major_idx + 1] - offsets[major_idx]; + if (masks) { + local_degree = static_cast( + detail::count_set_bits((*masks).begin(), offsets[major_idx], local_degree)); + } + auto v = dcs_nzd_vertices[i]; + local_degrees[v - major_range_first] = local_degree; + }); } minor_comm.reduce(local_degrees.data(), i == minor_comm_rank ? 
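// Illustrative sketch, not part of the patch: the masked-degree rule that the
// count_set_bits() calls above implement. With an edge mask attached, a
// vertex's local degree is the number of *set* mask bits inside its CSR
// offset range, not offsets[i + 1] - offsets[i]. Host version for clarity:
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> masked_degrees(std::vector<int64_t> const& offsets,  // CSR offsets, size V+1
                                    std::vector<uint32_t> const& mask)    // one bit per edge
{
  std::vector<int64_t> degrees(offsets.size() - 1);
  for (size_t i = 0; i + 1 < offsets.size(); ++i) {
    int64_t d = 0;
    for (int64_t e = offsets[i]; e < offsets[i + 1]; ++e) {
      if (mask[e / 32] & (uint32_t{1} << (e % 32))) { ++d; }
    }
    degrees[i] = d;
  }
  return degrees;
}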
degrees.data() : static_cast(nullptr), @@ -207,10 +181,11 @@ rmm::device_uvector compute_major_degrees( // compute out-degrees (if we are internally storing edges in the sparse 2D matrix using sources as // major indices) or in-degrees (otherwise) template -rmm::device_uvector compute_major_degrees(raft::handle_t const& handle, - edge_t const* offsets, - std::optional masks, - vertex_t number_of_vertices) +rmm::device_uvector compute_major_degrees( + raft::handle_t const& handle, + raft::device_span offsets, + std::optional> masks, + vertex_t number_of_vertices) { rmm::device_uvector degrees(number_of_vertices, handle.get_stream()); thrust::tabulate( @@ -221,7 +196,7 @@ rmm::device_uvector compute_major_degrees(raft::handle_t const& handle, auto local_degree = offsets[i + 1] - offsets[i]; if (masks) { local_degree = - static_cast(detail::count_set_bits(*masks, offsets[i], local_degree)); + static_cast(detail::count_set_bits((*masks).begin(), offsets[i], local_degree)); } return local_degree; }); @@ -440,28 +415,73 @@ edge_t count_edge_partition_multi_edges( } } +template +std::tuple, std::vector> +compute_edge_indices_and_edge_partition_offsets( + raft::handle_t const& handle, + graph_view_t const& graph_view, + raft::device_span edge_majors, + raft::device_span edge_minors) +{ + auto edge_first = thrust::make_zip_iterator(edge_majors.begin(), edge_minors.begin()); + + rmm::device_uvector edge_indices(edge_majors.size(), handle.get_stream()); + thrust::sequence(handle.get_thrust_policy(), edge_indices.begin(), edge_indices.end(), size_t{0}); + thrust::sort(handle.get_thrust_policy(), + edge_indices.begin(), + edge_indices.end(), + [edge_first] __device__(size_t lhs, size_t rhs) { + return *(edge_first + lhs) < *(edge_first + rhs); + }); + + std::vector h_major_range_lasts(graph_view.number_of_local_edge_partitions()); + for (size_t i = 0; i < h_major_range_lasts.size(); ++i) { + if constexpr (store_transposed) { + h_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + rmm::device_uvector d_major_range_lasts(h_major_range_lasts.size(), handle.get_stream()); + raft::update_device(d_major_range_lasts.data(), + h_major_range_lasts.data(), + h_major_range_lasts.size(), + handle.get_stream()); + rmm::device_uvector d_lower_bounds(d_major_range_lasts.size(), handle.get_stream()); + auto major_first = edge_majors.begin(); + auto sorted_major_first = thrust::make_transform_iterator( + edge_indices.begin(), + cugraph::detail::indirection_t{major_first}); + thrust::lower_bound(handle.get_thrust_policy(), + sorted_major_first, + sorted_major_first + edge_indices.size(), + d_major_range_lasts.begin(), + d_major_range_lasts.end(), + d_lower_bounds.begin()); + std::vector edge_partition_offsets(d_lower_bounds.size() + 1, 0); + raft::update_host(edge_partition_offsets.data() + 1, + d_lower_bounds.data(), + d_lower_bounds.size(), + handle.get_stream()); + handle.sync_stream(); + + return std::make_tuple(std::move(edge_indices), edge_partition_offsets); +} + } // namespace template graph_view_t>:: - graph_view_t(raft::handle_t const& handle, - std::vector const& edge_partition_offsets, - std::vector const& edge_partition_indices, - std::optional> const& edge_partition_dcs_nzd_vertices, - std::optional> const& edge_partition_dcs_nzd_vertex_counts, + graph_view_t(std::vector> const& edge_partition_offsets, + std::vector> const& edge_partition_indices, + std::optional>> const& + 
edge_partition_dcs_nzd_vertices, graph_view_meta_t meta) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), - edge_partition_dcs_nzd_vertex_counts_(edge_partition_dcs_nzd_vertex_counts), - edge_partition_number_of_edges_( - update_edge_partition_edge_counts(edge_partition_offsets, - edge_partition_dcs_nzd_vertex_counts, - meta.partition, - meta.edge_partition_segment_offsets, - handle.get_stream())), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), local_sorted_unique_edge_srcs_(meta.local_sorted_unique_edge_srcs), @@ -479,51 +499,42 @@ graph_view_thandle_ptr()->get_subcomm(cugraph::partition_manager::minor_comm_name()).get_size(); - auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); CUGRAPH_EXPECTS(edge_partition_offsets.size() == edge_partition_indices.size(), "Internal Error: edge_partition_offsets.size() and " "edge_partition_indices.size() should coincide."); - CUGRAPH_EXPECTS(edge_partition_dcs_nzd_vertex_counts.has_value() == use_dcs, - "edge_partition_dcs_nzd_vertices.has_value() and " - "edge_partition_dcs_nzd_vertex_counts.has_value() should coincide"); - CUGRAPH_EXPECTS(!use_dcs || ((*edge_partition_dcs_nzd_vertices).size() == - (*edge_partition_dcs_nzd_vertex_counts).size()), - "Internal Error: edge_partition_dcs_nzd_vertices.size() and " - "edge_partition_dcs_nzd_vertex_counts.size() should coincide (if used)."); CUGRAPH_EXPECTS( !use_dcs || ((*edge_partition_dcs_nzd_vertices).size() == edge_partition_offsets.size()), "Internal Error: edge_partition_dcs_nzd_vertices.size() should coincide " "with edge_partition_offsets.size() (if used)."); - CUGRAPH_EXPECTS(edge_partition_offsets.size() == static_cast(minor_comm_size), - "Internal Error: erroneous edge_partition_offsets.size()."); - - CUGRAPH_EXPECTS( - meta.edge_partition_segment_offsets.size() == - minor_comm_size * (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 3 : 2)), - "Internal Error: invalid edge_partition_segment_offsets.size()."); + CUGRAPH_EXPECTS(meta.edge_partition_segment_offsets.size() == + edge_partition_offsets.size() * + (detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 
3 : 2)), "Internal Error: invalid edge_partition_segment_offsets.size()."); // skip expensive error checks as this function is only called by graph_t } template graph_view_t>:: graph_view_t(raft::device_span offsets, raft::device_span indices, graph_view_meta_t meta) : detail::graph_base_t( - handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + meta.number_of_vertices, meta.number_of_edges, meta.properties), offsets_(offsets), indices_(indices), segment_offsets_(meta.segment_offsets) { // cheap error checks + CUGRAPH_EXPECTS(offsets.size() == static_cast(meta.number_of_vertices + 1), + "Internal Error: offsets.size() returns an invalid value."); + CUGRAPH_EXPECTS(indices.size() == static_cast(meta.number_of_edges), + "Internal Error: indices.size() returns an invalid value."); + CUGRAPH_EXPECTS( !(meta.segment_offsets).has_value() || ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), @@ -532,23 +543,66 @@ graph_view_t +edge_t graph_view_t>:: + compute_number_of_edges(raft::handle_t const& handle) const +{ + if (this->has_edge_mask()) { + edge_t ret{}; + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < value_firsts.size(); ++i) { + ret += static_cast(detail::count_set_bits(handle, value_firsts[i], edge_counts[i])); + } + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); + return ret; + } else { + return this->number_of_edges_; + } +} + +template +edge_t graph_view_t>:: + compute_number_of_edges(raft::handle_t const& handle) const +{ + if (this->has_edge_mask()) { + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + assert(value_firsts.size() == 1); + assert(edge_counts.size() == 1); + return static_cast(detail::count_set_bits(handle, value_firsts[0], edge_counts[0])); + } else { + return this->number_of_edges_; + } +} + template rmm::device_uvector graph_view_t>:: compute_in_degrees(raft::handle_t const& handle) const { if (store_transposed) { + std::optional>> edge_partition_masks{ + std::nullopt}; + if (this->has_edge_mask()) { + edge_partition_masks = + std::vector>(this->edge_partition_offsets_.size()); + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) { + (*edge_partition_masks)[i] = + raft::device_span(value_firsts[i], edge_counts[i]); + } + } return compute_major_degrees(handle, this->edge_partition_offsets_, this->edge_partition_dcs_nzd_vertices_, - this->edge_partition_dcs_nzd_vertex_counts_, - this->has_edge_mask() - ? std::make_optional((*(this->edge_mask_view())).value_firsts()) - : std::nullopt, + edge_partition_masks, this->partition_, this->edge_partition_segment_offsets_); } else { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } } @@ -559,14 +613,16 @@ graph_view_toffsets_, - this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0]) - : std::nullopt, - this->local_vertex_partition_range_size()); + return compute_major_degrees(handle, + this->offsets_, + this->has_edge_mask() + ?
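// Illustrative sketch, not part of the patch: what the new
// compute_number_of_edges() overloads above return when an edge mask is
// attached. The cached edge count is no longer valid, so the result is the
// number of set bits summed over every local edge partition's mask (the
// multi-GPU overload then SUM-allreduces across ranks). Host version using
// C++20 std::popcount; trailing partial words are assumed to be zeroed.
#include <bit>
#include <cstdint>
#include <vector>

int64_t masked_edge_count(std::vector<std::vector<uint32_t>> const& partition_masks)
{
  int64_t n = 0;
  for (auto const& mask : partition_masks) {
    for (uint32_t w : mask) { n += std::popcount(w); }
  }
  return n;  // multi-GPU: followed by an allreduce-sum over all ranks
}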
std::make_optional(raft::device_span( + (*(this->edge_mask_view())).value_firsts()[0], + (*(this->edge_mask_view())).edge_counts()[0])) + : std::nullopt, + this->local_vertex_partition_range_size()); } else { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } } @@ -577,16 +633,25 @@ graph_view_thas_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } else { + std::optional>> edge_partition_masks{ + std::nullopt}; + if (this->has_edge_mask()) { + edge_partition_masks = + std::vector>(this->edge_partition_offsets_.size()); + auto value_firsts = (*(this->edge_mask_view())).value_firsts(); + auto edge_counts = (*(this->edge_mask_view())).edge_counts(); + for (size_t i = 0; i < (*edge_partition_masks).size(); ++i) { + (*edge_partition_masks)[i] = + raft::device_span(value_firsts[i], edge_counts[i]); + } + } return compute_major_degrees(handle, this->edge_partition_offsets_, this->edge_partition_dcs_nzd_vertices_, - this->edge_partition_dcs_nzd_vertex_counts_, - this->has_edge_mask() - ? std::make_optional((*(this->edge_mask_view())).value_firsts()) - : std::nullopt, + edge_partition_masks, this->partition_, this->edge_partition_segment_offsets_); } @@ -598,15 +663,17 @@ graph_view_thas_edge_mask()), "unimplemented."); return compute_minor_degrees(handle, *this); } else { - return compute_major_degrees( - handle, - this->offsets_, - this->has_edge_mask() ? std::make_optional((*(this->edge_mask_view())).value_firsts()[0]) - : std::nullopt, - this->local_vertex_partition_range_size()); + return compute_major_degrees(handle, + this->offsets_, + this->has_edge_mask() + ? std::make_optional(raft::device_span( + (*(this->edge_mask_view())).value_firsts()[0], + (*(this->edge_mask_view())).edge_counts()[0])) + : std::nullopt, + this->local_vertex_partition_range_size()); } } @@ -614,7 +681,7 @@ template >:: compute_max_in_degree(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); auto in_degrees = compute_in_degrees(handle); auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end()); @@ -632,7 +699,7 @@ template >:: compute_max_in_degree(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); auto in_degrees = compute_in_degrees(handle); auto it = thrust::max_element(handle.get_thrust_policy(), in_degrees.begin(), in_degrees.end()); @@ -646,7 +713,7 @@ template >:: compute_max_out_degree(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); auto out_degrees = compute_out_degrees(handle); auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end()); @@ -664,7 +731,7 @@ template >:: compute_max_out_degree(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); auto out_degrees = compute_out_degrees(handle); auto it = thrust::max_element(handle.get_thrust_policy(), out_degrees.begin(), out_degrees.end()); @@ -678,7 +745,7 @@ template >:: count_self_loops(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), 
"unimplemented."); return count_if_e( handle, @@ -693,7 +760,7 @@ template >:: count_self_loops(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); return count_if_e( handle, @@ -708,7 +775,7 @@ template >:: count_multi_edges(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); if (!this->is_multigraph()) { return edge_t{0}; } @@ -728,7 +795,7 @@ template >:: count_multi_edges(raft::handle_t const& handle) const { - CUGRAPH_EXPECTS(!has_edge_mask(), "unimplemented."); + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); if (!this->is_multigraph()) { return edge_t{0}; } @@ -738,4 +805,293 @@ edge_t graph_view_tlocal_edge_partition_segment_offsets()); } +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform(handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator( + ret.begin(), edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto it = thrust::lower_bound( + thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask) + .get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + } else { + return false; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>::has_edge( + raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? 
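// Illustrative sketch, not part of the patch: the per-edge existence test the
// device lambda above performs. Neighbor lists are sorted within each major's
// CSR row, so a (major, minor) lookup is a binary search; with an edge mask, a
// hit additionally requires the corresponding mask bit to be set. Host version
// for clarity:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

bool has_edge(std::vector<int64_t> const& offsets,  // CSR offsets, size V+1
              std::vector<int32_t> const& indices,  // sorted within each row
              std::vector<uint32_t> const* mask,    // optional packed-bool edge mask
              int32_t major,
              int32_t minor)
{
  auto first = indices.begin() + offsets[major];
  auto last  = indices.begin() + offsets[major + 1];
  auto it    = std::lower_bound(first, last, minor);
  if (it == last || *it != minor) { return false; }
  if (mask) {
    auto e = static_cast<size_t>(it - indices.begin());
    return ((*mask)[e / 32] & (uint32_t{1} << (e % 32))) != 0;
  }
  return true;
}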
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + if ((it != indices + local_degree) && *it == minor) { + if (edge_partition_e_mask) { + return (*edge_partition_e_mask).get(local_edge_offset + thrust::distance(indices, it)); + } else { + return true; + } + } else { + return false; + } + }); + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto [edge_indices, edge_partition_offsets] = + compute_edge_indices_and_edge_partition_offsets(handle, + *this, + store_transposed ? edge_dsts : edge_srcs, + store_transposed ? edge_srcs : edge_dsts); + + auto edge_mask_view = this->edge_mask_view(); + + auto sorted_edge_first = thrust::make_transform_iterator( + edge_indices.begin(), cugraph::detail::indirection_t{edge_first}); + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + for (size_t i = 0; i < this->number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + sorted_edge_first + edge_partition_offsets[i], + sorted_edge_first + edge_partition_offsets[i + 1], + thrust::make_permutation_iterator(ret.begin(), + edge_indices.begin() + edge_partition_offsets[i]), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_idx = edge_partition.major_idx_from_major_nocheck(major); + if (major_idx) { + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(*major_idx); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + } else { + return edge_t{0}; + } + }); + } + + return ret; +} + +template +rmm::device_uvector +graph_view_t>:: + compute_multiplicity(raft::handle_t const& handle, + raft::device_span edge_srcs, + raft::device_span edge_dsts, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(this->is_multigraph(), "Use has_edge() instead for non-multigraphs."); + CUGRAPH_EXPECTS( + edge_srcs.size() == edge_dsts.size(), + "Invalid input arguments: edge_srcs.size() does not coincide with edge_dsts.size()."); + + auto edge_first = + thrust::make_zip_iterator(store_transposed ? edge_dsts.begin() : edge_srcs.begin(), + store_transposed ? edge_srcs.begin() : edge_dsts.begin()); + + if (do_expensive_check) { + auto num_invalids = + detail::count_invalid_vertex_pairs(handle, *this, edge_first, edge_first + edge_srcs.size()); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: there are invalid edge (src, dst) pairs."); + } + + auto edge_mask_view = this->edge_mask_view(); + + rmm::device_uvector ret(edge_srcs.size(), handle.get_stream()); + + auto edge_partition = + edge_partition_device_view_t(this->local_edge_partition_view()); + auto edge_partition_e_mask = + edge_mask_view + ? 
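// Illustrative sketch, not part of the patch: the multiplicity computation in
// the device lambda above. In a sorted neighbor list all copies of `minor`
// form one contiguous run, so multiplicity is upper_bound - lower_bound; with
// an edge mask, only the set bits within that run are counted. Host version:
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t edge_multiplicity(std::vector<int64_t> const& offsets,
                          std::vector<int32_t> const& indices,
                          std::vector<uint32_t> const* mask,
                          int32_t major,
                          int32_t minor)
{
  auto first = indices.begin() + offsets[major];
  auto last  = indices.begin() + offsets[major + 1];
  auto lo    = std::lower_bound(first, last, minor);
  auto hi    = std::upper_bound(first, last, minor);
  if (!mask) { return hi - lo; }
  int64_t m = 0;
  for (auto it = lo; it != hi; ++it) {  // popcount over the run's mask bits
    auto e = static_cast<size_t>(it - indices.begin());
    if ((*mask)[e / 32] & (uint32_t{1} << (e % 32))) { ++m; }
  }
  return m;
}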
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, 0) + : thrust::nullopt; + thrust::transform( + handle.get_thrust_policy(), + edge_first, + edge_first + edge_srcs.size(), + ret.begin(), + [edge_partition, edge_partition_e_mask] __device__(auto e) { + auto major = thrust::get<0>(e); + auto minor = thrust::get<1>(e); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + edge_partition.local_edges(major_offset); + auto lower_it = thrust::lower_bound(thrust::seq, indices, indices + local_degree, minor); + auto upper_it = thrust::upper_bound(thrust::seq, indices, indices + local_degree, minor); + auto multiplicity = static_cast(thrust::distance(lower_it, upper_it)); + if (edge_partition_e_mask && (multiplicity > 0)) { + multiplicity = static_cast(detail::count_set_bits( + (*edge_partition_e_mask).value_first(), + static_cast(local_edge_offset + thrust::distance(indices, lower_it)), + static_cast(multiplicity))); + } + return multiplicity; + }); + + return ret; +} + } // namespace cugraph diff --git a/cpp/src/structure/remove_multi_edges.cu b/cpp/src/structure/remove_multi_edges.cu index ba07d068c0e..54403f0b034 100644 --- a/cpp/src/structure/remove_multi_edges.cu +++ b/cpp/src/structure/remove_multi_edges.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); template std::tuple, rmm::device_uvector, @@ -39,7 +40,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); template std::tuple, rmm::device_uvector, @@ -51,7 +53,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); template std::tuple, rmm::device_uvector, @@ -63,7 +66,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); template std::tuple, rmm::device_uvector, @@ -75,7 +79,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); template std::tuple, rmm::device_uvector, @@ -87,6 +92,7 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types); + 
std::optional>&& edgelist_edge_types, + bool keep_min_value_edge); } // namespace cugraph diff --git a/cpp/src/structure/remove_multi_edges_impl.cuh b/cpp/src/structure/remove_multi_edges_impl.cuh index ab6b1fba8eb..651876ac8b1 100644 --- a/cpp/src/structure/remove_multi_edges_impl.cuh +++ b/cpp/src/structure/remove_multi_edges_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -104,10 +104,12 @@ group_multi_edges( rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& edgelist_values, - size_t mem_frugal_threshold) + size_t mem_frugal_threshold, + bool keep_min_value_edge) { auto pair_first = thrust::make_zip_iterator(edgelist_srcs.begin(), edgelist_dsts.begin()); auto value_first = get_dataframe_buffer_begin(edgelist_values); + auto edge_first = thrust::make_zip_iterator(pair_first, value_first); if (edgelist_srcs.size() > mem_frugal_threshold) { // FIXME: Tuning parameter to address high frequency multi-edges @@ -128,19 +130,28 @@ group_multi_edges( raft::update_host( h_group_counts.data(), group_counts.data(), group_counts.size(), handle.get_stream()); - thrust::sort_by_key(handle.get_thrust_policy(), - pair_first, - pair_first + h_group_counts[0], - get_dataframe_buffer_begin(edgelist_values)); - thrust::sort_by_key(handle.get_thrust_policy(), - pair_first + h_group_counts[0], - pair_first + edgelist_srcs.size(), - get_dataframe_buffer_begin(edgelist_values) + h_group_counts[0]); + if (keep_min_value_edge) { + thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + h_group_counts[0]); + thrust::sort(handle.get_thrust_policy(), + edge_first + h_group_counts[0], + edge_first + edgelist_srcs.size()); + } else { + thrust::sort_by_key( + handle.get_thrust_policy(), pair_first, pair_first + h_group_counts[0], value_first); + thrust::sort_by_key(handle.get_thrust_policy(), + pair_first + h_group_counts[0], + pair_first + edgelist_srcs.size(), + value_first + h_group_counts[0]); + } } else { - thrust::sort_by_key(handle.get_thrust_policy(), - pair_first, - pair_first + edgelist_srcs.size(), - get_dataframe_buffer_begin(edgelist_values)); + if (keep_min_value_edge) { + thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + edgelist_srcs.size()); + } else { + thrust::sort_by_key(handle.get_thrust_policy(), + pair_first, + pair_first + edgelist_srcs.size(), + get_dataframe_buffer_begin(edgelist_values)); + } } return std::make_tuple( @@ -160,7 +171,8 @@ remove_multi_edges(raft::handle_t const& handle, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types) + std::optional>&& edgelist_edge_types, + bool keep_min_value_edge) { auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; @@ -187,7 +199,8 @@ remove_multi_edges(raft::handle_t const& handle, std::make_tuple(std::move(*edgelist_weights), std::move(*edgelist_edge_ids), std::move(*edgelist_edge_types)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } else { std::forward_as_tuple( edgelist_srcs, edgelist_dsts, std::tie(edgelist_weights, edgelist_edge_ids)) = @@ -196,7 +209,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), 
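// Illustrative sketch, not part of the patch: what the new keep_min_value_edge
// flag changes in group_multi_edges(). With the flag set, whole
// (src, dst, value) tuples are sorted, so the surviving copy of each duplicate
// (src, dst) pair is deterministically the one with the smallest value;
// without it, only the (src, dst) key is sorted and an arbitrary copy survives
// (cheaper). Host version of the two policies:
#include <algorithm>
#include <cstdint>
#include <tuple>
#include <vector>

void dedup_multi_edges(std::vector<std::tuple<int32_t, int32_t, float>>& edges,
                       bool keep_min_value_edge)
{
  if (keep_min_value_edge) {
    std::sort(edges.begin(), edges.end());  // sorts by (src, dst, value)
  } else {
    std::sort(edges.begin(), edges.end(), [](auto const& a, auto const& b) {  // key only
      return std::make_pair(std::get<0>(a), std::get<1>(a)) <
             std::make_pair(std::get<0>(b), std::get<1>(b));
    });
  }
  edges.erase(std::unique(edges.begin(),
                          edges.end(),
                          [](auto const& a, auto const& b) {
                            return std::get<0>(a) == std::get<0>(b) &&
                                   std::get<1>(a) == std::get<1>(b);
                          }),
              edges.end());
}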
std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_weights), std::move(*edgelist_edge_ids)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } } else { if (edgelist_edge_types) { @@ -207,7 +221,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_weights), std::move(*edgelist_edge_types)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } else { std::forward_as_tuple(edgelist_srcs, edgelist_dsts, std::tie(edgelist_weights)) = detail::group_multi_edges>( @@ -215,7 +230,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_weights)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } } } else { @@ -228,7 +244,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_edge_ids), std::move(*edgelist_edge_types)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } else { std::forward_as_tuple(edgelist_srcs, edgelist_dsts, std::tie(edgelist_edge_ids)) = detail::group_multi_edges>( @@ -236,7 +253,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_edge_ids)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } } else { if (edgelist_edge_types) { @@ -246,7 +264,8 @@ remove_multi_edges(raft::handle_t const& handle, std::move(edgelist_srcs), std::move(edgelist_dsts), std::make_tuple(std::move(*edgelist_edge_types)), - mem_frugal_threshold); + mem_frugal_threshold, + keep_min_value_edge); } else { std::tie(edgelist_srcs, edgelist_dsts) = detail::group_multi_edges( handle, std::move(edgelist_srcs), std::move(edgelist_dsts), mem_frugal_threshold); @@ -254,50 +273,47 @@ remove_multi_edges(raft::handle_t const& handle, } } - auto [multi_edge_count, multi_edges_to_delete] = - detail::mark_entries(handle, - edgelist_srcs.size(), - [d_edgelist_srcs = edgelist_srcs.data(), - d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) { - return (idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) && - (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx]); - }); - - if (multi_edge_count > 0) { - edgelist_srcs = detail::remove_flagged_elements( + auto [keep_count, keep_flags] = detail::mark_entries( + handle, + edgelist_srcs.size(), + [d_edgelist_srcs = edgelist_srcs.data(), + d_edgelist_dsts = edgelist_dsts.data()] __device__(auto idx) { + return !((idx > 0) && (d_edgelist_srcs[idx - 1] == d_edgelist_srcs[idx]) && + (d_edgelist_dsts[idx - 1] == d_edgelist_dsts[idx])); + }); + + if (keep_count < edgelist_srcs.size()) { + edgelist_srcs = detail::keep_flagged_elements( handle, std::move(edgelist_srcs), - raft::device_span{multi_edges_to_delete.data(), multi_edges_to_delete.size()}, - multi_edge_count); - edgelist_dsts = detail::remove_flagged_elements( + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); + edgelist_dsts = detail::keep_flagged_elements( handle, std::move(edgelist_dsts), - raft::device_span{multi_edges_to_delete.data(), multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_weights) - edgelist_weights = detail::remove_flagged_elements( + edgelist_weights = detail::keep_flagged_elements( handle, 
std::move(*edgelist_weights), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_ids) - edgelist_edge_ids = detail::remove_flagged_elements( + edgelist_edge_ids = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_ids), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_types) - edgelist_edge_types = detail::remove_flagged_elements( + edgelist_edge_types = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_types), - raft::device_span{multi_edges_to_delete.data(), - multi_edges_to_delete.size()}, - multi_edge_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); } return std::make_tuple(std::move(edgelist_srcs), diff --git a/cpp/src/structure/remove_self_loops_impl.cuh b/cpp/src/structure/remove_self_loops_impl.cuh index 161ffeae28e..dafe26cd1c5 100644 --- a/cpp/src/structure/remove_self_loops_impl.cuh +++ b/cpp/src/structure/remove_self_loops_impl.cuh @@ -44,44 +44,44 @@ remove_self_loops(raft::handle_t const& handle, std::optional>&& edgelist_edge_ids, std::optional>&& edgelist_edge_types) { - auto [self_loop_count, self_loops_to_delete] = + auto [keep_count, keep_flags] = detail::mark_entries(handle, edgelist_srcs.size(), [d_srcs = edgelist_srcs.data(), d_dsts = edgelist_dsts.data()] __device__( - size_t i) { return d_srcs[i] == d_dsts[i]; }); + size_t i) { return d_srcs[i] != d_dsts[i]; }); - if (self_loop_count > 0) { - edgelist_srcs = detail::remove_flagged_elements( + if (keep_count < edgelist_srcs.size()) { + edgelist_srcs = detail::keep_flagged_elements( handle, std::move(edgelist_srcs), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); - edgelist_dsts = detail::remove_flagged_elements( + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); + edgelist_dsts = detail::keep_flagged_elements( handle, std::move(edgelist_dsts), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_weights) - edgelist_weights = detail::remove_flagged_elements( + edgelist_weights = detail::keep_flagged_elements( handle, std::move(*edgelist_weights), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_ids) - edgelist_edge_ids = detail::remove_flagged_elements( + edgelist_edge_ids = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_ids), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); if (edgelist_edge_types) - edgelist_edge_types = detail::remove_flagged_elements( + edgelist_edge_types = detail::keep_flagged_elements( handle, std::move(*edgelist_edge_types), - raft::device_span{self_loops_to_delete.data(), self_loops_to_delete.size()}, - self_loop_count); + raft::device_span{keep_flags.data(), keep_flags.size()}, + keep_count); } return std::make_tuple(std::move(edgelist_srcs), diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 
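// Illustrative sketch, not part of the patch: the keep-flagged pattern that
// remove_multi_edges() and remove_self_loops() now share. The assumed
// semantics of detail::copy_if_mask_set are shown here with a plain
// thrust::copy_if: element i survives iff bit i of the packed keep_flags is
// set. Compile with `nvcc --extended-lambda`.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <cstddef>
#include <cstdint>

template <typename T>
thrust::device_vector<T> keep_flagged(thrust::device_vector<T> const& v,
                                      thrust::device_vector<uint32_t> const& keep_flags,
                                      size_t keep_count)
{
  thrust::device_vector<T> result(keep_count);
  uint32_t const* flags = keep_flags.data().get();
  thrust::copy_if(v.begin(),
                  v.end(),
                  thrust::make_counting_iterator(size_t{0}),  // stencil: element index
                  result.begin(),
                  [flags] __device__(size_t i) {
                    return (flags[i / 32] & (uint32_t{1} << (i % 32))) != 0;
                  });
  return result;
}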
6a0c5a4a675..58fae83bca0 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -432,7 +432,7 @@ rmm::device_uvector od_shortest_distances( // 1. check input arguments auto const num_vertices = graph_view.number_of_vertices(); - auto const num_edges = graph_view.number_of_edges(); + auto const num_edges = graph_view.compute_number_of_edges(handle); CUGRAPH_EXPECTS(num_vertices != 0 || (origins.size() == 0 && destinations.size() == 0), "Invalid input argument: the input graph is empty but origins.size() > 0 or " @@ -639,13 +639,14 @@ rmm::device_uvector od_shortest_distances( static_cast(origins.size()), cutoff, invalid_distance}; - detail::call_e_op_t, - weight_t, - vertex_t, - thrust::nullopt_t, - thrust::nullopt_t, - weight_t, - e_op_t> + detail::transform_reduce_v_frontier_call_e_op_t< + thrust::tuple, + weight_t, + vertex_t, + thrust::nullopt_t, + thrust::nullopt_t, + weight_t, + e_op_t> e_op_wrapper{e_op}; auto new_frontier_tagged_vertex_buffer = @@ -1049,7 +1050,7 @@ rmm::device_uvector od_shortest_distances( CUGRAPH_EXPECTS(!graph_view.has_edge_mask(), "unimplemented."); auto const num_vertices = graph_view.number_of_vertices(); - auto const num_edges = graph_view.number_of_edges(); + auto const num_edges = graph_view.compute_number_of_edges(handle); weight_t average_vertex_degree = static_cast(num_edges) / static_cast(num_vertices); diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index c78fa3839e2..5a6d536c6f5 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -93,7 +93,7 @@ void sssp(raft::handle_t const& handle, "GraphViewType should support the push model."); auto const num_vertices = push_graph_view.number_of_vertices(); - auto const num_edges = push_graph_view.number_of_edges(); + auto const num_edges = push_graph_view.compute_number_of_edges(handle); if (num_vertices == 0) { return; } // implements the Near-Far Pile method in diff --git a/cpp/src/utilities/cugraph_ops_utils.hpp b/cpp/src/utilities/cugraph_ops_utils.hpp index 9aea4183866..880a2c8d104 100644 --- a/cpp/src/utilities/cugraph_ops_utils.hpp +++ b/cpp/src/utilities/cugraph_ops_utils.hpp @@ -30,7 +30,7 @@ ops::graph::csc get_graph( ops::graph::csc graph; graph.n_src_nodes = gview.number_of_vertices(); graph.n_dst_nodes = gview.number_of_vertices(); - graph.n_indices = gview.number_of_edges(); + graph.n_indices = gview.local_edge_partition_view().number_of_edges(); // FIXME this is sufficient for now, but if there is a fast (cached) way // of getting max degree, use that instead graph.dst_max_in_degree = std::numeric_limits::max(); diff --git a/cpp/src/utilities/error_check_utils.cuh b/cpp/src/utilities/error_check_utils.cuh new file mode 100644 index 00000000000..baaf513d93d --- /dev/null +++ b/cpp/src/utilities/error_check_utils.cuh @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cugraph { +namespace detail { + +// check vertices in the pair are in [0, num_vertices) and belongs to one of the local edge +// partitions. +template +struct is_invalid_input_vertex_pair_t { + vertex_t num_vertices{}; + raft::device_span edge_partition_major_range_firsts{}; + raft::device_span edge_partition_major_range_lasts{}; + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + + __device__ bool operator()(thrust::tuple pair) const + { + auto major = thrust::get<0>(pair); + auto minor = thrust::get<1>(pair); + if (!is_valid_vertex(num_vertices, major) || !is_valid_vertex(num_vertices, minor)) { + return true; + } + auto it = thrust::upper_bound(thrust::seq, + edge_partition_major_range_lasts.begin(), + edge_partition_major_range_lasts.end(), + major); + if (it == edge_partition_major_range_lasts.end()) { return true; } + auto edge_partition_idx = + static_cast(thrust::distance(edge_partition_major_range_lasts.begin(), it)); + if (major < edge_partition_major_range_firsts[edge_partition_idx]) { return true; } + return (minor < edge_partition_minor_range_first) || (minor >= edge_partition_minor_range_last); + } +}; + +template +size_t count_invalid_vertex_pairs(raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexPairIterator vertex_pair_first, + VertexPairIterator vertex_pair_last) +{ + using vertex_t = typename GraphViewType::vertex_type; + + std::vector h_edge_partition_major_range_firsts( + graph_view.number_of_local_edge_partitions()); + std::vector h_edge_partition_major_range_lasts( + h_edge_partition_major_range_firsts.size()); + vertex_t edge_partition_minor_range_first{}; + vertex_t edge_partition_minor_range_last{}; + if constexpr (GraphViewType::is_multi_gpu) { + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i++) { + if constexpr (GraphViewType::is_storage_transposed) { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_dst_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_dst_range_last(i); + } else { + h_edge_partition_major_range_firsts[i] = graph_view.local_edge_partition_src_range_first(i); + h_edge_partition_major_range_lasts[i] = graph_view.local_edge_partition_src_range_last(i); + } + } + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_minor_range_first = graph_view.local_edge_partition_src_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_src_range_last(); + } else { + edge_partition_minor_range_first = graph_view.local_edge_partition_dst_range_first(); + edge_partition_minor_range_last = graph_view.local_edge_partition_dst_range_last(); + } + } else { + h_edge_partition_major_range_firsts[0] = vertex_t{0}; + h_edge_partition_major_range_lasts[0] = graph_view.number_of_vertices(); + edge_partition_minor_range_first = vertex_t{0}; + edge_partition_minor_range_last = graph_view.number_of_vertices(); + } + 
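// Illustrative sketch, not part of the patch: the validity rule that
// is_invalid_input_vertex_pair_t (above) applies to each (major, minor) pair,
// negated and on the host for clarity. A major is locally valid iff it falls
// inside one of this rank's edge-partition major ranges (located via
// upper_bound on the sorted range-last array); a minor must lie in the single
// local minor range; and both must be valid vertex IDs.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

bool is_valid_pair(std::vector<int32_t> const& major_range_firsts,
                   std::vector<int32_t> const& major_range_lasts,  // sorted
                   int32_t minor_range_first,
                   int32_t minor_range_last,
                   int32_t num_vertices,
                   int32_t major,
                   int32_t minor)
{
  if (major < 0 || major >= num_vertices || minor < 0 || minor >= num_vertices) { return false; }
  auto it = std::upper_bound(major_range_lasts.begin(), major_range_lasts.end(), major);
  if (it == major_range_lasts.end()) { return false; }
  auto idx = static_cast<size_t>(it - major_range_lasts.begin());
  if (major < major_range_firsts[idx]) { return false; }
  return (minor >= minor_range_first) && (minor < minor_range_last);
}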
+  rmm::device_uvector<vertex_t> d_edge_partition_major_range_firsts(
+    h_edge_partition_major_range_firsts.size(), handle.get_stream());
+  rmm::device_uvector<vertex_t> d_edge_partition_major_range_lasts(
+    h_edge_partition_major_range_lasts.size(), handle.get_stream());
+  raft::update_device(d_edge_partition_major_range_firsts.data(),
+                      h_edge_partition_major_range_firsts.data(),
+                      h_edge_partition_major_range_firsts.size(),
+                      handle.get_stream());
+  raft::update_device(d_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.data(),
+                      h_edge_partition_major_range_lasts.size(),
+                      handle.get_stream());
+
+  auto num_invalid_pairs = thrust::count_if(
+    handle.get_thrust_policy(),
+    vertex_pair_first,
+    vertex_pair_last,
+    is_invalid_input_vertex_pair_t<vertex_t>{
+      graph_view.number_of_vertices(),
+      raft::device_span<vertex_t const>(d_edge_partition_major_range_firsts.begin(),
+                                        d_edge_partition_major_range_firsts.end()),
+      raft::device_span<vertex_t const>(d_edge_partition_major_range_lasts.begin(),
+                                        d_edge_partition_major_range_lasts.end()),
+      edge_partition_minor_range_first,
+      edge_partition_minor_range_last});
+  if constexpr (GraphViewType::is_multi_gpu) {
+    auto& comm = handle.get_comms();
+    num_invalid_pairs =
+      host_scalar_allreduce(comm, num_invalid_pairs, raft::comms::op_t::SUM, handle.get_stream());
+  }
+
+  return num_invalid_pairs;
+}
+
+}  // namespace detail
+}  // namespace cugraph
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 6530a25d178..3df979fe5c2 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -257,7 +257,7 @@ ConfigureTest(BALANCED_TEST community/balanced_edge_test.cpp) ################################################################################################### # - EGO tests ------------------------------------------------------------------------------------- -ConfigureTest(EGO_TEST community/egonet_test.cpp GPUS 1 PERCENT 75) +ConfigureTest(EGONET_TEST community/egonet_test.cpp GPUS 1 PERCENT 75) ################################################################################################### # - FORCE ATLAS 2 tests -------------------------------------------------------------------------- @@ -313,6 +313,11 @@ ConfigureTest(DEGREE_TEST structure/degree_test.cpp) ConfigureTest(COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/count_self_loops_and_multi_edges_test.cpp") +################################################################################################### +# - Query edge existence and multiplicity tests --------------------------------------------------- +ConfigureTest(HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/has_edge_and_compute_multiplicity_test.cpp") + ################################################################################################### # - Coarsening tests ------------------------------------------------------------------------------ ConfigureTest(COARSEN_GRAPH_TEST structure/coarsen_graph_test.cpp) @@ -479,6 +484,11 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTestMG(MG_COUNT_SELF_LOOPS_AND_MULTI_EDGES_TEST "structure/mg_count_self_loops_and_multi_edges_test.cpp") + ############################################################################################### + # - MG Query edge existence and multiplicity tests -------------------------------------------- + ConfigureTestMG(MG_HAS_EDGE_AND_COMPUTE_MULTIPLICITY_TEST + "structure/mg_has_edge_and_compute_multiplicity_test.cpp") + ############################################################################################### # - MG PAGERANK tests ------------------------------------------------------------------------- ConfigureTestMG(MG_PAGERANK_TEST link_analysis/mg_pagerank_test.cpp) @@ -521,6 +531,10 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LEIDEN tests -------------------------------------------------------------------------- ConfigureTestMG(MG_LEIDEN_TEST community/mg_leiden_test.cpp) + ############################################################################################### + # - MG ECG tests -------------------------------------------------------------------------- + ConfigureTestMG(MG_ECG_TEST community/mg_ecg_test.cpp) + ############################################################################################### # - MG MIS tests ------------------------------------------------------------------------------ ConfigureTestMG(MG_MIS_TEST community/mg_mis_test.cu) @@ -531,7 +545,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### # - MG LOUVAIN tests -------------------------------------------------------------------------- - ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cu) + ConfigureTestMG(MG_EGONET_TEST community/mg_egonet_test.cu) ############################################################################################### # - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------ @@ -738,9 +752,16 @@ if (BUILD_CUGRAPH_MTMG_TESTS) # - MTMG tests ------------------------------------------------------------------------- ConfigureTest(MTMG_TEST 
mtmg/threaded_test.cu) target_link_libraries(MTMG_TEST - PRIVATE - UCP::UCP - ) + PRIVATE + UCP::UCP + ) + + ConfigureTest(MTMG_LOUVAIN_TEST mtmg/threaded_test_louvain.cu) + target_link_libraries(MTMG_LOUVAIN_TEST + PRIVATE + cugraphmgtestutil + UCP::UCP + ) if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### diff --git a/cpp/tests/c_api/hits_test.c b/cpp/tests/c_api/hits_test.c index c275d883d11..1ebd4f82a51 100644 --- a/cpp/tests/c_api/hits_test.c +++ b/cpp/tests/c_api/hits_test.c @@ -163,7 +163,7 @@ int test_hits() weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -195,7 +195,7 @@ int test_hits_with_transpose() weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE @@ -232,7 +232,7 @@ int test_hits_with_initial() vertex_t h_initial_vertices[] = {0, 1, 2, 3, 4}; weight_t h_initial_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; return generic_hits_test(h_src, diff --git a/cpp/tests/c_api/mg_create_graph_test.c b/cpp/tests/c_api/mg_create_graph_test.c index fec319d1881..7156647b025 100644 --- a/cpp/tests/c_api/mg_create_graph_test.c +++ b/cpp/tests/c_api/mg_create_graph_test.c @@ -175,18 +175,18 @@ int test_create_mg_graph_multiple_edge_lists(const cugraph_resource_handle_t* ha int my_rank = cugraph_resource_handle_get_rank(handle); int comm_size = cugraph_resource_handle_get_comm_size(handle); - size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size; + size_t local_num_vertices = num_vertices / comm_size; size_t local_start_vertex = my_rank * local_num_vertices; - size_t local_num_edges = (num_edges + comm_size - 1) / comm_size; + size_t local_num_edges = num_edges / comm_size; size_t local_start_edge = my_rank * local_num_edges; - local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge); - local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? local_num_vertices : (num_vertices - local_start_vertex); + local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge); + local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex); for (size_t i = 0 ; i < num_local_arrays ; ++i) { - size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays; + size_t vertex_count = local_num_vertices / num_local_arrays; size_t vertex_start = i * vertex_count; - vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start); + vertex_count = (i != (num_local_arrays - 1)) ? 
vertex_count : (local_num_vertices - vertex_start); ret_code = cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error); @@ -363,18 +363,18 @@ int test_create_mg_graph_multiple_edge_lists_multi_edge(const cugraph_resource_h int my_rank = cugraph_resource_handle_get_rank(handle); int comm_size = cugraph_resource_handle_get_comm_size(handle); - size_t local_num_vertices = (num_vertices + comm_size - 1) / comm_size; + size_t local_num_vertices = num_vertices / comm_size; size_t local_start_vertex = my_rank * local_num_vertices; - size_t local_num_edges = (num_edges + comm_size - 1) / comm_size; + size_t local_num_edges = num_edges / comm_size; size_t local_start_edge = my_rank * local_num_edges; - local_num_edges = (local_num_edges < (num_edges - local_start_edge)) ? local_num_edges : (num_edges - local_start_edge); - local_num_vertices = (local_num_vertices < (num_vertices - local_start_vertex)) ? local_num_vertices : (num_vertices - local_start_vertex); + local_num_edges = (my_rank != (comm_size - 1)) ? local_num_edges : (num_edges - local_start_edge); + local_num_vertices = (my_rank != (comm_size - 1)) ? local_num_vertices : (num_vertices - local_start_vertex); for (size_t i = 0 ; i < num_local_arrays ; ++i) { size_t vertex_count = (local_num_vertices + num_local_arrays - 1) / num_local_arrays; size_t vertex_start = i * vertex_count; - vertex_count = (vertex_count < (local_num_vertices - vertex_start)) ? vertex_count : (local_num_vertices - vertex_start); + vertex_count = (i != (num_local_arrays - 1)) ? vertex_count : (local_num_vertices - vertex_start); ret_code = cugraph_type_erased_device_array_create(handle, vertex_count, vertex_tid, vertices + i, &ret_error); diff --git a/cpp/tests/c_api/mg_hits_test.c b/cpp/tests/c_api/mg_hits_test.c index 87371093613..3e10bfc05d6 100644 --- a/cpp/tests/c_api/mg_hits_test.c +++ b/cpp/tests/c_api/mg_hits_test.c @@ -171,7 +171,7 @@ int test_hits(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // hits wants store_transposed = TRUE @@ -203,7 +203,7 @@ int test_hits_with_transpose(const cugraph_resource_handle_t* handle) weight_t h_hubs[] = {0.347296, 0.532089, 1, 0.00003608, 0.00003608, 0}; weight_t h_authorities[] = {0.652703, 0.879385, 0, 1, 0.347296, 0.00009136}; - double epsilon = 0.0001; + double epsilon = 0.00002; size_t max_iterations = 20; // Hits wants store_transposed = TRUE diff --git a/cpp/tests/community/ecg_test.cpp b/cpp/tests/community/ecg_test.cpp index 3c4c9bc9c12..262e2bd23af 100644 --- a/cpp/tests/community/ecg_test.cpp +++ b/cpp/tests/community/ecg_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved. 
* * NVIDIA CORPORATION and its licensors retain all intellectual property * and proprietary rights in and to this software, related documentation @@ -121,41 +121,29 @@ TEST(ecg, dolphin) cugraph::legacy::GraphCSRView graph_csr( offsets_v.data(), indices_v.data(), weights_v.data(), num_verts, num_edges); - // "FIXME": remove this check once we drop support for Pascal - // - // Calling louvain on Pascal will throw an exception, we'll check that - // this is the behavior while we still support Pascal (device_prop.major < 7) - // - if (handle.get_device_properties().major < 7) { - EXPECT_THROW( - (cugraph::ecg(handle, graph_csr, .05, 16, result_v.data())), - cugraph::logic_error); - } else { - cugraph::ecg(handle, graph_csr, .05, 16, result_v.data()); + cugraph::ecg(handle, graph_csr, .05, 16, result_v.data()); - auto cluster_id = cugraph::test::to_host(handle, result_v); + auto cluster_id = cugraph::test::to_host(handle, result_v); - int max = *max_element(cluster_id.begin(), cluster_id.end()); - int min = *min_element(cluster_id.begin(), cluster_id.end()); + int max = *max_element(cluster_id.begin(), cluster_id.end()); + int min = *min_element(cluster_id.begin(), cluster_id.end()); - ASSERT_EQ((min >= 0), 1); + ASSERT_EQ((min >= 0), 1); - std::set cluster_ids; - for (auto c : cluster_id) { - cluster_ids.insert(c); - } + std::set cluster_ids; + for (auto c : cluster_id) { + cluster_ids.insert(c); + } - ASSERT_EQ(cluster_ids.size(), size_t(max + 1)); + ASSERT_EQ(cluster_ids.size(), size_t(max + 1)); - float modularity{0.0}; + float modularity{0.0}; - cugraph::ext_raft::analyzeClustering_modularity( - graph_csr, max + 1, result_v.data(), &modularity); + cugraph::ext_raft::analyzeClustering_modularity(graph_csr, max + 1, result_v.data(), &modularity); - float random_modularity{0.95 * 0.4962422251701355}; + float random_modularity{0.95 * 0.4962422251701355}; - ASSERT_GT(modularity, random_modularity); - } + ASSERT_GT(modularity, random_modularity); } CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/community/leiden_test.cpp b/cpp/tests/community/leiden_test.cpp index 656e855057f..36e850683bd 100644 --- a/cpp/tests/community/leiden_test.cpp +++ b/cpp/tests/community/leiden_test.cpp @@ -79,39 +79,19 @@ class Tests_Leiden : public ::testing::TestWithParamview(); - // "FIXME": remove this check once we drop support for Pascal - // - // Calling louvain on Pascal will throw an exception, we'll check that - // this is the behavior while we still support Pascal (device_prop.major < 7) - // - cudaDeviceProp device_prop; - RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0)); - - if (device_prop.major < 7) { - EXPECT_THROW(louvain_legacy(graph_view, - graph_view.get_number_of_vertices(), - louvain_usecase.check_correctness_, - louvain_usecase.expected_level_, - louvain_usecase.expected_modularity_), - cugraph::logic_error); - } else { - louvain_legacy(graph_view, - graph_view.get_number_of_vertices(), - louvain_usecase.check_correctness_, - louvain_usecase.expected_level_, - louvain_usecase.expected_modularity_); - } + louvain_legacy(graph_view, + graph_view.get_number_of_vertices(), + louvain_usecase.check_correctness_, + louvain_usecase.expected_level_, + louvain_usecase.expected_modularity_); } template @@ -124,41 +107,20 @@ class Tests_Louvain auto edge_weight_view = edge_weights ? 
std::make_optional((*edge_weights).view()) : std::nullopt; - // "FIXME": remove this check once we drop support for Pascal - // - // Calling louvain on Pascal will throw an exception, we'll check that - // this is the behavior while we still support Pascal (device_prop.major < 7) - // - cudaDeviceProp device_prop; - RAFT_CUDA_TRY(cudaGetDeviceProperties(&device_prop, 0)); - if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement hr_timer.start("Louvain"); } - if (device_prop.major < 7) { - EXPECT_THROW(louvain(graph_view, - edge_weight_view, - graph_view.local_vertex_partition_range_size(), - louvain_usecase.max_level_, - louvain_usecase.threshold_, - louvain_usecase.resolution_, - louvain_usecase.check_correctness_, - louvain_usecase.expected_level_, - louvain_usecase.expected_modularity_), - cugraph::logic_error); - } else { - louvain(graph_view, - edge_weight_view, - graph_view.local_vertex_partition_range_size(), - louvain_usecase.max_level_, - louvain_usecase.threshold_, - louvain_usecase.resolution_, - louvain_usecase.check_correctness_, - louvain_usecase.expected_level_, - louvain_usecase.expected_modularity_); - } + louvain(graph_view, + edge_weight_view, + graph_view.local_vertex_partition_range_size(), + louvain_usecase.max_level_, + louvain_usecase.threshold_, + louvain_usecase.resolution_, + louvain_usecase.check_correctness_, + louvain_usecase.expected_level_, + louvain_usecase.expected_modularity_); if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement @@ -212,27 +174,39 @@ class Tests_Louvain weight_t modularity; if (resolution) { - std::tie(level, modularity) = - cugraph::louvain(handle, - graph_view, - edge_weight_view, - clustering_v.data(), - max_level ? *max_level : size_t{100}, - threshold ? static_cast(*threshold) : weight_t{1e-7}, - static_cast(*resolution)); + std::tie(level, modularity) = cugraph::louvain( + handle, + std::optional>{std::nullopt}, + graph_view, + edge_weight_view, + clustering_v.data(), + max_level ? *max_level : size_t{100}, + threshold ? static_cast(*threshold) : weight_t{1e-7}, + static_cast(*resolution)); } else if (threshold) { - std::tie(level, modularity) = cugraph::louvain(handle, - graph_view, - edge_weight_view, - clustering_v.data(), - max_level ? *max_level : size_t{100}, - static_cast(*threshold)); + std::tie(level, modularity) = cugraph::louvain( + handle, + std::optional>{std::nullopt}, + graph_view, + edge_weight_view, + clustering_v.data(), + max_level ? *max_level : size_t{100}, + static_cast(*threshold)); } else if (max_level) { - std::tie(level, modularity) = - cugraph::louvain(handle, graph_view, edge_weight_view, clustering_v.data(), *max_level); + std::tie(level, modularity) = cugraph::louvain( + handle, + std::optional>{std::nullopt}, + graph_view, + edge_weight_view, + clustering_v.data(), + *max_level); } else { - std::tie(level, modularity) = - cugraph::louvain(handle, graph_view, edge_weight_view, clustering_v.data()); + std::tie(level, modularity) = cugraph::louvain( + handle, + std::optional>{std::nullopt}, + graph_view, + edge_weight_view, + clustering_v.data()); } RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement diff --git a/cpp/tests/community/mg_ecg_test.cpp b/cpp/tests/community/mg_ecg_test.cpp new file mode 100644 index 00000000000..81cee1370f0 --- /dev/null +++ b/cpp/tests/community/mg_ecg_test.cpp @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Test param object. This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_SUITE_P() +// +struct Ecg_Usecase { + double min_weight_{0.1}; + size_t ensemble_size_{10}; + size_t max_level_{100}; + double threshold_{1e-7}; + double resolution_{1.0}; + bool check_correctness_{true}; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +template +class Tests_MGEcg : public ::testing::TestWithParam> { + public: + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + // Run once for each test instance + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(std::tuple const& param) + { + auto [ecg_usecase, input_usecase] = param; + + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + auto [mg_graph, mg_edge_weights, d_renumber_map_labels] = + cugraph::test::construct_graph( + *handle_, input_usecase, true, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + auto mg_edge_weight_view = + mg_edge_weights ? 
+      std::make_optional((*mg_edge_weights).view()) : std::nullopt;
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.start("MG ECG");
+    }
+
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    raft::random::RngState rng_state(seed);
+
+    cugraph::ecg(*handle_,
+                 rng_state,
+                 mg_graph_view,
+                 mg_edge_weight_view,
+                 ecg_usecase.min_weight_,
+                 ecg_usecase.ensemble_size_,
+                 ecg_usecase.max_level_,
+                 ecg_usecase.threshold_,
+                 ecg_usecase.resolution_);
+
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle_->get_comms().barrier();
+      hr_timer.stop();
+      hr_timer.display_and_clear(std::cout);
+    }
+    // Louvain and detail::permute_range are both tested; here we only make
+    // sure that the SG and MG ECG calls work as expected.
+
+    cugraph::graph_t<vertex_t, edge_t, false, false> sg_graph(*handle_);
+    std::optional<
+      cugraph::edge_property_t<cugraph::graph_view_t<vertex_t, edge_t, false, false>, weight_t>>
+      sg_edge_weights{std::nullopt};
+    std::tie(sg_graph, sg_edge_weights, std::ignore) = cugraph::test::mg_graph_to_sg_graph(
+      *handle_,
+      mg_graph_view,
+      mg_edge_weight_view,
+      std::optional<raft::device_span<vertex_t const>>{std::nullopt},
+      false);  // create an SG graph with MG graph vertex IDs
+
+    auto const comm_rank = handle_->get_comms().get_rank();
+    if (comm_rank == 0) {
+      auto sg_graph_view = sg_graph.view();
+      auto sg_edge_weight_view =
+        sg_edge_weights ? std::make_optional((*sg_edge_weights).view()) : std::nullopt;
+
+      cugraph::ecg(*handle_,
+                   rng_state,
+                   sg_graph_view,
+                   sg_edge_weight_view,
+                   ecg_usecase.min_weight_,
+                   ecg_usecase.ensemble_size_,
+                   ecg_usecase.max_level_,
+                   ecg_usecase.threshold_,
+                   ecg_usecase.resolution_);
+    }
+  }
+
+ private:
+  static std::unique_ptr<raft::handle_t> handle_;
+};
+
+template <typename input_usecase_t>
+std::unique_ptr<raft::handle_t> Tests_MGEcg<input_usecase_t>::handle_ = nullptr;
+
+using Tests_MGEcg_File = Tests_MGEcg<cugraph::test::File_Usecase>;
+using Tests_MGEcg_Rmat = Tests_MGEcg<cugraph::test::Rmat_Usecase>;
+
+TEST_P(Tests_MGEcg_File, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGEcg_File, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_File_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGEcg_Rmat, CheckInt32Int32Float)
+{
+  run_current_test<int32_t, int32_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGEcg_Rmat, CheckInt32Int64Float)
+{
+  run_current_test<int32_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+TEST_P(Tests_MGEcg_Rmat, CheckInt64Int64Float)
+{
+  run_current_test<int64_t, int64_t, float>(
+    override_Rmat_Usecase_with_cmd_line_arguments(GetParam()));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+  file_tests,
+  Tests_MGEcg_File,
+  ::testing::Combine(
+    // enable correctness checks for small graphs
+    ::testing::Values(Ecg_Usecase{0.1, 10, 100, 1e-7, 1.0, true}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_small_tests,
+  Tests_MGEcg_Rmat,
+  ::testing::Combine(
+    ::testing::Values(Ecg_Usecase{0.1, 10, 100, 1e-7, 1.0, true}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false))));
+
+INSTANTIATE_TEST_SUITE_P(
+  file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with
+                          --gtest_filter to select only the file_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one File_Usecase that differ only in filename
+                          (to avoid running same benchmarks more than once) */
+  Tests_MGEcg_File,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Ecg_Usecase{0.1, 10, 100, 1e-7, 1.0, false}),
+    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
+
+INSTANTIATE_TEST_SUITE_P(
+  rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
+                          --gtest_filter to select only the rmat_benchmark_test with a specific
+                          vertex & edge type combination) by command line arguments and do not
+                          include more than one Rmat_Usecase that differ only in scale or edge
+                          factor (to avoid running same benchmarks more than once) */
+  Tests_MGEcg_Rmat,
+  ::testing::Combine(
+    // disable correctness checks for large graphs
+    ::testing::Values(Ecg_Usecase{0.1, 10, 100, 1e-7, 1.0, false}),
+    ::testing::Values(cugraph::test::Rmat_Usecase(12, 32, 0.57, 0.19, 0.19, 0, true, false))));
+
+CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp
index 41339e32d77..011426606fd 100644
--- a/cpp/tests/community/mg_louvain_test.cpp
+++ b/cpp/tests/community/mg_louvain_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -126,13 +126,15 @@ class Tests_MGLouvain
     rmm::device_uvector<vertex_t> d_sg_cluster_v(sg_graph_view.number_of_vertices(),
                                                  handle_->get_stream());
 
-    std::tie(std::ignore, sg_modularity) = cugraph::louvain(handle,
-                                                            sg_graph_view,
-                                                            sg_edge_weight_view,
-                                                            d_sg_cluster_v.data(),
-                                                            size_t{1},
-                                                            threshold,
-                                                            resolution);
+    std::tie(std::ignore, sg_modularity) = cugraph::louvain(
+      handle,
+      std::optional<std::reference_wrapper<raft::random::RngState>>{std::nullopt},
+      sg_graph_view,
+      sg_edge_weight_view,
+      d_sg_cluster_v.data(),
+      size_t{1},
+      threshold,
+      resolution);
 
     EXPECT_TRUE(cugraph::test::check_invertible(
       handle,
@@ -191,6 +193,7 @@ class Tests_MGLouvain
     auto [dendrogram, mg_modularity] = cugraph::louvain(
       *handle_,
+      std::optional<std::reference_wrapper<raft::random::RngState>>{std::nullopt},
       mg_graph_view,
       mg_edge_weight_view,
       louvain_usecase.max_level_,
diff --git a/cpp/tests/community/triangle_count_test.cpp b/cpp/tests/community/triangle_count_test.cpp
index 836bab59457..592924c3c47 100644
--- a/cpp/tests/community/triangle_count_test.cpp
+++ b/cpp/tests/community/triangle_count_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -232,7 +232,7 @@ class Tests_TriangleCount for (size_t i = 0; i < h_cugraph_vertices.size(); ++i) { auto v = h_cugraph_vertices[i]; auto count = h_cugraph_triangle_counts[i]; - ASSERT_TRUE(count == h_reference_triangle_counts[v]) + ASSERT_EQ(count, h_reference_triangle_counts[v]) << "Triangle count values do not match with the reference values."; } } diff --git a/cpp/tests/link_analysis/hits_test.cpp b/cpp/tests/link_analysis/hits_test.cpp index d0e77769034..cf35356bb76 100644 --- a/cpp/tests/link_analysis/hits_test.cpp +++ b/cpp/tests/link_analysis/hits_test.cpp @@ -52,9 +52,11 @@ std::tuple, std::vector, double, size_t> hits_re size_t max_iterations, std::optional starting_hub_values, bool normalized, - double tolerance) + double epsilon) { CUGRAPH_EXPECTS(num_vertices > 1, "number of vertices expected to be non-zero"); + auto tolerance = static_cast(num_vertices) * epsilon; + std::vector prev_hubs(num_vertices, result_t{1.0} / num_vertices); std::vector prev_authorities(num_vertices, result_t{1.0} / num_vertices); std::vector curr_hubs(num_vertices); @@ -127,8 +129,8 @@ std::tuple, std::vector, double, size_t> hits_re } struct Hits_Usecase { - bool check_correctness{true}; bool check_initial_input{false}; + bool check_correctness{true}; }; template @@ -175,8 +177,8 @@ class Tests_Hits : public ::testing::TestWithParam d_hubs(graph_view.local_vertex_partition_range_size(), handle.get_stream()); @@ -201,7 +203,7 @@ class Tests_Hits : public ::testing::TestWithParam h_cugraph_hits{}; if (renumber) { @@ -246,8 +248,7 @@ class Tests_Hits : public ::testing::TestWithParam(graph_view.number_of_vertices())) * - threshold_ratio; // skip comparison for low hits vertices (lowly ranked vertices) + 1e-6; // skip comparison for low hits vertices (lowly ranked vertices) auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { return std::abs(lhs - rhs) <= std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); @@ -294,14 +295,17 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // enable correctness checks - ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}), + ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), + cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_Hits_Rmat, // enable correctness checks - ::testing::Combine(::testing::Values(Hits_Usecase{true, false}, + ::testing::Combine(::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -315,7 +319,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_File, ::testing::Combine( // disable correctness checks - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( @@ -327,7 +331,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_Hits_Rmat, // disable correctness checks for large graphs ::testing::Combine( - ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{false, true}), + ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}), 
    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_analysis/mg_hits_test.cpp b/cpp/tests/link_analysis/mg_hits_test.cpp
index cf95d03681d..5c89bafd08e 100644
--- a/cpp/tests/link_analysis/mg_hits_test.cpp
+++ b/cpp/tests/link_analysis/mg_hits_test.cpp
@@ -33,8 +33,8 @@
 #include
 
 struct Hits_Usecase {
-  bool check_correctness{true};
   bool check_initial_input{false};
+  bool check_correctness{true};
 };
 
 template <typename input_usecase_t>
@@ -81,7 +81,7 @@ class Tests_MGHits : public ::testing::TestWithParam d_mg_hubs(mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream());
@@ -110,7 +110,7 @@ class Tests_MGHits : public ::testing::TestWithParam(mg_graph_view.number_of_vertices())) *
-          threshold_ratio;  // skip comparison for low Hits verties (lowly ranked
-                            // vertices)
+        1e-6;  // skip comparison for low Hits vertices (lowly ranked vertices)
       auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) {
         return std::abs(lhs - rhs) <
                std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude);
@@ -274,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_File,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
    ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"),
                      cugraph::test::File_Usecase("test/datasets/web-Google.mtx"),
                      cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"),
@@ -285,7 +283,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // enable correctness checks
-    ::testing::Values(Hits_Usecase{true, false}, Hits_Usecase{true, true}),
+    ::testing::Values(Hits_Usecase{false, true}, Hits_Usecase{true, true}),
    ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false))));
@@ -297,7 +295,7 @@ INSTANTIATE_TEST_SUITE_P(
   Tests_MGHits_Rmat,
   ::testing::Combine(
     // disable correctness checks for large graphs
-    ::testing::Values(Hits_Usecase{false, false}),
+    ::testing::Values(Hits_Usecase{false, false}, Hits_Usecase{true, false}),
    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/link_prediction/weighted_similarity_test.cpp b/cpp/tests/link_prediction/weighted_similarity_test.cpp
index ca644b76c5a..99e752c0b02 100644
--- a/cpp/tests/link_prediction/weighted_similarity_test.cpp
+++ b/cpp/tests/link_prediction/weighted_similarity_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -27,9 +27,9 @@ struct Similarity_Usecase { bool use_weights{false}; - bool check_correctness{true}; size_t max_seeds{std::numeric_limits::max()}; size_t max_vertex_pairs_to_check{std::numeric_limits::max()}; + bool check_correctness{true}; }; template @@ -293,7 +293,7 @@ INSTANTIATE_TEST_SUITE_P( // Disable weighted computation testing in 22.10 //::testing::Values(Similarity_Usecase{true, true, 20, 100}, Similarity_Usecase{false, true, 20, //: 100}), - ::testing::Values(Similarity_Usecase{true, true, 20, 100}), + ::testing::Values(Similarity_Usecase{true, 20, 100, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/dolphins.mtx")))); @@ -305,7 +305,7 @@ INSTANTIATE_TEST_SUITE_P( // Disable weighted computation testing in 22.10 //::testing::Values(Similarity_Usecase{true, true, 20, 100}, //: Similarity_Usecase{false,true,20,100}), - ::testing::Values(Similarity_Usecase{true, true, 20, 100}), + ::testing::Values(Similarity_Usecase{true, 20, 100, true}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); INSTANTIATE_TEST_SUITE_P( @@ -319,7 +319,8 @@ INSTANTIATE_TEST_SUITE_P( // disable correctness checks // Disable weighted computation testing in 22.10 //::testing::Values(Similarity_Usecase{false, false}, Similarity_Usecase{true, false}), - ::testing::Values(Similarity_Usecase{true, true}), + ::testing::Values(Similarity_Usecase{ + true, std::numeric_limits::max(), std::numeric_limits::max(), true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); INSTANTIATE_TEST_SUITE_P( @@ -332,7 +333,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Combine( // disable correctness checks for large graphs //::testing::Values(Similarity_Usecase{false, false}, Similarity_Usecase{true, false}), - ::testing::Values(Similarity_Usecase{true, false}), + ::testing::Values(Similarity_Usecase{ + true, std::numeric_limits::max(), std::numeric_limits::max(), false}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu index e5a7de07781..17aed4fdecf 100644 --- a/cpp/tests/mtmg/multi_node_threaded_test.cu +++ b/cpp/tests/mtmg/multi_node_threaded_test.cu @@ -311,7 +311,8 @@ class Tests_Multithreaded auto d_my_pageranks = pageranks_view.gather( thread_handle, raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, - graph_view, + graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), renumber_map_view); std::vector my_pageranks(d_my_pageranks.size()); diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu index bc4d8cfef6a..a5df0199cac 100644 --- a/cpp/tests/mtmg/threaded_test.cu +++ b/cpp/tests/mtmg/threaded_test.cu @@ -155,10 +155,25 @@ class Tests_Multithreaded input_usecase.template construct_edgelist( handle, multithreaded_usecase.test_weighted, false, false); + rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin()); + thrust::copy(handle.get_thrust_policy(), + d_dst_v.begin(), + d_dst_v.end(), + d_unique_vertices.begin() + d_src_v.size()); + thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end()); + + 
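// A small illustrative trace of the dedup idiom above and below (example
// values only, not from the patch): with d_src_v = {0, 1, 1} and
// d_dst_v = {1, 2, 0}, the concatenated buffer is {0, 1, 1, 1, 2, 0};
// thrust::sort reorders it to {0, 0, 1, 1, 1, 2}, and the thrust::unique +
// resize step that follows shrinks it to the distinct vertices {0, 1, 2}.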
d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + d_unique_vertices.begin(), + d_unique_vertices.end())), + handle.get_stream()); + auto h_src_v = cugraph::test::to_host(handle, d_src_v); auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); - auto unique_vertices = cugraph::test::to_host(handle, d_vertices_v); + auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices); // Load edgelist from different threads. We'll use more threads than GPUs here for (int i = 0; i < num_threads; ++i) { @@ -293,13 +308,13 @@ class Tests_Multithreaded num_threads]() { auto thread_handle = instance_manager->get_handle(); - auto number_of_vertices = unique_vertices->size(); + auto number_of_vertices = unique_vertices.size(); std::vector my_vertex_list; my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); for (size_t j = i; j < number_of_vertices; j += num_threads) { - my_vertex_list.push_back((*unique_vertices)[j]); + my_vertex_list.push_back(unique_vertices[j]); } rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), @@ -312,7 +327,8 @@ class Tests_Multithreaded auto d_my_pageranks = pageranks_view.gather( thread_handle, raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, - graph_view, + graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), renumber_map_view); std::vector my_pageranks(d_my_pageranks.size()); diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu new file mode 100644 index 00000000000..c1395037646 --- /dev/null +++ b/cpp/tests/mtmg/threaded_test_louvain.cu @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+struct Multithreaded_Usecase {
+  bool test_weighted{false};
+  bool check_correctness{true};
+};
+
+template <typename input_usecase_t>
+class Tests_Multithreaded
+  : public ::testing::TestWithParam<std::tuple<Multithreaded_Usecase, input_usecase_t>> {
+ public:
+  Tests_Multithreaded() {}
+
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  std::vector<int> get_gpu_list()
+  {
+    int num_gpus_per_node{1};
+    RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node));
+
+    std::vector<int> gpu_list(num_gpus_per_node);
+    std::iota(gpu_list.begin(), gpu_list.end(), 0);
+
+    return gpu_list;
+  }
+
+  template <typename vertex_t, typename edge_t, typename weight_t, typename result_t>
+  void run_current_test(
+    std::tuple<Multithreaded_Usecase, input_usecase_t> const& param,
+    std::vector<int> gpu_list)
+  {
+    using edge_type_t = int32_t;
+
+    constexpr bool renumber = true;
+    constexpr bool do_expensive_check = false;
+
+    auto [multithreaded_usecase, input_usecase] = param;
+
+    raft::handle_t handle{};
+
+    size_t max_level{1};  // Louvain is non-deterministic in MG if max_level > 1
+    weight_t threshold{1e-6};
+    weight_t resolution{1};
+
+    size_t device_buffer_size{64 * 1024 * 1024};
+    size_t thread_buffer_size{4 * 1024 * 1024};
+
+    int num_gpus = gpu_list.size();
+    int num_threads = num_gpus * 4;
+
+    cugraph::mtmg::resource_manager_t resource_manager;
+
+    std::for_each(gpu_list.begin(), gpu_list.end(), [&resource_manager](int gpu_id) {
+      resource_manager.register_local_gpu(gpu_id, rmm::cuda_device_id{gpu_id});
+    });
+
+    ncclUniqueId instance_manager_id;
+    ncclGetUniqueId(&instance_manager_id);
+
+    auto instance_manager = resource_manager.create_instance_manager(
+      resource_manager.registered_ranks(), instance_manager_id);
+
+    cugraph::mtmg::edgelist_t<vertex_t, weight_t, edge_t, edge_type_t> edgelist;
+    cugraph::mtmg::graph_t<vertex_t, edge_t, true, true> graph;
+    cugraph::mtmg::graph_view_t<vertex_t, edge_t, true, true> graph_view;
+    cugraph::mtmg::vertex_result_t<vertex_t> louvain_clusters;
+    std::optional<cugraph::mtmg::renumber_map_t<vertex_t>> renumber_map =
+      std::make_optional<cugraph::mtmg::renumber_map_t<vertex_t>>();
+
+    auto edge_weights = multithreaded_usecase.test_weighted
+                          ?
std::make_optional, + weight_t>>() + : std::nullopt; + + // + // Simulate graph creation by spawning threads to walk through the + // local COO and add edges + // + std::vector running_threads; + + // Initialize shared edgelist object, one per GPU + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &edgelist, + device_buffer_size, + use_weight = true, + use_edge_id = false, + use_edge_type = false]() { + auto thread_handle = instance_manager->get_handle(); + + edgelist.set(thread_handle, device_buffer_size, use_weight, use_edge_id, use_edge_type); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + // Load SG edge list + auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_unique_vertices.begin()); + thrust::copy(handle.get_thrust_policy(), + d_dst_v.begin(), + d_dst_v.end(), + d_unique_vertices.begin() + d_src_v.size()); + thrust::sort(handle.get_thrust_policy(), d_unique_vertices.begin(), d_unique_vertices.end()); + + d_unique_vertices.resize(thrust::distance(d_unique_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + d_unique_vertices.begin(), + d_unique_vertices.end())), + handle.get_stream()); + + auto h_src_v = cugraph::test::to_host(handle, d_src_v); + auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); + auto h_weights_v = cugraph::test::to_host(handle, d_weights_v); + auto unique_vertices = cugraph::test::to_host(handle, d_unique_vertices); + + // Load edgelist from different threads. We'll use more threads than GPUs here + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + thread_buffer_size, + &edgelist, + &h_src_v, + &h_dst_v, + &h_weights_v, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + cugraph::mtmg::per_thread_edgelist_t + per_thread_edgelist(edgelist.get(thread_handle), thread_buffer_size); + + for (size_t j = i; j < h_src_v.size(); j += num_threads) { + per_thread_edgelist.append( + thread_handle, + h_src_v[j], + h_dst_v[j], + h_weights_v ? 
std::make_optional((*h_weights_v)[j]) : std::nullopt, + std::nullopt, + std::nullopt); + } + + per_thread_edgelist.flush(thread_handle); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph, + &edge_weights, + &edgelist, + &renumber_map, + is_symmetric = is_symmetric, + renumber, + do_expensive_check]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + std::optional, + edge_t>> + edge_ids{std::nullopt}; + std::optional, + int32_t>> + edge_types{std::nullopt}; + + edgelist.finalize_buffer(thread_handle); + edgelist.consolidate_and_shuffle(thread_handle, false); + + cugraph::mtmg:: + create_graph_from_edgelist( + thread_handle, + edgelist, + cugraph::graph_properties_t{is_symmetric, true}, + renumber, + graph, + edge_weights, + edge_ids, + edge_types, + renumber_map, + do_expensive_check); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + graph_view = graph.view(); + auto renumber_map_view = renumber_map ? std::make_optional(renumber_map->view()) : std::nullopt; + + weight_t modularity{0}; + + for (int i = 0; i < num_threads; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &edge_weights, + &louvain_clusters, + &modularity, + &renumber_map, + max_level, + threshold, + resolution]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_thread_rank() > 0) return; + + rmm::device_uvector local_louvain_clusters( + graph_view.get(thread_handle).local_vertex_partition_range_size(), + thread_handle.get_stream()); + + std::tie(std::ignore, modularity) = cugraph::louvain( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) : std::nullopt, + local_louvain_clusters.data(), + max_level, + threshold, + resolution); + + louvain_clusters.set(thread_handle, std::move(local_louvain_clusters)); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + std::vector, std::vector>> computed_clusters_v; + std::mutex computed_clusters_lock{}; + + auto louvain_clusters_view = louvain_clusters.view(); + std::vector h_renumber_map; + + // Load computed_clusters_v from different threads. 
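// Sketch of the strided split used in the loop below (illustrative numbers,
// not from the patch): each spawned thread i gathers the clusters for the
// vertices at indices i, i + num_threads, i + 2 * num_threads, ... of
// unique_vertices, so with num_threads == 4, thread 0 takes indices
// {0, 4, 8, ...} and thread 1 takes {1, 5, 9, ...}; each gathered slice is
// later compared against the single-GPU Louvain clustering.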
+ for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back([&instance_manager, + &graph_view, + &renumber_map_view, + &louvain_clusters_view, + &computed_clusters_lock, + &computed_clusters_v, + &h_src_v, + &h_dst_v, + &h_weights_v, + &h_renumber_map, + &unique_vertices, + i, + num_threads]() { + auto thread_handle = instance_manager->get_handle(); + + auto number_of_vertices = unique_vertices.size(); + + std::vector my_vertex_list; + my_vertex_list.reserve((number_of_vertices + num_threads - 1) / num_threads); + + for (size_t j = i; j < number_of_vertices; j += num_threads) { + my_vertex_list.push_back(unique_vertices[j]); + } + + rmm::device_uvector d_my_vertex_list(my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + raft::update_device(d_my_vertex_list.data(), + my_vertex_list.data(), + my_vertex_list.size(), + thread_handle.raft_handle().get_stream()); + + auto d_my_clusters = louvain_clusters_view.gather( + thread_handle, + raft::device_span{d_my_vertex_list.data(), d_my_vertex_list.size()}, + graph_view.get_vertex_partition_range_lasts(thread_handle), + graph_view.get_vertex_partition_view(thread_handle), + renumber_map_view); + + std::vector my_clusters(d_my_clusters.size()); + raft::update_host(my_clusters.data(), + d_my_clusters.data(), + d_my_clusters.size(), + thread_handle.raft_handle().get_stream()); + + { + std::lock_guard lock(computed_clusters_lock); + computed_clusters_v.push_back( + std::make_tuple(std::move(my_vertex_list), std::move(my_clusters))); + } + + h_renumber_map = cugraph::test::to_host( + thread_handle.raft_handle(), + cugraph::test::device_allgatherv(thread_handle.raft_handle(), + renumber_map_view->get(thread_handle))); + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + if (multithreaded_usecase.check_correctness) { + // Want to compare the results in computed_clusters_v with SG results + cugraph::graph_t sg_graph(handle); + std::optional< + cugraph::edge_property_t, weight_t>> + sg_edge_weights{std::nullopt}; + + for (int i = 0; i < num_gpus; ++i) { + running_threads.emplace_back( + [&instance_manager, &graph_view, &edge_weights, &sg_graph, &sg_edge_weights]() { + auto thread_handle = instance_manager->get_handle(); + + if (thread_handle.get_rank() == 0) { + std::tie(sg_graph, sg_edge_weights, std::ignore) = + cugraph::test::mg_graph_to_sg_graph( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::optional>{std::nullopt}, + false); // create an SG graph with MG graph vertex IDs + } else { + cugraph::test::mg_graph_to_sg_graph( + thread_handle.raft_handle(), + graph_view.get(thread_handle), + edge_weights ? std::make_optional(edge_weights->get(thread_handle).view()) + : std::nullopt, + std::optional>{std::nullopt}, + false); // create an SG graph with MG graph vertex IDs + } + }); + } + + // Wait for CPU threads to complete + std::for_each(running_threads.begin(), running_threads.end(), [](auto& t) { t.join(); }); + running_threads.resize(0); + instance_manager->reset_threads(); + + rmm::device_uvector sg_clusters(sg_graph.number_of_vertices(), handle.get_stream()); + weight_t modularity; + + std::tie(std::ignore, modularity) = cugraph::louvain( + handle, + sg_graph.view(), + sg_edge_weights ? 
std::make_optional(sg_edge_weights->view()) : std::nullopt, + sg_clusters.data(), + max_level, + threshold, + resolution); + + auto h_sg_clusters = cugraph::test::to_host(handle, sg_clusters); + std::map h_cluster_map; + std::map h_cluster_reverse_map; + + std::for_each( + computed_clusters_v.begin(), + computed_clusters_v.end(), + [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t1) { + std::for_each( + thrust::make_zip_iterator(std::get<0>(t1).begin(), std::get<1>(t1).begin()), + thrust::make_zip_iterator(std::get<0>(t1).end(), std::get<1>(t1).end()), + [&h_sg_clusters, &h_cluster_map, &h_renumber_map, &h_cluster_reverse_map](auto t2) { + vertex_t v = thrust::get<0>(t2); + vertex_t c = thrust::get<1>(t2); + + auto pos = std::find(h_renumber_map.begin(), h_renumber_map.end(), v); + auto offset = std::distance(h_renumber_map.begin(), pos); + + auto cluster_pos = h_cluster_map.find(c); + if (cluster_pos == h_cluster_map.end()) { + auto reverse_pos = h_cluster_reverse_map.find(h_sg_clusters[offset]); + + ASSERT_TRUE(reverse_pos != h_cluster_map.end()) << "two different cluster mappings"; + + h_cluster_map.insert(std::make_pair(c, h_sg_clusters[offset])); + h_cluster_reverse_map.insert(std::make_pair(h_sg_clusters[offset], c)); + } else { + ASSERT_EQ(cluster_pos->second, h_sg_clusters[offset]) + << "vertex " << v << ", offset = " << offset + << ", SG cluster = " << h_sg_clusters[offset] << ", mtmg cluster = " << c + << ", mapped value = " << cluster_pos->second; + } + }); + }); + } + } +}; + +using Tests_Multithreaded_File = Tests_Multithreaded; +using Tests_Multithreaded_Rmat = Tests_Multithreaded; + +// FIXME: add tests for type combinations +TEST_P(Tests_Multithreaded_File, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_File_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +TEST_P(Tests_Multithreaded_Rmat, CheckInt32Int32FloatFloat) +{ + run_current_test( + override_Rmat_Usecase_with_cmd_line_arguments(GetParam()), std::vector{{0, 1}}); +} + +INSTANTIATE_TEST_SUITE_P(file_test, + Tests_Multithreaded_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true, true}), + ::testing::Values(cugraph::test::File_Usecase("karate.csv"), + cugraph::test::File_Usecase("dolphins.csv")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_Multithreaded_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(Multithreaded_Usecase{true, true}), + //::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + ::testing::Values(cugraph::test::Rmat_Usecase(5, 8, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + file_benchmark_test, /* note that the test filename can be overridden in benchmarking (with + --gtest_filter to select only the file_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one File_Usecase that differ only in filename + (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_File, + ::testing::Combine( + // disable correctness checks + ::testing::Values(Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command 
line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_Multithreaded_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(Multithreaded_Usecase{true, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_count_if_e.cu b/cpp/tests/prims/mg_count_if_e.cu index 449aa728d87..03bf8ae0ae5 100644 --- a/cpp/tests/prims/mg_count_if_e.cu +++ b/cpp/tests/prims/mg_count_if_e.cu @@ -53,8 +53,9 @@ #include struct Prims_Usecase { - bool check_correctness{true}; bool test_weighted{false}; + bool edge_masking{false}; + bool check_correctness{true}; }; template @@ -102,6 +103,13 @@ class Tests_MGCountIfE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. run MG count_if_e const int hash_bin_count = 5; @@ -148,19 +156,19 @@ class Tests_MGCountIfE (*mg_renumber_map).size()), false); - auto sg_graph_view = sg_graph.view(); + if (handle_->get_comms().get_rank() == 0) { + auto sg_graph_view = sg_graph.view(); - auto sg_vertex_prop = cugraph::test::generate::vertex_property( - *handle_, - thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()), - hash_bin_count); - auto sg_src_prop = cugraph::test::generate::src_property( - *handle_, sg_graph_view, sg_vertex_prop); - auto sg_dst_prop = cugraph::test::generate::dst_property( - *handle_, sg_graph_view, sg_vertex_prop); + auto sg_vertex_prop = cugraph::test::generate::vertex_property( + *handle_, + thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(sg_graph_view.local_vertex_partition_range_last()), + hash_bin_count); + auto sg_src_prop = cugraph::test::generate::src_property( + *handle_, sg_graph_view, sg_vertex_prop); + auto sg_dst_prop = cugraph::test::generate::dst_property( + *handle_, sg_graph_view, sg_vertex_prop); - if (handle_->get_comms().get_rank() == 0) { auto expected_result = count_if_e( *handle_, sg_graph_view, @@ -312,7 +320,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGCountIfE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -320,7 +331,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGCountIfE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -332,7 +346,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGCountIfE_Rmat, ::testing::Combine( - 
::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu index a3edb1f6372..ac73c446d89 100644 --- a/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu +++ b/cpp/tests/prims/mg_per_v_pair_transform_dst_nbr_intersection.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "property_generator.cuh" + #include #include #include @@ -116,29 +118,8 @@ class Tests_MGPerVPairTransformDstNbrIntersection std::optional> edge_mask{std::nullopt}; if (prims_usecase.edge_masking) { - cugraph::edge_src_property_t edge_src_renumber_map( - *handle_, mg_graph_view); - cugraph::edge_dst_property_t edge_dst_renumber_map( - *handle_, mg_graph_view); - cugraph::update_edge_src_property( - *handle_, mg_graph_view, (*mg_renumber_map).begin(), edge_src_renumber_map); - cugraph::update_edge_dst_property( - *handle_, mg_graph_view, (*mg_renumber_map).begin(), edge_dst_renumber_map); - - edge_mask = cugraph::edge_property_t(*handle_, mg_graph_view); - - cugraph::transform_e( - *handle_, - mg_graph_view, - edge_src_renumber_map.view(), - edge_dst_renumber_map.view(), - cugraph::edge_dummy_property_t{}.view(), - [] __device__(auto src, auto dst, auto src_property, auto dst_property, thrust::nullopt_t) { - return ((src_property % 2 == 0) && (dst_property % 2 == 0)) - ? false - : true; // mask out the edges with even unrenumbered src & dst vertex IDs - }, - (*edge_mask).mutable_view()); + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); mg_graph_view.attach_edge_mask((*edge_mask).view()); } @@ -257,42 +238,6 @@ class Tests_MGPerVPairTransformDstNbrIntersection if (handle_->get_comms().get_rank() == 0) { auto sg_graph_view = sg_graph.view(); - if (prims_usecase.edge_masking) { - rmm::device_uvector srcs(0, handle_->get_stream()); - rmm::device_uvector dsts(0, handle_->get_stream()); - std::tie(srcs, dsts, std::ignore, std::ignore) = - cugraph::decompress_to_edgelist( - *handle_, sg_graph_view, std::nullopt, std::nullopt, std::nullopt); - auto edge_first = thrust::make_zip_iterator(srcs.begin(), dsts.begin()); - srcs.resize(thrust::distance(edge_first, - thrust::remove_if(handle_->get_thrust_policy(), - edge_first, - edge_first + srcs.size(), - [] __device__(auto pair) { - return (thrust::get<0>(pair) % 2 == 0) && - (thrust::get<1>(pair) % 2 == 0); - })), - handle_->get_stream()); - dsts.resize(srcs.size(), handle_->get_stream()); - rmm::device_uvector vertices(sg_graph_view.number_of_vertices(), - handle_->get_stream()); - thrust::sequence( - handle_->get_thrust_policy(), vertices.begin(), vertices.end(), vertex_t{0}); - std::tie(sg_graph, std::ignore, std::ignore, std::ignore, std::ignore) = cugraph:: - create_graph_from_edgelist( - *handle_, - std::move(vertices), - std::move(srcs), - std::move(dsts), - std::nullopt, - std::nullopt, - std::nullopt, - cugraph::graph_properties_t{sg_graph_view.is_symmetric(), - sg_graph_view.is_multigraph()}, - false); - sg_graph_view = sg_graph.view(); - } - auto sg_result_buffer = cugraph::allocate_dataframe_buffer>( cugraph::size_dataframe_buffer(mg_aggregate_vertex_pair_buffer), handle_->get_stream()); auto 
sg_out_degrees = sg_graph_view.compute_out_degrees(*handle_); diff --git a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu index eb6a8fd5cb6..80aa34b68ae 100644 --- a/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_random_select_transform_outgoing_e.cu @@ -301,8 +301,8 @@ class Tests_MGPerVRandomSelectTransformOutgoingE sg_graph_view.local_edge_partition_view().offsets().begin(), sg_graph_view.local_edge_partition_view().offsets().end(), sg_offsets.begin()); - rmm::device_uvector sg_indices(sg_graph_view.number_of_edges(), - handle_->get_stream()); + rmm::device_uvector sg_indices( + sg_graph_view.local_edge_partition_view().indices().size(), handle_->get_stream()); thrust::copy(handle_->get_thrust_policy(), sg_graph_view.local_edge_partition_view().indices().begin(), sg_graph_view.local_edge_partition_view().indices().end(), @@ -324,8 +324,9 @@ class Tests_MGPerVRandomSelectTransformOutgoingE with_replacement = prims_usecase.with_replacement, invalid_value = invalid_value ? thrust::make_optional(*invalid_value) : thrust::nullopt, - property_transform = cugraph::test::detail::property_transform{ - hash_bin_count}] __device__(size_t i) { + property_transform = + cugraph::test::detail::vertex_property_transform{ + hash_bin_count}] __device__(size_t i) { auto v = *(frontier_vertex_first + i); // check sample_offsets diff --git a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu index 677d6ce5022..fc8114a4652 100644 --- a/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu +++ b/cpp/tests/prims/mg_per_v_transform_reduce_incoming_outgoing_e.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -150,8 +150,9 @@ struct result_compare { }; struct Prims_Usecase { - bool check_correctness{true}; bool test_weighted{false}; + bool edge_masking{false}; + bool check_correctness{true}; }; template @@ -200,6 +201,13 @@ class Tests_MGPerVTransformReduceIncomingOutgoingE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. 
run MG transform reduce const int hash_bin_count = 5; @@ -674,7 +682,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGPerVTransformReduceIncomingOutgoingE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -682,7 +693,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -694,7 +708,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGPerVTransformReduceIncomingOutgoingE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_transform_e.cu b/cpp/tests/prims/mg_transform_e.cu index 24deaad810a..e9be80f1f7d 100644 --- a/cpp/tests/prims/mg_transform_e.cu +++ b/cpp/tests/prims/mg_transform_e.cu @@ -52,6 +52,7 @@ struct Prims_Usecase { bool use_edgelist{false}; + bool edge_masking{false}; bool check_correctness{true}; }; @@ -100,6 +101,13 @@ class Tests_MGTransformE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. 
run MG transform_e const int hash_bin_count = 5; @@ -439,7 +447,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGTransformE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{false, true}, Prims_Usecase{true, true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -447,8 +458,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGTransformE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{false, true}, - Prims_Usecase{true, true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -460,7 +473,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGTransformE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false, false}, Prims_Usecase{true, false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/mg_transform_reduce_e.cu b/cpp/tests/prims/mg_transform_reduce_e.cu index 79aa3da54df..c4ae11ab7c9 100644 --- a/cpp/tests/prims/mg_transform_reduce_e.cu +++ b/cpp/tests/prims/mg_transform_reduce_e.cu @@ -91,8 +91,9 @@ struct result_compare> { }; struct Prims_Usecase { - bool check_correctness{true}; bool test_weighted{false}; + bool edge_masking{false}; + bool check_correctness{true}; }; template @@ -141,6 +142,13 @@ class Tests_MGTransformReduceE auto mg_graph_view = mg_graph.view(); + std::optional> edge_mask{std::nullopt}; + if (prims_usecase.edge_masking) { + edge_mask = + cugraph::test::generate::edge_property(*handle_, mg_graph_view, 2); + mg_graph_view.attach_edge_mask((*edge_mask).view()); + } + // 2. 
run MG transform reduce const int hash_bin_count = 5; @@ -365,7 +373,10 @@ INSTANTIATE_TEST_SUITE_P( file_test, Tests_MGTransformReduceE_File, ::testing::Combine( - ::testing::Values(Prims_Usecase{true}), + ::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/web-Google.mtx"), cugraph::test::File_Usecase("test/datasets/ljournal-2008.mtx"), @@ -373,7 +384,10 @@ INSTANTIATE_TEST_SUITE_P( INSTANTIATE_TEST_SUITE_P(rmat_small_test, Tests_MGTransformReduceE_Rmat, - ::testing::Combine(::testing::Values(Prims_Usecase{true}), + ::testing::Combine(::testing::Values(Prims_Usecase{false, false, true}, + Prims_Usecase{false, true, true}, + Prims_Usecase{true, false, true}, + Prims_Usecase{true, true, true}), ::testing::Values(cugraph::test::Rmat_Usecase( 10, 16, 0.57, 0.19, 0.19, 0, false, false)))); @@ -385,7 +399,10 @@ INSTANTIATE_TEST_SUITE_P( factor (to avoid running same benchmarks more than once) */ Tests_MGTransformReduceE_Rmat, ::testing::Combine( - ::testing::Values(Prims_Usecase{false}), + ::testing::Values(Prims_Usecase{false, false, false}, + Prims_Usecase{false, true, false}, + Prims_Usecase{true, false, false}, + Prims_Usecase{true, true, false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/prims/property_generator.cuh b/cpp/tests/prims/property_generator.cuh index e7264cd276f..680455eda79 100644 --- a/cpp/tests/prims/property_generator.cuh +++ b/cpp/tests/prims/property_generator.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -61,7 +62,7 @@ __host__ __device__ auto make_property_value(T val) } template -struct property_transform { +struct vertex_property_transform { int32_t mod{}; constexpr __device__ property_t operator()(vertex_t v) const @@ -73,6 +74,20 @@ struct property_transform { } }; +template +struct edge_property_transform { + int32_t mod{}; + + constexpr __device__ property_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + static_assert(cugraph::is_thrust_tuple_of_arithmetic::value || + std::is_arithmetic_v); + cuco::detail::MurmurHash3_32 hash_func{}; + return make_property_value(hash_func(src + dst) % mod); + } +}; + } // namespace detail template @@ -96,7 +111,7 @@ struct generate { labels.begin(), labels.end(), cugraph::get_dataframe_buffer_begin(data), - detail::property_transform{hash_bin_count}); + detail::vertex_property_transform{hash_bin_count}); return data; } @@ -111,7 +126,7 @@ struct generate { begin, end, cugraph::get_dataframe_buffer_begin(data), - detail::property_transform{hash_bin_count}); + detail::vertex_property_transform{hash_bin_count}); return data; } @@ -138,6 +153,22 @@ struct generate { handle, graph_view, cugraph::get_dataframe_buffer_begin(property), output_property); return output_property; } + + template + static auto edge_property(raft::handle_t const& handle, + graph_view_type const& graph_view, + int32_t hash_bin_count) + { + auto output_property = cugraph::edge_property_t(handle, graph_view); + cugraph::transform_e(handle, + graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + detail::edge_property_transform{hash_bin_count}, + 
output_property.mutable_view()); + return output_property; + } }; } // namespace test diff --git a/cpp/tests/sampling/sampling_post_processing_test.cu b/cpp/tests/sampling/sampling_post_processing_test.cu index e5267d75ac2..6be735c3482 100644 --- a/cpp/tests/sampling/sampling_post_processing_test.cu +++ b/cpp/tests/sampling/sampling_post_processing_test.cu @@ -38,6 +38,8 @@ #include #include +#include + struct SamplingPostProcessing_Usecase { size_t num_labels{}; size_t num_seeds_per_label{}; @@ -318,15 +320,16 @@ bool check_renumber_map_invariants( auto renumbered_merged_vertex_first = thrust::make_transform_iterator( merged_vertices.begin(), - [sorted_org_vertices = - raft::device_span(sorted_org_vertices.data(), sorted_org_vertices.size()), - matching_renumbered_vertices = raft::device_span( - matching_renumbered_vertices.data(), - matching_renumbered_vertices.size())] __device__(vertex_t major) { - auto it = thrust::lower_bound( - thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); - return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; - }); + cuda::proclaim_return_type( + [sorted_org_vertices = raft::device_span(sorted_org_vertices.data(), + sorted_org_vertices.size()), + matching_renumbered_vertices = raft::device_span( + matching_renumbered_vertices.data(), + matching_renumbered_vertices.size())] __device__(vertex_t major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_org_vertices.begin(), sorted_org_vertices.end(), major); + return matching_renumbered_vertices[thrust::distance(sorted_org_vertices.begin(), it)]; + })); thrust::reduce_by_key(handle.get_thrust_policy(), sort_key_first, @@ -1020,23 +1023,24 @@ class Tests_SamplingPostProcessing ? this_label_output_edgelist_srcs.begin() : this_label_output_edgelist_dsts.begin()) + old_size, - [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), - nzd_vertices = - renumbered_and_compressed_nzd_vertices - ? thrust::make_optional>( - (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, - (offset_end_offset - offset_start_offset) - 1) - : thrust::nullopt, - base_v] __device__(size_t i) { - auto idx = static_cast(thrust::distance( - offsets.begin() + 1, - thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); - if (nzd_vertices) { - return (*nzd_vertices)[idx]; - } else { - return base_v + static_cast(idx); - } - }); + cuda::proclaim_return_type( + [offsets = raft::device_span(d_offsets.data(), d_offsets.size()), + nzd_vertices = + renumbered_and_compressed_nzd_vertices + ? 
thrust::make_optional>( + (*renumbered_and_compressed_nzd_vertices).data() + offset_start_offset, + (offset_end_offset - offset_start_offset) - 1) + : thrust::nullopt, + base_v] __device__(size_t i) { + auto idx = static_cast(thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), i))); + if (nzd_vertices) { + return (*nzd_vertices)[idx]; + } else { + return base_v + static_cast(idx); + } + })); thrust::copy(handle.get_thrust_policy(), renumbered_and_compressed_edgelist_minors.begin() + h_offsets[0], renumbered_and_compressed_edgelist_minors.begin() + h_offsets.back(), diff --git a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp index 68828d5eee1..b7f1dce2023 100644 --- a/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp +++ b/cpp/tests/structure/count_self_loops_and_multi_edges_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -208,10 +208,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_File, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); @@ -220,10 +217,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // enable correctness checks - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}, - CountSelfLoopsAndMultiEdges_Usecase{}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{}), ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); INSTANTIATE_TEST_SUITE_P( @@ -235,10 +229,7 @@ INSTANTIATE_TEST_SUITE_P( Tests_CountSelfLoopsAndMultiEdges_Rmat, ::testing::Combine( // disable correctness checks for large graphs - ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}, - CountSelfLoopsAndMultiEdges_Usecase{false}), + ::testing::Values(CountSelfLoopsAndMultiEdges_Usecase{false}), ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 00000000000..3ad6953ca03 --- /dev/null +++ b/cpp/tests/structure/has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_HasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_HasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + + constexpr bool renumber = true; + + raft::handle_t handle{}; + HighResTimer hr_timer{}; + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Construct graph"); + } + + cugraph::graph_t graph(handle); + std::optional> d_renumber_map_labels{std::nullopt}; + std::tie(graph, std::ignore, d_renumber_map_labels) = + cugraph::test::construct_graph( + handle, input_usecase, false, renumber); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto graph_view = graph.view(); + + raft::random::RngState rng_state(0); + rmm::device_uvector edge_srcs( + has_edge_and_compute_multiplicity_usecase.num_vertex_pairs, handle.get_stream()); + rmm::device_uvector edge_dsts(edge_srcs.size(), handle.get_stream()); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_srcs.data(), + edge_srcs.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle.get_stream(), + edge_dsts.data(), + edge_dsts.size(), + vertex_t{0}, + graph_view.number_of_vertices(), + rng_state); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Querying edge existence"); + } + + auto edge_exists = + graph_view.has_edge(handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.start("Computing multiplicity"); + } + + auto edge_multiplicities = graph_view.compute_multiplicity( + handle, + raft::device_span(edge_srcs.data(), edge_srcs.size()), + raft::device_span(edge_dsts.data(), edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + +
if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + cugraph::graph_t unrenumbered_graph(handle); + if (renumber) { + std::tie(unrenumbered_graph, std::ignore, std::ignore) = + cugraph::test::construct_graph( + handle, input_usecase, false, false); + } + auto unrenumbered_graph_view = renumber ? unrenumbered_graph.view() : graph_view; + + std::vector h_offsets = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().offsets()); + std::vector h_indices = cugraph::test::to_host( + handle, unrenumbered_graph_view.local_edge_partition_view().indices()); + + rmm::device_uvector d_unrenumbered_edge_srcs(edge_srcs.size(), handle.get_stream()); + rmm::device_uvector d_unrenumbered_edge_dsts(edge_dsts.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_srcs.data(), edge_srcs.data(), edge_srcs.size(), handle.get_stream()); + raft::copy_async( + d_unrenumbered_edge_dsts.data(), edge_dsts.data(), edge_dsts.size(), handle.get_stream()); + if (renumber) { + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_srcs.data(), + d_unrenumbered_edge_srcs.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + cugraph::unrenumber_local_int_vertices(handle, + d_unrenumbered_edge_dsts.data(), + d_unrenumbered_edge_dsts.size(), + (*d_renumber_map_labels).data(), + vertex_t{0}, + graph_view.number_of_vertices()); + } + auto h_unrenumbered_edge_srcs = cugraph::test::to_host(handle, d_unrenumbered_edge_srcs); + auto h_unrenumbered_edge_dsts = cugraph::test::to_host(handle, d_unrenumbered_edge_dsts); + + auto h_cugraph_edge_exists = cugraph::test::to_host(handle, edge_exists); + auto h_cugraph_edge_multiplicities = cugraph::test::to_host(handle, edge_multiplicities); + std::vector h_reference_edge_exists(edge_srcs.size()); + std::vector h_reference_edge_multiplicities(edge_srcs.size()); + for (size_t i = 0; i < edge_srcs.size(); ++i) { + auto src = h_unrenumbered_edge_srcs[i]; + auto dst = h_unrenumbered_edge_dsts[i]; + auto major = store_transposed ? dst : src; + auto minor = store_transposed ? src : dst; + auto lower_it = std::lower_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto upper_it = std::upper_bound( + h_indices.begin() + h_offsets[major], h_indices.begin() + h_offsets[major + 1], minor); + auto multiplicity = static_cast(std::distance(lower_it, upper_it)); + h_reference_edge_exists[i] = multiplicity > 0 ? 
true : false; + h_reference_edge_multiplicities[i] = multiplicity; + } + + ASSERT_TRUE(std::equal(h_reference_edge_exists.begin(), + h_reference_edge_exists.end(), + h_cugraph_edge_exists.begin())) + << "has_edge() return values do not match with the reference values."; + ASSERT_TRUE(std::equal(h_reference_edge_multiplicities.begin(), + h_reference_edge_multiplicities.end(), + h_cugraph_edge_multiplicities.begin())) + << "compute_multiplicity() return values do not match with the reference values."; + } + } +}; + +using Tests_HasEdgeAndComputeMultiplicity_File = + Tests_HasEdgeAndComputeMultiplicity; +using Tests_HasEdgeAndComputeMultiplicity_Rmat = + Tests_HasEdgeAndComputeMultiplicity; + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_HasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_test, + Tests_HasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_HasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + // disable correctness checks for large graphs + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp new file mode 100644 index 
00000000000..8079de7ebfe --- /dev/null +++ b/cpp/tests/structure/mg_has_edge_and_compute_multiplicity_test.cpp @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +struct HasEdgeAndComputeMultiplicity_Usecase { + size_t num_vertex_pairs{}; + bool check_correctness{true}; +}; + +template +class Tests_MGHasEdgeAndComputeMultiplicity + : public ::testing::TestWithParam< + std::tuple> { + public: + Tests_MGHasEdgeAndComputeMultiplicity() {} + + static void SetUpTestCase() { handle_ = cugraph::test::initialize_mg_handle(); } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + // Compare the results of running has_edge & compute_multiplicity on multiple GPUs to that of + // a single-GPU run + template + void run_current_test( + HasEdgeAndComputeMultiplicity_Usecase const& has_edge_and_compute_multiplicity_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + using edge_type_id_t = int32_t; + + HighResTimer hr_timer{}; + + auto const comm_rank = handle_->get_comms().get_rank(); + auto const comm_size = handle_->get_comms().get_size(); + + // 1. create MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Construct graph"); + } + + cugraph::graph_t mg_graph(*handle_); + std::optional> mg_renumber_map{std::nullopt}; + std::tie(mg_graph, std::ignore, mg_renumber_map) = + cugraph::test::construct_graph( + *handle_, input_usecase, false, true); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + auto mg_graph_view = mg_graph.view(); + + // 2. create an edge list to query + + raft::random::RngState rng_state(comm_rank); + size_t num_vertex_pairs_this_gpu = + (has_edge_and_compute_multiplicity_usecase.num_vertex_pairs / comm_size) + + ((comm_rank < has_edge_and_compute_multiplicity_usecase.num_vertex_pairs % comm_size) + ? size_t{1} + : size_t{0}); + rmm::device_uvector d_mg_edge_srcs(num_vertex_pairs_this_gpu, handle_->get_stream()); + rmm::device_uvector d_mg_edge_dsts(d_mg_edge_srcs.size(), handle_->get_stream()); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + cugraph::detail::uniform_random_fill(handle_->get_stream(), + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + vertex_t{0}, + mg_graph_view.number_of_vertices(), + rng_state); + + std::tie(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs, + store_transposed ? 
d_mg_edge_srcs : d_mg_edge_dsts, + std::ignore, + std::ignore, + std::ignore) = + cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< + vertex_t, + edge_t, + weight_t, + edge_type_id_t>(*handle_, + std::move(store_transposed ? d_mg_edge_dsts : d_mg_edge_srcs), + std::move(store_transposed ? d_mg_edge_srcs : d_mg_edge_dsts), + std::nullopt, + std::nullopt, + std::nullopt, + mg_graph_view.vertex_partition_range_lasts()); + + // 3. run MG has_edge & compute_multiplicity + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Querying edge existence"); + } + + auto d_mg_edge_exists = mg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.start("MG Computing multiplicity"); + } + + auto d_mg_edge_multiplicities = mg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size()), + raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + handle_->get_comms().barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cout); + } + + // 4. compare SG & MG results + + if (has_edge_and_compute_multiplicity_usecase.check_correctness) { + // 4-1. aggregate MG results + + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_srcs.data(), + d_mg_edge_srcs.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + cugraph::unrenumber_int_vertices( + *handle_, + d_mg_edge_dsts.data(), + d_mg_edge_dsts.size(), + (*mg_renumber_map).data(), + mg_graph_view.vertex_partition_range_lasts()); + + auto d_mg_aggregate_edge_srcs = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_srcs.data(), d_mg_edge_srcs.size())); + auto d_mg_aggregate_edge_dsts = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_dsts.data(), d_mg_edge_dsts.size())); + auto d_mg_aggregate_edge_exists = cugraph::test::device_gatherv( + *handle_, raft::device_span(d_mg_edge_exists.data(), d_mg_edge_exists.size())); + auto d_mg_aggregate_edge_multiplicities = cugraph::test::device_gatherv( + *handle_, + raft::device_span(d_mg_edge_multiplicities.data(), + d_mg_edge_multiplicities.size())); + + cugraph::graph_t sg_graph(*handle_); + std::tie(sg_graph, std::ignore, std::ignore) = cugraph::test::mg_graph_to_sg_graph( + *handle_, + mg_graph_view, + std::optional>{std::nullopt}, + std::make_optional>((*mg_renumber_map).data(), + (*mg_renumber_map).size()), + false); + + if (handle_->get_comms().get_rank() == 0) { + auto sg_graph_view = sg_graph.view(); + + // 4-2. run SG has_edge & compute_multiplicity + + auto d_sg_edge_exists = sg_graph_view.has_edge( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + auto d_sg_edge_multiplicities = sg_graph_view.compute_multiplicity( + *handle_, + raft::device_span(d_mg_aggregate_edge_srcs.data(), + d_mg_aggregate_edge_srcs.size()), + raft::device_span(d_mg_aggregate_edge_dsts.data(), + d_mg_aggregate_edge_dsts.size())); + + // 4-3. compare + + auto h_mg_aggregate_edge_exists = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_exists); + auto h_mg_aggregate_edge_multiplicities = + cugraph::test::to_host(*handle_, d_mg_aggregate_edge_multiplicities); + auto h_sg_edge_exists = cugraph::test::to_host(*handle_, d_sg_edge_exists); + auto h_sg_edge_multiplicities = cugraph::test::to_host(*handle_, d_sg_edge_multiplicities); + + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_exists.begin(), + h_mg_aggregate_edge_exists.end(), + h_sg_edge_exists.begin())); + ASSERT_TRUE(std::equal(h_mg_aggregate_edge_multiplicities.begin(), + h_mg_aggregate_edge_multiplicities.end(), + h_sg_edge_multiplicities.begin())); + } + } + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_MGHasEdgeAndComputeMultiplicity::handle_ = + nullptr; + +using Tests_MGHasEdgeAndComputeMultiplicity_File = + Tests_MGHasEdgeAndComputeMultiplicity; +using Tests_MGHasEdgeAndComputeMultiplicity_Rmat = + Tests_MGHasEdgeAndComputeMultiplicity; + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt64Int64FloatTransposeFalse) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_File, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test(std::get<0>(param), std::get<1>(param)); +} + +TEST_P(Tests_MGHasEdgeAndComputeMultiplicity_Rmat, CheckInt32Int32FloatTransposeTrue) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + file_tests, + Tests_MGHasEdgeAndComputeMultiplicity_File, + ::testing::Combine( + // enable correctness checks + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"), + cugraph::test::File_Usecase("test/datasets/webbase-1M.mtx")))); + +INSTANTIATE_TEST_SUITE_P( + rmat_small_tests, + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 128}), + ::testing::Values(cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + +INSTANTIATE_TEST_SUITE_P( +
rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_MGHasEdgeAndComputeMultiplicity_Rmat, + ::testing::Combine( + ::testing::Values(HasEdgeAndComputeMultiplicity_Usecase{1024 * 1024 * 128, false}), + ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, false, false)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/structure/mg_select_random_vertices_test.cpp b/cpp/tests/structure/mg_select_random_vertices_test.cpp index 79c50301922..8392a6831ca 100644 --- a/cpp/tests/structure/mg_select_random_vertices_test.cpp +++ b/cpp/tests/structure/mg_select_random_vertices_test.cpp @@ -79,6 +79,8 @@ class Tests_MGSelectRandomVertices // std::vector with_replacement_flags = {true, false}; + std::vector sort_vertices_flags = {true, false}; + { // Generate distributed vertex set to sample from std::srand((unsigned)std::chrono::duration_cast( @@ -90,7 +92,7 @@ class Tests_MGSelectRandomVertices std::iota( h_given_set.begin(), h_given_set.end(), mg_graph_view.local_vertex_partition_range_first()); std::shuffle(h_given_set.begin(), h_given_set.end(), std::mt19937{std::random_device{}()}); - h_given_set.resize(std::rand() % mg_graph_view.local_vertex_partition_range_size() + 1); + h_given_set.resize(std::rand() % (mg_graph_view.local_vertex_partition_range_size() + 1)); // Compute size of the distributed vertex set int num_of_elements_in_given_set = static_cast(h_given_set.size()); @@ -105,82 +107,97 @@ class Tests_MGSelectRandomVertices size_t select_count = num_of_elements_in_given_set > select_random_vertices_usecase.select_count ? 
select_random_vertices_usecase.select_count - : std::rand() % num_of_elements_in_given_set + 1; - - for (int idx = 0; idx < with_replacement_flags.size(); idx++) { - bool with_replacement = with_replacement_flags[idx]; - auto d_sampled_vertices = - cugraph::select_random_vertices(*handle_, - mg_graph_view, - std::make_optional(raft::device_span{ - d_given_set.data(), d_given_set.size()}), - rng_state, - select_count, - with_replacement, - true); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); - - if (select_random_vertices_usecase.check_correctness) { - if (!with_replacement) { - std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); - - auto nr_duplicates = - std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), - h_sampled_vertices.end()); - - ASSERT_EQ(nr_duplicates, 0); + : std::rand() % (num_of_elements_in_given_set + 1); + + for (int i = 0; i < with_replacement_flags.size(); i++) { + for (int j = 0; j < sort_vertices_flags.size(); j++) { + bool with_replacement = with_replacement_flags[i]; + bool sort_vertices = sort_vertices_flags[j]; + + auto d_sampled_vertices = + cugraph::select_random_vertices(*handle_, + mg_graph_view, + std::make_optional(raft::device_span{ + d_given_set.data(), d_given_set.size()}), + rng_state, + select_count, + with_replacement, + sort_vertices); + + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); + + if (select_random_vertices_usecase.check_correctness) { + if (!with_replacement) { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + + auto nr_duplicates = + std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), + h_sampled_vertices.end()); + + ASSERT_EQ(nr_duplicates, 0); + } + + std::sort(h_given_set.begin(), h_given_set.end()); + if (sort_vertices) { + assert(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end())); + } else { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + } + std::for_each( + h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) { + ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v)); + }); } - - std::sort(h_given_set.begin(), h_given_set.end()); - std::for_each( - h_sampled_vertices.begin(), h_sampled_vertices.end(), [&h_given_set](vertex_t v) { - ASSERT_TRUE(std::binary_search(h_given_set.begin(), h_given_set.end(), v)); - }); } } - } - // - // Test sampling from [0, V) - // - - for (int idx = 0; idx < with_replacement_flags.size(); idx++) { - bool with_replacement = false; - auto d_sampled_vertices = cugraph::select_random_vertices( - *handle_, - mg_graph_view, - std::optional>{std::nullopt}, - rng_state, - select_random_vertices_usecase.select_count, - with_replacement, - true); - - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - - auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); - - if (select_random_vertices_usecase.check_correctness) { - if (!with_replacement) { - std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); - - auto nr_duplicates = - std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), - h_sampled_vertices.end()); - - ASSERT_EQ(nr_duplicates, 0); + // + // Test sampling from [0, V) + // + + for (int i = 0; i < with_replacement_flags.size(); i++) { + for (int j = 0; j < sort_vertices_flags.size(); j++) { + bool with_replacement = 
with_replacement_flags[i]; + bool sort_vertices = sort_vertices_flags[j]; + + auto d_sampled_vertices = cugraph::select_random_vertices( + *handle_, + mg_graph_view, + std::optional>{std::nullopt}, + rng_state, + select_random_vertices_usecase.select_count, + with_replacement, + sort_vertices); + + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + + auto h_sampled_vertices = cugraph::test::to_host(*handle_, d_sampled_vertices); + + if (select_random_vertices_usecase.check_correctness) { + if (!with_replacement) { + std::sort(h_sampled_vertices.begin(), h_sampled_vertices.end()); + + auto nr_duplicates = + std::distance(std::unique(h_sampled_vertices.begin(), h_sampled_vertices.end()), + h_sampled_vertices.end()); + + ASSERT_EQ(nr_duplicates, 0); + } + if (sort_vertices) { + assert(std::is_sorted(h_sampled_vertices.begin(), h_sampled_vertices.end())); + } + + auto vertex_first = mg_graph_view.local_vertex_partition_range_first(); + auto vertex_last = mg_graph_view.local_vertex_partition_range_last(); + std::for_each(h_sampled_vertices.begin(), + h_sampled_vertices.end(), + [vertex_first, vertex_last](vertex_t v) { + ASSERT_TRUE((v >= vertex_first) && (v < vertex_last)); + }); + } } - - auto vertex_first = mg_graph_view.local_vertex_partition_range_first(); - auto vertex_last = mg_graph_view.local_vertex_partition_range_last(); - - std::for_each(h_sampled_vertices.begin(), - h_sampled_vertices.end(), - [vertex_first, vertex_last](vertex_t v) { - ASSERT_TRUE((v >= vertex_first) && (v < vertex_last)); - }); } } } diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 3fe70081614..7471e005ca0 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -73,9 +73,8 @@ inline auto make_pool() // run more than 2 tests in parallel at the same time. Changes to this value could // affect the maximum amount of parallel tests, and therefore `tests/CMakeLists.txt` // `_CUGRAPH_TEST_PERCENT` default value will need to be audited. - auto const [free, total] = rmm::detail::available_device_memory(); - auto const min_alloc = - rmm::detail::align_down(std::min(free, total / 6), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + auto const [free, total] = rmm::available_device_memory(); + auto const min_alloc = rmm::align_down(std::min(free, total / 6), rmm::CUDA_ALLOCATION_ALIGNMENT); return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); } diff --git a/cpp/tests/utilities/device_comm_wrapper.cu b/cpp/tests/utilities/device_comm_wrapper.cu index cfc65b5d741..50727394ad7 100644 --- a/cpp/tests/utilities/device_comm_wrapper.cu +++ b/cpp/tests/utilities/device_comm_wrapper.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,9 +40,10 @@ rmm::device_uvector device_gatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v( is_root ?
std::reduce(rx_sizes.begin(), rx_sizes.end()) : size_t{0}, handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_gatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), d_input.size(), rx_sizes, rx_displs, @@ -64,9 +65,10 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, rmm::device_uvector gathered_v(std::reduce(rx_sizes.begin(), rx_sizes.end()), handle.get_stream()); + using comm_datatype_t = std::conditional_t, uint8_t, T>; cugraph::device_allgatherv(handle.get_comms(), - d_input.data(), - gathered_v.data(), + reinterpret_cast(d_input.data()), + reinterpret_cast(gathered_v.data()), rx_sizes, rx_displs, handle.get_stream()); @@ -76,6 +78,9 @@ rmm::device_uvector device_allgatherv(raft::handle_t const& handle, // explicit instantiation +template rmm::device_uvector device_gatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); @@ -91,6 +96,9 @@ template rmm::device_uvector device_gatherv(raft::handle_t const& handle, template rmm::device_uvector device_gatherv(raft::handle_t const& handle, raft::device_span d_input); +template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, + raft::device_span d_input); + template rmm::device_uvector device_allgatherv(raft::handle_t const& handle, raft::device_span d_input); diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 16c9d3ed145..5a9dc9c90d4 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -621,9 +621,27 @@ construct_graph(raft::handle_t const& handle, CUGRAPH_EXPECTS(d_src_v.size() <= static_cast(std::numeric_limits::max()), "Invalid template parameter: edge_t overflow."); - if (drop_self_loops) { remove_self_loops(handle, d_src_v, d_dst_v, d_weights_v); } + if (drop_self_loops) { + std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = + cugraph::remove_self_loops(handle, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt); + } - if (drop_multi_edges) { sort_and_remove_multi_edges(handle, d_src_v, d_dst_v, d_weights_v); } + if (drop_multi_edges) { + std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = + cugraph::remove_multi_edges( + handle, + std::move(d_src_v), + std::move(d_dst_v), + std::move(d_weights_v), + std::nullopt, + std::nullopt, + is_symmetric ? true /* keep minimum weight edges to maintain symmetry */ : false); + } graph_t graph(handle); std::optional< diff --git a/cpp/tests/utilities/test_utilities.hpp b/cpp/tests/utilities/test_utilities.hpp index 321a0536e02..3fa6ae089d3 100644 --- a/cpp/tests/utilities/test_utilities.hpp +++ b/cpp/tests/utilities/test_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
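A note on the to_host/to_device hunk below: std::vector<bool> stores its elements as packed bits, so its storage cannot be handed directly to raft::update_host/raft::update_device; the new code stages through a plain bool array instead. A minimal standalone sketch of the same pattern (illustrative only; assumes a valid raft::handle_t, not code from this PR):

#include <raft/core/handle.hpp>
#include <raft/util/cudart_utils.hpp>
#include <rmm/device_uvector.hpp>
#include <algorithm>
#include <memory>
#include <vector>

// Copy a std::vector<bool> to device memory via a contiguous staging buffer.
void bool_vector_to_device(raft::handle_t const& handle, std::vector<bool> const& h_flags)
{
  rmm::device_uvector<bool> d_flags(h_flags.size(), handle.get_stream());
  auto staging = std::make_unique<bool[]>(h_flags.size());   // unpacked bool storage
  std::copy(h_flags.begin(), h_flags.end(), staging.get());  // unpack the bits
  raft::update_device(d_flags.data(), staging.get(), h_flags.size(), handle.get_stream());
  handle.sync_stream();  // staging buffer must outlive the asynchronous copy
}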
@@ -377,18 +377,24 @@ template std::vector to_host(raft::handle_t const& handle, raft::device_span data) { std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + raft::update_host(h_tmp, data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + std::transform( + h_tmp, h_tmp + data.size(), h_data.begin(), [](uint8_t v) { return static_cast(v); }); + delete[] h_tmp; + } else { + raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return h_data; } template std::vector to_host(raft::handle_t const& handle, rmm::device_uvector const& data) { - std::vector h_data(data.size()); - raft::update_host(h_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); - return h_data; + return to_host(handle, raft::device_span(data.data(), data.size())); } template @@ -396,11 +402,7 @@ std::optional> to_host(raft::handle_t const& handle, std::optional> data) { std::optional> h_data{std::nullopt}; - if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { h_data = to_host(handle, *data); } return h_data; } @@ -410,9 +412,7 @@ std::optional> to_host(raft::handle_t const& handle, { std::optional> h_data{std::nullopt}; if (data) { - h_data = std::vector((*data).size()); - raft::update_host((*h_data).data(), (*data).data(), (*data).size(), handle.get_stream()); - handle.sync_stream(); + h_data = to_host(handle, raft::device_span((*data).data(), (*data).size())); } return h_data; } @@ -430,8 +430,16 @@ template rmm::device_uvector to_device(raft::handle_t const& handle, std::vector const& data) { rmm::device_uvector d_data(data.size(), handle.get_stream()); - raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); - handle.sync_stream(); + if constexpr (std::is_same_v) { // std::vector stores values in a packed format + auto h_tmp = new bool[data.size()]; + std::copy(data.begin(), data.end(), h_tmp); + raft::update_device(d_data.data(), h_tmp, data.size(), handle.get_stream()); + handle.sync_stream(); + delete[] h_tmp; + } else { + raft::update_device(d_data.data(), data.data(), data.size(), handle.get_stream()); + handle.sync_stream(); + } return d_data; } @@ -453,11 +461,7 @@ std::optional> to_device(raft::handle_t const& handle, std::optional> const& data) { std::optional> d_data{std::nullopt}; - if (data) { - d_data = rmm::device_uvector(data->size(), handle.get_stream()); - raft::update_host(d_data->data(), data->data(), data->size(), handle.get_stream()); - handle.sync_stream(); - } + if (data) { d_data = to_device(handle, *data); } return d_data; } diff --git a/cpp/tests/utilities/thrust_wrapper.cu b/cpp/tests/utilities/thrust_wrapper.cu index cb7e6f1bd66..2daf250b4a2 100644 --- a/cpp/tests/utilities/thrust_wrapper.cu +++ b/cpp/tests/utilities/thrust_wrapper.cu @@ -206,131 +206,5 @@ template void populate_vertex_ids(raft::handle_t const& handle, rmm::device_uvector& d_vertices_v, int64_t vertex_id_offset); -template -void remove_self_loops(raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */) -{ - if (d_weight_v) { - auto edge_first =
thrust::make_zip_iterator( - thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin())); - d_src_v.resize( - thrust::distance(edge_first, - thrust::remove_if( - handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - (*d_weight_v).resize(d_src_v.size(), handle.get_stream()); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())); - d_src_v.resize( - thrust::distance(edge_first, - thrust::remove_if( - handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto e) { return thrust::get<0>(e) == thrust::get<1>(e); })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - } - - d_src_v.shrink_to_fit(handle.get_stream()); - d_dst_v.shrink_to_fit(handle.get_stream()); - if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); } -} - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template void remove_self_loops( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */); - -template -void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* [INOUT] */) -{ - if (d_weight_v) { - auto edge_first = thrust::make_zip_iterator( - thrust::make_tuple(d_src_v.begin(), d_dst_v.begin(), (*d_weight_v).begin())); - thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size()); - d_src_v.resize( - thrust::distance(edge_first, - thrust::unique(handle.get_thrust_policy(), - edge_first, - edge_first + d_src_v.size(), - [] __device__(auto lhs, auto rhs) { - return (thrust::get<0>(lhs) == thrust::get<0>(rhs)) && - (thrust::get<1>(lhs) == thrust::get<1>(rhs)); - })), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - (*d_weight_v).resize(d_src_v.size(), handle.get_stream()); - } else { - auto edge_first = - thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())); - thrust::sort(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size()); - d_src_v.resize( - thrust::distance( - edge_first, - thrust::unique(handle.get_thrust_policy(), edge_first, edge_first + d_src_v.size())), - handle.get_stream()); - d_dst_v.resize(d_src_v.size(), handle.get_stream()); - } - - d_src_v.shrink_to_fit(handle.get_stream()); - d_dst_v.shrink_to_fit(handle.get_stream()); - if (d_weight_v) { (*d_weight_v).shrink_to_fit(handle.get_stream()); } -} - -template void sort_and_remove_multi_edges( - raft::handle_t const& handle, - rmm::device_uvector& d_src_v /* [INOUT] */, - rmm::device_uvector& d_dst_v /* [INOUT] */, - std::optional>& d_weight_v /* 
diff --git a/cpp/tests/utilities/thrust_wrapper.hpp b/cpp/tests/utilities/thrust_wrapper.hpp
index eead4dc268f..fb82d781198 100644
--- a/cpp/tests/utilities/thrust_wrapper.hpp
+++ b/cpp/tests/utilities/thrust_wrapper.hpp
@@ -46,18 +46,5 @@ void populate_vertex_ids(raft::handle_t const& handle,
                          rmm::device_uvector<vertex_t>& d_vertices_v /* [INOUT] */,
                          vertex_t vertex_id_offset);
 
-template <typename vertex_t, typename weight_t>
-void remove_self_loops(raft::handle_t const& handle,
-                       rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-                       rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-                       std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */);
-
-template <typename vertex_t, typename weight_t>
-void sort_and_remove_multi_edges(
-  raft::handle_t const& handle,
-  rmm::device_uvector<vertex_t>& d_src_v /* [INOUT] */,
-  rmm::device_uvector<vertex_t>& d_dst_v /* [INOUT] */,
-  std::optional<rmm::device_uvector<weight_t>>& d_weight_v /* [INOUT] */);
-
 } // namespace test
 } // namespace cugraph
diff --git a/dependencies.yaml b/dependencies.yaml
index 2c0918ad117..e9badf2be9f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -9,7 +9,8 @@ files:
       - checks
       - common_build
       - cpp_build
-      - cudatoolkit
+      - cuda
+      - cuda_version
       - docs
      - python_build_wheel
      - python_build_cythonize
@@ -37,19 +38,19 @@ files:
   docs:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - docs
       - py_version
       - depends_on_pylibcugraphops
   test_cpp:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - test_cpp
   test_notebooks:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - py_version
       - test_notebook
       - test_python_common
@@ -57,7 +58,7 @@ files:
   test_python:
     output: none
     includes:
-      - cudatoolkit
+      - cuda_version
       - depends_on_cudf
       - py_version
       - test_python_common
@@ -165,6 +166,15 @@ files:
       table: project
     includes:
       - python_run_cugraph_dgl
+      - depends_on_pylibcugraphops
+  py_test_cugraph_dgl:
+    output: pyproject
+    pyproject_dir: python/cugraph-dgl
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
   py_build_cugraph_pyg:
     output: pyproject
     pyproject_dir: python/cugraph-pyg
@@ -179,6 +189,37 @@ files:
       table: project
     includes:
       - python_run_cugraph_pyg
+      - depends_on_pylibcugraphops
+  py_test_cugraph_pyg:
+    output: pyproject
+    pyproject_dir: python/cugraph-pyg
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
+  py_build_cugraph_equivariant:
+    output: pyproject
+    pyproject_dir: python/cugraph-equivariant
+    extras:
+      table: build-system
+    includes:
+      - python_build_wheel
+  py_run_cugraph_equivariant:
+    output: pyproject
+    pyproject_dir: python/cugraph-equivariant
+    extras:
+      table: project
+    includes:
+      - depends_on_pylibcugraphops
+  py_test_cugraph_equivariant:
+    output: pyproject
+    pyproject_dir: python/cugraph-equivariant
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
   py_build_cugraph_service_client:
     output: pyproject
     pyproject_dir: 
python/cugraph-service/client @@ -255,33 +296,40 @@ dependencies: - output_types: [conda, requirements] packages: - pre-commit - cudatoolkit: + cuda_version: specific: - - output_types: [conda] + - output_types: conda matrices: - matrix: - cuda: "12.0" + cuda: "11.2" packages: - - cuda-version=12.0 + - cuda-version=11.2 - matrix: - cuda: "11.8" + cuda: "11.4" packages: - - cuda-version=11.8 - - cudatoolkit + - cuda-version=11.4 - matrix: cuda: "11.5" packages: - cuda-version=11.5 - - cudatoolkit - matrix: - cuda: "11.4" + cuda: "11.8" packages: - - cuda-version=11.4 - - cudatoolkit + - cuda-version=11.8 - matrix: - cuda: "11.2" + cuda: "12.0" + packages: + - cuda-version=12.0 + cuda: + specific: + - output_types: [conda] + matrices: + - matrix: + cuda: "12.*" + packages: + - matrix: + cuda: "11.*" packages: - - cuda-version=11.2 - cudatoolkit common_build: common: @@ -297,10 +345,10 @@ dependencies: - cxx-compiler - gmock>=1.13.0 - gtest>=1.13.0 - - libcugraphops==23.12.* - - libraft-headers==23.12.* - - libraft==23.12.* - - librmm==23.12.* + - libcugraphops==24.2.* + - libraft-headers==24.2.* + - libraft==24.2.* + - librmm==24.2.* - openmpi # Required for building cpp-mgtests (multi-GPU tests) specific: - output_types: [conda] @@ -326,9 +374,8 @@ dependencies: packages: - nvcc_linux-aarch64=11.8 - matrix: - cuda: "12.0" + cuda: "12.*" packages: - - cuda-version=12.0 - cuda-nvcc docs: common: @@ -372,21 +419,26 @@ dependencies: - output_types: [conda, pyproject, requirements] packages: - cython>=3.0.0 - - scikit-build>=0.13.1 + - output_types: conda + packages: + - scikit-build-core>=0.7.0 + - output_types: [pyproject, requirements] + packages: + - scikit-build-core[pyproject]>=0.7.0 python_run_cugraph: common: - output_types: [conda, pyproject] packages: - - &dask rapids-dask-dependency==23.12.* - - &dask_cuda dask-cuda==23.12.* + - &dask rapids-dask-dependency==24.2.* + - &dask_cuda dask-cuda==24.2.* - &numba numba>=0.57 - - &numpy numpy>=1.21 - - &ucx_py ucx-py==0.35.* + - &numpy numpy>=1.21 + - &ucx_py ucx-py==0.36.* - output_types: conda packages: - aiohttp - fsspec>=0.6.0 - - libcudf==23.12.* + - libcudf==24.2.* - requests - nccl>=2.9.9 - ucx-proc=*=gpu @@ -409,7 +461,7 @@ dependencies: - *numpy - output_types: [pyproject] packages: - - &cugraph cugraph==23.12.* + - &cugraph cugraph==24.2.* python_run_cugraph_pyg: common: - output_types: [conda, pyproject] @@ -437,7 +489,7 @@ dependencies: - output_types: pyproject packages: - *cugraph - - cugraph-service-client==23.12.* + - cugraph-service-client==24.2.* test_cpp: common: - output_types: conda @@ -472,7 +524,7 @@ dependencies: - scikit-learn>=0.23.1 - output_types: [conda] packages: - - pylibwholegraph==23.12.* + - pylibwholegraph==24.2.* test_python_pylibcugraph: common: - output_types: [conda, pyproject] @@ -489,7 +541,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==23.12.* + - cugraph==24.2.* - pytorch>=2.0 - pytorch-cuda==11.8 - dgl>=1.1.0.cu* @@ -497,7 +549,7 @@ dependencies: common: - output_types: [conda] packages: - - cugraph==23.12.* + - cugraph==24.2.* - pytorch>=2.0 - pytorch-cuda==11.8 - pyg>=2.4.0 @@ -506,22 +558,23 @@ dependencies: common: - output_types: conda packages: - - &rmm_conda rmm==23.12.* + - &rmm_conda rmm==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: 
[requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &rmm_packages_pip_cu12 - - rmm-cu12==23.12.* + - rmm-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *rmm_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *rmm_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &rmm_packages_pip_cu11 - - rmm-cu11==23.12.* + - rmm-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *rmm_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *rmm_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *rmm_packages_pip_cu11} @@ -531,22 +584,23 @@ dependencies: common: - output_types: conda packages: - - &cudf_conda cudf==23.12.* + - &cudf_conda cudf==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &cudf_packages_pip_cu12 - - cudf-cu12==23.12.* + - cudf-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *cudf_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *cudf_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &cudf_packages_pip_cu11 - - cudf-cu11==23.12.* + - cudf-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *cudf_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *cudf_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *cudf_packages_pip_cu11} @@ -556,22 +610,23 @@ dependencies: common: - output_types: conda packages: - - &dask_cudf_conda dask-cudf==23.12.* + - &dask_cudf_conda dask-cudf==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &dask_cudf_packages_pip_cu12 - - dask-cudf-cu12==23.12.* + - dask-cudf-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *dask_cudf_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *dask_cudf_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &dask_cudf_packages_pip_cu11 - - dask-cudf-cu11==23.12.* + - dask-cudf-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *dask_cudf_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *dask_cudf_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *dask_cudf_packages_pip_cu11} @@ -581,22 +636,23 @@ dependencies: common: - output_types: conda packages: - - &pylibraft_conda pylibraft==23.12.* + - &pylibraft_conda pylibraft==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &pylibraft_packages_pip_cu12 - - pylibraft-cu12==23.12.* + - pylibraft-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *pylibraft_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibraft_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibraft_packages_pip_cu11 - - pylibraft-cu11==23.12.* + - pylibraft-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *pylibraft_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibraft_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibraft_packages_pip_cu11} @@ 
-606,22 +662,23 @@ dependencies: common: - output_types: conda packages: - - &raft_dask_conda raft-dask==23.12.* + - &raft_dask_conda raft-dask==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &raft_dask_packages_pip_cu12 - - raft-dask-cu12==23.12.* + - raft-dask-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *raft_dask_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *raft_dask_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &raft_dask_packages_pip_cu11 - - raft-dask-cu11==23.12.* + - raft-dask-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *raft_dask_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *raft_dask_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *raft_dask_packages_pip_cu11} @@ -631,22 +688,23 @@ dependencies: common: - output_types: conda packages: - - &pylibcugraph_conda pylibcugraph==23.12.* + - &pylibcugraph_conda pylibcugraph==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &pylibcugraph_packages_pip_cu12 - - pylibcugraph-cu12==23.12.* + - pylibcugraph-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *pylibcugraph_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibcugraph_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibcugraph_packages_pip_cu11 - - pylibcugraph-cu11==23.12.* + - pylibcugraph-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *pylibcugraph_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibcugraph_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibcugraph_packages_pip_cu11} @@ -656,22 +714,23 @@ dependencies: common: - output_types: conda packages: - - &pylibcugraphops_conda pylibcugraphops==23.12.* + - &pylibcugraphops_conda pylibcugraphops==24.2.* - output_types: requirements packages: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.nvidia.com + - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple specific: - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.2"} packages: &pylibcugraphops_packages_pip_cu12 - - pylibcugraphops-cu12==23.12.* + - pylibcugraphops-cu12==24.2.* - {matrix: {cuda: "12.1"}, packages: *pylibcugraphops_packages_pip_cu12} - {matrix: {cuda: "12.0"}, packages: *pylibcugraphops_packages_pip_cu12} - matrix: {cuda: "11.8"} packages: &pylibcugraphops_packages_pip_cu11 - - pylibcugraphops-cu11==23.12.* + - pylibcugraphops-cu11==24.2.* - {matrix: {cuda: "11.5"}, packages: *pylibcugraphops_packages_pip_cu11} - {matrix: {cuda: "11.4"}, packages: *pylibcugraphops_packages_pip_cu11} - {matrix: {cuda: "11.2"}, packages: *pylibcugraphops_packages_pip_cu11} diff --git a/docs/cugraph/source/api_docs/cugraph_c/community.rst b/docs/cugraph/source/api_docs/cugraph_c/community.rst index 0bbfe365c4d..d55325720c4 100644 --- a/docs/cugraph/source/api_docs/cugraph_c/community.rst +++ b/docs/cugraph/source/api_docs/cugraph_c/community.rst @@ -1,12 +1,6 @@ Community ========= -.. 
role:: py(code)
-   :language: c
-   :class: highlight
-
-``#include ``
-
 Triangle Counting
 -----------------
 .. doxygenfunction:: cugraph_triangle_count
    :project: libcugraph
@@ -45,8 +39,8 @@ Spectral Clustering - Modularity Maximization
 .. doxygenfunction:: cugraph_analyze_clustering_modularity
    :project: libcugraph
 
-Spectral Clusteriong - Edge Cut
--------------------------------
+Spectral Clustering - Edge Cut
+------------------------------
 .. doxygenfunction:: cugraph_analyze_clustering_edge_cut
    :project: libcugraph
diff --git a/docs/cugraph/source/api_docs/cugraph_c/labeling.rst b/docs/cugraph/source/api_docs/cugraph_c/labeling.rst
index af105ee8fc9..4ca598c0a06 100644
--- a/docs/cugraph/source/api_docs/cugraph_c/labeling.rst
+++ b/docs/cugraph/source/api_docs/cugraph_c/labeling.rst
@@ -12,8 +12,8 @@ Strongly Connected Components
 .. doxygenfunction:: cugraph_strongly_connected_components
    :project: libcugraph
 
-Support
--------
+Labeling Support Functions
+--------------------------
 .. doxygengroup:: labeling
    :project: libcugraph
    :members:
diff --git a/docs/cugraph/source/api_docs/cugraph_c/sampling.rst b/docs/cugraph/source/api_docs/cugraph_c/sampling.rst
index 21b837daf93..3d5af713c33 100644
--- a/docs/cugraph/source/api_docs/cugraph_c/sampling.rst
+++ b/docs/cugraph/source/api_docs/cugraph_c/sampling.rst
@@ -7,7 +7,7 @@ Uniform Random Walks
    :project: libcugraph
 
 Biased Random Walks
---------------------
+-------------------
 .. doxygenfunction:: cugraph_biased_random_walks
    :project: libcugraph
@@ -21,16 +21,13 @@ Node2Vec
 .. doxygenfunction:: cugraph_node2vec
    :project: libcugraph
 
-Uniform Neighborhood Sampling
------------------------------
-.. doxygenfunction:: cugraph_uniform_neighbor_sample_with_edge_properties
-   :project: libcugraph
-
+Uniform Neighbor Sampling
+-------------------------
 .. doxygenfunction:: cugraph_uniform_neighbor_sample
    :project: libcugraph
 
-Support
--------
+Sampling Support Functions
+--------------------------
 .. doxygengroup:: samplingC
    :project: libcugraph
    :members:
diff --git a/docs/cugraph/source/api_docs/cugraph_c/similarity.rst b/docs/cugraph/source/api_docs/cugraph_c/similarity.rst
index fba07ad206c..200ba695781 100644
--- a/docs/cugraph/source/api_docs/cugraph_c/similarity.rst
+++ b/docs/cugraph/source/api_docs/cugraph_c/similarity.rst
@@ -17,8 +17,8 @@ Overlap
 .. doxygenfunction:: cugraph_overlap_coefficients
    :project: libcugraph
 
-Support
--------
+Similarity Support Functions
+----------------------------
 .. doxygengroup:: similarity
    :project: libcugraph
    :members:
diff --git a/docs/cugraph/source/api_docs/cugraph_c/traversal.rst b/docs/cugraph/source/api_docs/cugraph_c/traversal.rst
index c90760e9e79..1578951e05f 100644
--- a/docs/cugraph/source/api_docs/cugraph_c/traversal.rst
+++ b/docs/cugraph/source/api_docs/cugraph_c/traversal.rst
@@ -22,8 +22,8 @@ Extract Max Path Length
 .. doxygenfunction:: cugraph_extract_paths_result_get_max_path_length
    :project: libcugraph
 
-Support
--------
+Traversal Support Functions
+---------------------------
 .. doxygengroup:: traversal
    :project: libcugraph
    :members:
diff --git a/docs/cugraph/source/conf.py b/docs/cugraph/source/conf.py
index 3f7ef7deb03..cef06a584fc 100644
--- a/docs/cugraph/source/conf.py
+++ b/docs/cugraph/source/conf.py
@@ -77,9 +77,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '23.12'
+version = '24.02'
 # The full version, including alpha/beta/rc tags.
-release = '23.12.00'
+release = '24.02.00'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/cugraph/source/installation/source_build.md b/docs/cugraph/source/installation/source_build.md index f5ee0741da6..1a129d45295 100644 --- a/docs/cugraph/source/installation/source_build.md +++ b/docs/cugraph/source/installation/source_build.md @@ -1,53 +1,46 @@ # Building from Source -The following instructions are for users wishing to build cuGraph from source code. These instructions are tested on supported distributions of Linux, CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) for list of supported environments. Other operating systems _might be_ compatible, but are not currently tested. - -The cuGraph package include both a C/C++ CUDA portion and a python portion. Both libraries need to be installed in order for cuGraph to operate correctly. +These instructions are tested on supported versions/distributions of Linux, +CUDA, and Python - See [RAPIDS Getting Started](https://rapids.ai/start.html) +for the list of supported environments. Other environments _might be_ +compatible, but are not currently tested. ## Prerequisites -__Compiler:__ +__Compilers:__ * `gcc` version 9.3+ -* `nvcc` version 11.0+ -* `cmake` version 3.20.1+ +* `nvcc` version 11.5+ __CUDA:__ -* CUDA 11.0+ +* CUDA 11.2+ * NVIDIA driver 450.80.02+ * Pascal architecture or better -You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). - -__Packages:__ -* `cmake` version 3.20.1+ -* `libcugraphops` (version matching source branch version, eg. `23.10`) - -You can obtain `libcugraphops` using `conda`/`mamba` from the `nvidia` channel, or using `pip` with the `--extra-index-url=https://pypi.nvidia.com` option. See the [RAPIDS docs](https://docs.rapids.ai/install#environment) for more details. - -## Building cuGraph -To install cuGraph from source, ensure the dependencies are met. +Further details and download links for these prerequisites are available on the +[RAPIDS System Requirements page](https://docs.rapids.ai/install#system-req). +## Setting up the development environment -### Clone Repo and Configure Conda Environment -__GIT clone a version of the repository__ - - ```bash - # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME - export CUGRAPH_HOME=$(pwd)/cugraph - - # Download the cuGraph repo - if you have a folked version, use that path here instead - git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME +### Clone the repository: +```bash +CUGRAPH_HOME=$(pwd)/cugraph +git clone https://github.com/rapidsai/cugraph.git $CUGRAPH_HOME +cd $CUGRAPH_HOME +``` - cd $CUGRAPH_HOME - ``` +### Create the conda environment -__Create the conda development environment__ +Using conda is the easiest way to install both the build and runtime +dependencies for cugraph. While it is possible to build and run cugraph without +conda, the required packages occasionally change, making it difficult to +document here. The best way to see the current dependencies needed for a build +and run environment is to examine the list of packages in the [conda +environment YAML +files](https://github.com/rapidsai/cugraph/blob/main/conda/environments). 
```bash
-# create the conda environment (assuming in base `cugraph` directory)
 # for CUDA 11.x
-conda env create --name cugraph_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
+conda env create --name cugraph_dev --file $CUGRAPH_HOME/conda/environments/all_cuda-118_arch-x86_64.yaml
 
 # activate the environment
 conda activate cugraph_dev
@@ -56,101 +49,53 @@ conda activate cugraph_dev
 conda deactivate
 ```
 
-
-
-The environment can be updated as development includes/changes the dependencies. To do so, run:
-
+The environment can be updated as cugraph adds/removes/updates its dependencies. To do so, run:
 ```bash
-
-# Where XXX is the CUDA 11 version
-conda env update --name cugraph_dev --file conda/environments/cugraph_dev_cuda11.XXX.yml
-
+# for CUDA 11.x
+conda env update --name cugraph_dev --file $CUGRAPH_HOME/conda/environments/all_cuda-118_arch-x86_64.yaml
 conda activate cugraph_dev
 ```
 
+### Build and Install
 
-### Build and Install Using the `build.sh` Script
-Using the `build.sh` script make compiling and installing cuGraph a breeze. To build and install, simply do:
+#### Build and install using `build.sh`
+Using the `build.sh` script, located in the `$CUGRAPH_HOME` directory, is the
+recommended way to build and install the cugraph libraries. By default,
+`build.sh` will build and install a predefined set of targets
+(packages/libraries), but can also accept a list of targets to build.
 
-```bash
-$ cd $CUGRAPH_HOME
-$ ./build.sh clean
-$ ./build.sh libcugraph
-$ ./build.sh cugraph
-```
+For example, to build only the cugraph C++ library (`libcugraph`) and the
+high-level python library (`cugraph`) without building the C++ test binaries,
+run this command:
 
-There are several other options available on the build script for advanced users.
-`build.sh` options:
 ```bash
-build.sh [<target> ...] [<flag> ...]
- where <target> is:
-   clean                      - remove all existing build artifacts and configuration (start over)
-   uninstall                  - uninstall libcugraph and cugraph from a prior build/install (see also -n)
-   libcugraph                 - build libcugraph.so and SG test binaries
-   libcugraph_etl             - build libcugraph_etl.so and SG test binaries
-   pylibcugraph               - build the pylibcugraph Python package
-   cugraph                    - build the cugraph Python package
-   nx-cugraph                 - build the nx-cugraph Python package
-   cugraph-service            - build the cugraph-service_client and cugraph-service_server Python package
-   cpp-mgtests                - build libcugraph and libcugraph_etl MG tests. Builds MPI communicator, adding MPI as a dependency.
-   cugraph-dgl                - build the cugraph-dgl extensions for DGL
-   cugraph-pyg                - build the cugraph-dgl extensions for PyG
-   docs                       - build the docs
- and <flag> is:
-   -v                         - verbose build mode
-   -g                         - build for debug
-   -n                         - do not install after a successful build
-   --pydevelop                - use setup.py develop instead of install
-   --allgpuarch               - build for all supported GPU architectures
-   --skip_cpp_tests           - do not build the SG test binaries as part of the libcugraph and libcugraph_etl targets
-   --without_cugraphops       - do not build algos that require cugraph-ops
-   --cmake_default_generator  - use the default cmake generator instead of ninja
-   --clean                    - clean an individual target (note: to do a complete rebuild, use the clean target described above)
-   -h                         - print this text
-
- default action (no args) is to build and install 'libcugraph' then 'libcugraph_etl' then 'pylibcugraph' then 'cugraph' then 'cugraph-service' targets
-
-examples:
-$ ./build.sh clean                    # remove prior build artifacts (start over)
-$ ./build.sh libcugraph -v            # compile and install libcugraph with verbose output
-$ ./build.sh libcugraph -g            # compile and install libcugraph for debug
-$ ./build.sh libcugraph -n            # compile libcugraph but do not install
-
-# make parallelism options can also be defined: Example build jobs using 4 threads (make -j4)
-$ PARALLEL_LEVEL=4 ./build.sh libcugraph
-
-Note that the libraries will be installed to the location set in `$PREFIX` if set (i.e. `export PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
+$ cd $CUGRAPH_HOME
+$ ./build.sh libcugraph pylibcugraph cugraph --skip_cpp_tests
 ```
 
+There are several other options available on the build script for advanced
+users. Refer to the output of `--help` for details.
 
-## Building each section independently
-
-#### Build and Install the C++/CUDA `libcugraph` Library
-CMake depends on the `nvcc` executable being on your path or defined in `$CUDACXX`.
-
-This project uses cmake for building the C/C++ library. To configure cmake, run:
-
-  ```bash
-  # Set the localtion to cuGraph in an environment variable CUGRAPH_HOME
-  export CUGRAPH_HOME=$(pwd)/cugraph
-
-  cd $CUGRAPH_HOME
-  cd cpp                                        # enter cpp directory
-  mkdir build                                   # create build directory
-  cd build                                      # enter the build directory
-  cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
-
-  # now build the code
-  make -j                                       # "-j" starts multiple threads
-  make install                                  # install the libraries
-  ```
-The default installation locations are `$CMAKE_INSTALL_PREFIX/lib` and `$CMAKE_INSTALL_PREFIX/include/cugraph` respectively.
+Note that libraries will be installed to the location set in `$PREFIX` if set
+(i.e. `export PREFIX=/install/path`), otherwise to `$CONDA_PREFIX`.
 
 #### Updating the RAFT branch
 
-`libcugraph` uses the [RAFT](https://github.com/rapidsai/raft) library and there are times when it might be desirable to build against a different RAFT branch, such as when working on new features that might span both RAFT and cuGraph.
+`libcugraph` uses the [RAFT](https://github.com/rapidsai/raft) library and
+there are times when it might be desirable to build against a different RAFT
+branch, such as when working on new features that might span both RAFT and
+cuGraph.
 
-For local development, the `CPM_raft_SOURCE=<path/to/raft/source>` option can be passed to the `cmake` command to enable `libcugraph` to use the local RAFT branch.
+For local development, the `CPM_raft_SOURCE=<path/to/raft/source>` option can
+be passed to the `cmake` command to enable `libcugraph` to use the local RAFT
+branch. The `build.sh` script calls `cmake` to build the C/C++ targets, but
+developers can call `cmake` directly in order to pass it options like those
+described here. Refer to the `build.sh` script to see how to call `cmake` and
+other commands directly.
 
-To have CI test a `cugraph` pull request against a different RAFT branch, modify the bottom of the `cpp/cmake/thirdparty/get_raft.cmake` file as follows:
+To have CI test a `cugraph` pull request against a different RAFT branch,
+modify the bottom of the `cpp/cmake/thirdparty/get_raft.cmake` file as follows:
 
 ```cmake
 # Change pinned tag and fork here to test a commit in CI
@@ -167,24 +112,10 @@ find_and_configure_raft(VERSION ${CUGRAPH_MIN_VERSION_raft}
 )
 ```
 
-When the above change is pushed to a pull request, the continuous integration servers will use the specified RAFT branch to run the cuGraph tests. After the changes in the RAFT branch are merged to the release branch, remember to revert the `get_raft.cmake` file back to the original cuGraph branch.
-
-### Building and installing the Python package
-
-2) Install the Python packages to your Python path:
-
-```bash
-cd $CUGRAPH_HOME
-cd python
-cd pylibcugraph
-python setup.py build_ext --inplace
-python setup.py install    # install pylibcugraph
-cd ../cugraph
-python setup.py build_ext --inplace
-python setup.py install    # install cugraph python bindings
-
-```
-
+When the above change is pushed to a pull request, the continuous integration
+servers will use the specified RAFT branch to run the cuGraph tests. After the
+changes in the RAFT branch are merged to the release branch, remember to revert
+the `get_raft.cmake` file back to the original cuGraph branch.
 
 ## Run tests
 
@@ -240,7 +171,10 @@ Note: This conda installation only applies to Linux and Python versions 3.8/3.10
 
 ### (OPTIONAL) Set environment variable on activation
 
-It is possible to configure the conda environment to set environmental variables on activation. Providing instructions to set PATH to include the CUDA toolkit bin directory and LD_LIBRARY_PATH to include the CUDA lib64 directory will be helpful.
+It is possible to configure the conda environment to set environment variables
+on activation. Setting PATH to include the CUDA toolkit bin directory and
+LD_LIBRARY_PATH to include the CUDA lib64 directory is helpful.
 
 ```bash
 cd ~/anaconda3/envs/cugraph_dev
@@ -271,7 +205,8 @@ unset LD_LIBRARY_PATH
 
 ## Creating documentation
 
-Python API documentation can be generated from _./docs/cugraph directory_. Or through using "./build.sh docs"
+Python API documentation can be generated from the _./docs/cugraph_ directory,
+or by running `./build.sh docs`.
 
 ## Attribution
 Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
diff --git a/docs/cugraph/source/wholegraph/installation/source_build.md b/docs/cugraph/source/wholegraph/installation/source_build.md
index c468048c351..a7727ac4052 100644
--- a/docs/cugraph/source/wholegraph/installation/source_build.md
+++ b/docs/cugraph/source/wholegraph/installation/source_build.md
@@ -27,7 +27,7 @@ __Other Packages__:
 * cython
 * setuputils3
 * scikit-learn
-* scikit-build
+* scikit-build-core
 * nanobind>=0.2.0
 
 ## Building wholegraph
diff --git a/fetch_rapids.cmake b/fetch_rapids.cmake
index 2c1dd855cb5..1f099e8f85f 100644
--- a/fetch_rapids.cmake
+++ b/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
# ============================================================================= if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake) - file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-23.12/RAPIDS.cmake + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake ) endif() diff --git a/mg_utils/wait_for_workers.py b/mg_utils/wait_for_workers.py new file mode 100644 index 00000000000..fa75c90d4ad --- /dev/null +++ b/mg_utils/wait_for_workers.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import yaml + +from dask.distributed import Client + + +def initialize_dask_cuda(communication_type): + communication_type = communication_type.lower() + if "ucx" in communication_type: + os.environ["UCX_MAX_RNDV_RAILS"] = "1" + + if communication_type == "ucx-ib": + os.environ["UCX_MEMTYPE_REG_WHOLE_ALLOC_TYPES"]="cuda" + os.environ["DASK_RMM__POOL_SIZE"]="0.5GB" + os.environ["DASK_DISTRIBUTED__COMM__UCX__CREATE_CUDA_CONTEXT"]="True" + + +def wait_for_workers( + num_expected_workers, scheduler_file_path, communication_type, timeout_after=0 +): + """ + Waits until num_expected_workers workers are available based on + the workers managed by scheduler_file_path, then returns 0. If + timeout_after is specified, will return 1 if num_expected_workers + workers are not available before the timeout. + """ + # FIXME: use scheduler file path from global environment if none + # supplied in configuration yaml + + print("wait_for_workers.py - initializing client...", end="") + sys.stdout.flush() + initialize_dask_cuda(communication_type) + print("done.") + sys.stdout.flush() + + ready = False + start_time = time.time() + while not ready: + if timeout_after and ((time.time() - start_time) >= timeout_after): + print( + f"wait_for_workers.py timed out after {timeout_after} seconds before finding {num_expected_workers} workers." + ) + sys.stdout.flush() + break + with Client(scheduler_file=scheduler_file_path) as client: + num_workers = len(client.scheduler_info()["workers"]) + if num_workers < num_expected_workers: + print( + f"wait_for_workers.py expected {num_expected_workers} but got {num_workers}, waiting..." + ) + sys.stdout.flush() + time.sleep(5) + else: + print(f"wait_for_workers.py got {num_workers} workers, done.") + sys.stdout.flush() + ready = True + + if ready is False: + return 1 + return 0 + + +if __name__ == "__main__": + import argparse + + ap = argparse.ArgumentParser() + ap.add_argument( + "--num-expected-workers", + type=int, + required=False, + help="Number of workers to wait for. 
If not specified, "
+        "uses the NUM_WORKERS env var if set, otherwise defaults "
+        "to 16.",
+    )
+    ap.add_argument(
+        "--scheduler-file-path",
+        type=str,
+        required=True,
+        help="Path to shared scheduler file to read.",
+    )
+    ap.add_argument(
+        "--communication-type",
+        type=str,
+        default="tcp",
+        required=False,
+        help="Initialize dask_cuda based on the cluster communication type. "
+        "Supported values are tcp (default), ucx, and ucx-ib.",
+    )
+    ap.add_argument(
+        "--timeout-after",
+        type=int,
+        default=0,
+        required=False,
+        help="Number of seconds to wait for workers. "
+        "Default is 0 which means wait forever.",
+    )
+    args = ap.parse_args()
+
+    if args.num_expected_workers is None:
+        args.num_expected_workers = int(os.environ.get("NUM_WORKERS", 16))
+
+    exitcode = wait_for_workers(
+        num_expected_workers=args.num_expected_workers,
+        scheduler_file_path=args.scheduler_file_path,
+        communication_type=args.communication_type,
+        timeout_after=args.timeout_after,
+    )
+
+    sys.exit(exitcode)
diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
index b73ccb0cf9a..c6df338ab9a 100644
--- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
+++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
@@ -10,11 +10,11 @@ channels:
 - conda-forge
 - nvidia
 dependencies:
-- cugraph==23.12.*
+- cugraph==24.2.*
 - dgl>=1.1.0.cu*
 - pandas
 - pre-commit
-- pylibcugraphops==23.12.*
+- pylibcugraphops==24.2.*
 - pytest
 - pytest-benchmark
 - pytest-cov
diff --git a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
index f154b096256..11139910931 100644
--- a/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
+++ b/python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
@@ -17,7 +17,7 @@
 import cupy as cp
 import cudf
 from cugraph.utilities.utils import import_optional
-from cugraph.experimental import BulkSampler
+from cugraph.gnn import BulkSampler
 from dask.distributed import default_client, Event
 from cugraph_dgl.dataloading import (
     HomogenousBulkSamplerDataset,
diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
index 8843e61ad89..cc4ce474f2d 100644
--- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
+++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Optional, Tuple, Union
+from typing import Optional, Union
 
 from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph
 from cugraph.utilities.utils import import_optional
@@ -29,7 +29,7 @@ class GATConv(BaseConv):
 
     Parameters
     ----------
-    in_feats : int or tuple
+    in_feats : int or (int, int)
         Input feature size. A pair denotes feature sizes of source and
         destination nodes.
out_feats : int @@ -92,7 +92,7 @@ class GATConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -104,14 +104,19 @@ def __init__( bias: bool = True, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree if isinstance(in_feats, int): @@ -126,28 +131,34 @@ def __init__( if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.Tensor(3 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) else: self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.Tensor(2 * num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: - self.register_buffer("bias", None) + self.register_buffer("lin_res", None) - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + if bias and not isinstance(self.lin_res, nn.Linear): + if concat: + self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) + else: + self.bias = nn.Parameter(torch.empty(out_feats)) else: - self.register_buffer("lin_res", None) + self.register_buffer("bias", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -172,7 +183,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -182,8 +193,10 @@ def forward( ---------- graph : DGLGraph or SparseGraph The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. + nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) + Node features. If given as a tuple, the two elements correspond to + the source and destination node features, respectively, in a + bipartite graph. efeat: torch.Tensor, optional Optional edge features. max_in_degree : int @@ -237,18 +250,17 @@ def forward( if bipartite: if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_feats}." 
- ) - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) + nfeat_src = self.lin(nfeat[0]) + nfeat_dst = self.lin(nfeat[1]) + else: + nfeat_src = self.lin_src(nfeat[0]) + nfeat_dst = self.lin_dst(nfeat[1]) else: if not hasattr(self, "lin"): raise RuntimeError( f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer, but got {self.in_feats}." + f"integer when the graph is not bipartite, " + f"but got {self.in_feats}." ) nfeat = self.lin(nfeat) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py index 209a5fe1a8d..6c78b4df0b8 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -29,14 +29,11 @@ class GATv2Conv(BaseConv): Parameters ---------- - in_feats : int, or pair of ints - Input feature size; i.e, the number of dimensions of :math:`h_i^{(l)}`. - If the layer is to be applied to a unidirectional bipartite graph, `in_feats` - specifies the input feature size on both the source and destination nodes. - If a scalar is given, the source and destination node feature size - would take the same value. + in_feats : int or (int, int) + Input feature size. A pair denotes feature sizes of source and + destination nodes. out_feats : int - Output feature size; i.e, the number of dimensions of :math:`h_i^{(l+1)}`. + Output feature size. num_heads : int Number of heads in Multi-Head Attention. feat_drop : float, optional @@ -58,17 +55,15 @@ class GATv2Conv(BaseConv): input graph. By setting ``True``, it will suppress the check and let the users handle it by themselves. Defaults: ``False``. bias : bool, optional - If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) + If True, learns a bias term. Defaults: ``True``. share_weights : bool, optional - If set to :obj:`True`, the same matrix for :math:`W_{left}` and - :math:`W_{right}` in the above equations, will be applied to the source - and the target node of every edge. (default: :obj:`False`) + If ``True``, the same matrix will be applied to the source and the + destination node features. Defaults: ``False``. 
""" def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, num_heads: int, feat_drop: float = 0.0, @@ -81,16 +76,22 @@ def __init__( share_weights: bool = False, ): super().__init__() + + if isinstance(in_feats, int): + self.in_feats_src = self.in_feats_dst = in_feats + else: + self.in_feats_src, self.in_feats_dst = in_feats self.in_feats = in_feats self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) self.num_heads = num_heads self.feat_drop = nn.Dropout(feat_drop) self.concat = concat self.edge_feats = edge_feats self.negative_slope = negative_slope + self.residual = residual self.allow_zero_in_degree = allow_zero_in_degree self.share_weights = share_weights + self.bias = bias self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) if share_weights: @@ -106,30 +107,28 @@ def __init__( self.in_feats_dst, num_heads * out_feats, bias=bias ) - self.attn = nn.Parameter(torch.Tensor(num_heads * out_feats)) + self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) if edge_feats is not None: self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) else: self.register_parameter("lin_edge", None) - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(num_heads, out_feats)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_feats)) - else: - self.register_buffer("bias", None) - - self.residual = residual and self.in_feats_dst != out_feats * num_heads - if self.residual: - self.lin_res = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) + out_dim = num_heads * out_feats if concat else out_feats + if residual: + if self.in_feats_dst != out_dim: + self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) + else: + self.lin_res = nn.Identity() else: self.register_buffer("lin_res", None) self.reset_parameters() + def set_allow_zero_in_degree(self, set_value): + r"""Set allow_zero_in_degree flag.""" + self.allow_zero_in_degree = set_value + def reset_parameters(self): r"""Reinitialize learnable parameters.""" gain = nn.init.calculate_gain("relu") @@ -137,7 +136,7 @@ def reset_parameters(self): nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) nn.init.xavier_normal_( - self.attn.view(-1, self.num_heads, self.out_feats), gain=gain + self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain ) if self.lin_edge is not None: self.lin_edge.reset_parameters() @@ -145,13 +144,10 @@ def reset_parameters(self): if self.lin_res is not None: self.lin_res.reset_parameters() - if self.bias is not None: - nn.init.zeros_(self.bias) - def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, max_in_degree: Optional[int] = None, ) -> torch.Tensor: @@ -225,7 +221,7 @@ def forward( out = ops_torch.operators.mha_gat_v2_n2n( nfeat, - self.attn, + self.attn_weights, _graph, num_heads=self.num_heads, activation="LeakyReLU", @@ -243,7 +239,4 @@ def forward( res = res.mean(dim=1) out = out + res - if self.bias is not None: - out = out + self.bias - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py index 54916674210..5c4b5dea441 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ 
b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -100,16 +100,16 @@ def __init__( self.self_loop = self_loop if regularizer is None: self.W = nn.Parameter( - torch.Tensor(num_rels + dim_self_loop, in_feats, out_feats) + torch.empty(num_rels + dim_self_loop, in_feats, out_feats) ) self.coeff = None elif regularizer == "basis": if num_bases is None: raise ValueError('Missing "num_bases" for basis regularization.') self.W = nn.Parameter( - torch.Tensor(num_bases + dim_self_loop, in_feats, out_feats) + torch.empty(num_bases + dim_self_loop, in_feats, out_feats) ) - self.coeff = nn.Parameter(torch.Tensor(num_rels, num_bases)) + self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) self.num_bases = num_bases else: raise ValueError( @@ -119,7 +119,7 @@ def __init__( self.regularizer = regularizer if bias: - self.bias = nn.Parameter(torch.Tensor(out_feats)) + self.bias = nn.Parameter(torch.empty(out_feats)) else: self.register_parameter("bias", None) diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py index a3f946d7cb4..b6198903766 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -65,7 +65,7 @@ class SAGEConv(BaseConv): def __init__( self, - in_feats: Union[int, Tuple[int, int]], + in_feats: Union[int, tuple[int, int]], out_feats: int, aggregator_type: str = "mean", feat_drop: float = 0.0, @@ -111,7 +111,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], max_in_degree: Optional[int] = None, ) -> torch.Tensor: r"""Forward computation. diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py index 8481b9ee265..e77556fb76f 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Optional, Tuple, Union +from typing import Optional, Union from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph from cugraph.utilities.utils import import_optional @@ -51,7 +51,7 @@ class TransformerConv(BaseConv): def __init__( self, - in_node_feats: Union[int, Tuple[int, int]], + in_node_feats: Union[int, tuple[int, int]], out_node_feats: int, num_heads: int, concat: bool = True, @@ -116,7 +116,7 @@ def reset_parameters(self): def forward( self, g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], + nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], efeat: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward computation. diff --git a/python/cugraph-dgl/pyproject.toml b/python/cugraph-dgl/pyproject.toml index eff7a20f0aa..65ee414da44 100644 --- a/python/cugraph-dgl/pyproject.toml +++ b/python/cugraph-dgl/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. [build-system] @@ -18,15 +18,26 @@ authors = [ ] license = { text = "Apache 2.0" } requires-python = ">=3.9" -dependencies = [ - "cugraph==23.12.*", - "numba>=0.57", - "numpy>=1.21", -] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python", ] +dependencies = [ + "cugraph==24.2.*", + "numba>=0.57", + "numpy>=1.21", + "pylibcugraphops==24.2.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project.optional-dependencies] +test = [ + "pandas", + "pytest", + "pytest-benchmark", + "pytest-cov", + "pytest-xdist", + "scipy", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cugraph" diff --git a/python/cugraph-dgl/tests/conftest.py b/python/cugraph-dgl/tests/conftest.py index a3863ed81fa..ee1183f5cd1 100644 --- a/python/cugraph-dgl/tests/conftest.py +++ b/python/cugraph-dgl/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,6 +13,7 @@ import pytest +import dgl import torch from cugraph.testing.mg_utils import ( @@ -58,3 +59,10 @@ class SparseGraphData1: @pytest.fixture def sparse_graph_1(): return SparseGraphData1() + + +@pytest.fixture +def dgl_graph_1(): + src = torch.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) + dst = torch.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) + return dgl.graph((src, dst)) diff --git a/python/cugraph-dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/tests/nn/test_gatconv.py index ce145b2bc87..de27efc6329 100644 --- a/python/cugraph-dgl/tests/nn/test_gatconv.py +++ b/python/cugraph-dgl/tests/nn/test_gatconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATConv as CuGraphGATConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,37 +22,49 @@ ATOL = 1e-6 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatconv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATConv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,24 +76,24 @@ def test_gatconv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - conv1 = GATConv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATConv(*args, **kwargs).to(device) + conv2 = CuGraphGATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).cuda() dim = num_heads * out_feats with torch.no_grad(): - conv2.attn_weights.data[:dim] = conv1.attn_l.data.flatten() - conv2.attn_weights.data[dim:] = conv1.attn_r.data.flatten() - if bipartite: - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() + conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) + conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) + if mode == "bipartite": + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) else: - conv2.lin.weight.data = conv1.fc.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.lin.weight.copy_(conv1.fc.weight) + if residual and conv1.has_linear_res: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -90,12 +101,12 @@ def 
test_gatconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) - if bipartite: + if mode == "bipartite": assert torch.allclose( conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL ) @@ -105,25 +116,38 @@ def test_gatconv_equality( else: assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) + if residual and conv1.has_linear_res: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + assert torch.allclose( torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=ATOL, + atol=1e-5, # Note: using a loosened tolerance here due to numerical error ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatconv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -131,17 +155,17 @@ def test_gatconv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -154,8 +178,8 @@ def test_gatconv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/tests/nn/test_gatv2conv.py index 52003edacca..2d26b7fdc28 100644 --- a/python/cugraph-dgl/tests/nn/test_gatv2conv.py +++ b/python/cugraph-dgl/tests/nn/test_gatv2conv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,45 +15,56 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") -ATOL = 1e-6 +ATOL = 1e-5 -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("residual", [False, True]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_gatv2conv_equality( - bipartite, idtype_int, max_in_degree, num_heads, residual, to_block, sparse_format + dgl_graph_1, + mode, + idx_type, + max_in_degree, + num_heads, + residual, + to_block, + sparse_format, ): from dgl.nn.pytorch import GATv2Conv torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) size = (g.num_src_nodes(), g.num_dst_nodes()) - if bipartite: + if mode == "bipartite": in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.randn(size[0], in_feats[0]).to(device), + torch.randn(size[1], in_feats[1]).to(device), + ) + elif mode == "share_weights": + in_feats = 5 + nfeat = ( + torch.randn(size[0], in_feats).to(device), + torch.randn(size[1], in_feats).to(device), ) else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + in_feats = 7 + nfeat = torch.randn(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,19 +76,24 @@ def test_gatv2conv_equality( sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True} + kwargs = { + "bias": False, + "allow_zero_in_degree": True, + "residual": residual, + "share_weights": mode == "share_weights", + } - conv1 = GATv2Conv(*args, **kwargs).cuda() - out1 = conv1(g, nfeat) + conv1 = GATv2Conv(*args, **kwargs).to(device) + conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).cuda() with torch.no_grad(): - conv2.attn.data = conv1.attn.data.flatten() - conv2.lin_src.weight.data = conv1.fc_src.weight.data.detach().clone() - conv2.lin_dst.weight.data = conv1.fc_dst.weight.data.detach().clone() - if residual and conv2.residual: - conv2.lin_res.weight.data = conv1.fc_res.weight.data.detach().clone() + conv2.attn_weights.copy_(conv1.attn.flatten()) + conv2.lin_src.weight.copy_(conv1.fc_src.weight) + conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) + if residual: + conv2.lin_res.weight.copy_(conv1.res_fc.weight) + out1 = conv1(g, nfeat) if sparse_format is not None: out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) else: @@ -85,8 +101,8 @@ def test_gatv2conv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out1 = torch.rand_like(out1) - grad_out2 = grad_out1.clone().detach() + grad_out1 = torch.randn_like(out1) + grad_out2 = grad_out1.detach().clone() out1.backward(grad_out1) out2.backward(grad_out2) @@ -97,21 +113,38 @@ def 
test_gatv2conv_equality( conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL ) - assert torch.allclose(conv1.attn.grad, conv1.attn.grad, atol=ATOL) + if residual: + assert torch.allclose( + conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL + ) + + assert torch.allclose( + conv1.attn.grad, + conv2.attn_weights.grad.view(1, num_heads, out_feats), + atol=ATOL, + ) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8, 800]) +@pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_heads", [1, 2, 7]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) def test_gatv2conv_edge_feats( - bias, bipartite, concat, max_in_degree, num_heads, to_block, use_edge_feats + dgl_graph_1, + bias, + bipartite, + concat, + max_in_degree, + num_heads, + to_block, + use_edge_feats, ): torch.manual_seed(12345) - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device) if to_block: g = dgl.to_block(g) @@ -119,17 +152,17 @@ def test_gatv2conv_edge_feats( if bipartite: in_feats = (10, 3) nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).cuda(), - torch.rand(g.num_dst_nodes(), in_feats[1]).cuda(), + torch.rand(g.num_src_nodes(), in_feats[0]).to(device), + torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), ) else: in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).cuda() + nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) out_feats = 2 if use_edge_feats: edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).cuda() + efeat = torch.rand(g.num_edges(), edge_feats).to(device) else: edge_feats = None efeat = None @@ -142,8 +175,8 @@ def test_gatv2conv_edge_feats( edge_feats=edge_feats, bias=bias, allow_zero_in_degree=True, - ).cuda() + ).to(device) out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/tests/nn/test_relgraphconv.py index bdaa89e57f2..b5d3686c609 100644 --- a/python/cugraph-dgl/tests/nn/test_relgraphconv.py +++ b/python/cugraph-dgl/tests/nn/test_relgraphconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -23,7 +22,7 @@ ATOL = 1e-6 -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("num_bases", [1, 2, 5]) @pytest.mark.parametrize("regularizer", [None, "basis"]) @@ -31,7 +30,8 @@ @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_relgraphconv_equality( - idtype_int, + dgl_graph_1, + idx_type, max_in_degree, num_bases, regularizer, @@ -42,6 +42,12 @@ def test_relgraphconv_equality( from dgl.nn.pytorch import RelGraphConv torch.manual_seed(12345) + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) + + if to_block: + g = dgl.to_block(g) + in_feat, out_feat, num_rels = 10, 2, 3 args = (in_feat, out_feat, num_rels) kwargs = { @@ -50,16 +56,10 @@ def test_relgraphconv_equality( "bias": False, "self_loop": self_loop, } - g = create_graph1().to("cuda") - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).cuda() - - if idtype_int: - g = g.int() - if to_block: - g = dgl.to_block(g) + g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).cuda() + feat = torch.rand(g.num_src_nodes(), in_feat).to(device) if sparse_format == "coo": sg = SparseGraph( @@ -76,18 +76,18 @@ def test_relgraphconv_equality( size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" ) - conv1 = RelGraphConv(*args, **kwargs).cuda() - conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).cuda() + conv1 = RelGraphConv(*args, **kwargs).to(device) + conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) with torch.no_grad(): if self_loop: - conv2.W.data[:-1] = conv1.linear_r.W.data - conv2.W.data[-1] = conv1.loop_weight.data + conv2.W[:-1].copy_(conv1.linear_r.W) + conv2.W[-1].copy_(conv1.loop_weight) else: - conv2.W.data = conv1.linear_r.W.data.detach().clone() + conv2.W.copy_(conv1.linear_r.W) if regularizer is not None: - conv2.coeff.data = conv1.linear_r.coeff.data.detach().clone() + conv2.coeff.copy_(conv1.linear_r.coeff) out1 = conv1(g, feat, g.edata[dgl.ETYPE]) @@ -98,7 +98,7 @@ def test_relgraphconv_equality( assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) diff --git a/python/cugraph-dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/tests/nn/test_sageconv.py index b5d0a44b868..3f1c2b1b3fe 100644 --- a/python/cugraph-dgl/tests/nn/test_sageconv.py +++ b/python/cugraph-dgl/tests/nn/test_sageconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,21 +25,19 @@ @pytest.mark.parametrize("aggr", ["mean", "pool"]) @pytest.mark.parametrize("bias", [False, True]) @pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) @pytest.mark.parametrize("max_in_degree", [None, 8]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_sageconv_equality( - aggr, bias, bipartite, idtype_int, max_in_degree, to_block, sparse_format + dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format ): from dgl.nn.pytorch import SAGEConv torch.manual_seed(12345) - kwargs = {"aggregator_type": aggr, "bias": bias} - g = create_graph1().to("cuda") + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) - if idtype_int: - g = g.int() if to_block: g = dgl.to_block(g) @@ -49,12 +46,12 @@ def test_sageconv_equality( if bipartite: in_feats = (5, 3) feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).cuda(), - torch.rand(size[1], in_feats[1], requires_grad=True).cuda(), + torch.rand(size[0], in_feats[0], requires_grad=True).to(device), + torch.rand(size[1], in_feats[1], requires_grad=True).to(device), ) else: in_feats = 5 - feat = torch.rand(size[0], in_feats).cuda() + feat = torch.rand(size[0], in_feats).to(device) out_feats = 2 if sparse_format == "coo": @@ -65,18 +62,19 @@ def test_sageconv_equality( offsets, indices, _ = g.adj_tensors("csc") sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - conv1 = SAGEConv(in_feats, out_feats, **kwargs).cuda() - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).cuda() + kwargs = {"aggregator_type": aggr, "bias": bias} + conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) + conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) in_feats_src = conv2.in_feats_src with torch.no_grad(): - conv2.lin.weight.data[:, :in_feats_src] = conv1.fc_neigh.weight.data - conv2.lin.weight.data[:, in_feats_src:] = conv1.fc_self.weight.data + conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) + conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) if bias: - conv2.lin.bias.data[:] = conv1.fc_self.bias.data + conv2.lin.bias.copy_(conv1.fc_self.bias) if aggr == "pool": - conv2.pre_lin.weight.data[:] = conv1.fc_pool.weight.data - conv2.pre_lin.bias.data[:] = conv1.fc_pool.bias.data + conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) + conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) out1 = conv1(g, feat) if sparse_format is not None: @@ -85,7 +83,7 @@ def test_sageconv_equality( out2 = conv2(g, feat, max_in_degree=max_in_degree) assert torch.allclose(out1, out2, atol=ATOL) - grad_out = torch.rand_like(out1) + grad_out = torch.randn_like(out1) out1.backward(grad_out) out2.backward(grad_out) assert torch.allclose( diff --git a/python/cugraph-dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/tests/nn/test_transformerconv.py index 5ac4fd7bea7..28d13dedec8 100644 --- a/python/cugraph-dgl/tests/nn/test_transformerconv.py +++ b/python/cugraph-dgl/tests/nn/test_transformerconv.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,7 +15,6 @@ from cugraph_dgl.nn.conv.base import SparseGraph from cugraph_dgl.nn import TransformerConv -from .common import create_graph1 dgl = pytest.importorskip("dgl", reason="DGL not available") torch = pytest.importorskip("torch", reason="PyTorch not available") @@ -26,27 +25,25 @@ @pytest.mark.parametrize("beta", [False, True]) @pytest.mark.parametrize("bipartite_node_feats", [False, True]) @pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idtype_int", [False, True]) -@pytest.mark.parametrize("num_heads", [1, 2, 3, 4]) +@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) +@pytest.mark.parametrize("num_heads", [1, 3, 4]) @pytest.mark.parametrize("to_block", [False, True]) @pytest.mark.parametrize("use_edge_feats", [False, True]) @pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) def test_transformerconv( + dgl_graph_1, beta, bipartite_node_feats, concat, - idtype_int, + idx_type, num_heads, to_block, use_edge_feats, sparse_format, ): torch.manual_seed(12345) - device = "cuda" - g = create_graph1().to(device) - - if idtype_int: - g = g.int() + device = torch.device("cuda") + g = dgl_graph_1.to(device).astype(idx_type) if to_block: g = dgl.to_block(g) @@ -92,5 +89,5 @@ def test_transformerconv( else: out = conv(g, nfeat, efeat) - grad_out = torch.rand_like(out) + grad_out = torch.randn_like(out) out.backward(grad_out) diff --git a/python/cugraph-equivariant/LICENSE b/python/cugraph-equivariant/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cugraph-equivariant/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cugraph-equivariant/README.md b/python/cugraph-equivariant/README.md new file mode 100644 index 00000000000..d5de8852709 --- /dev/null +++ b/python/cugraph-equivariant/README.md @@ -0,0 +1,5 @@ +# cugraph-equivariant + +## Description + +The cugraph-equivariant library provides fast symmetry-preserving (equivariant) operations and convolutional layers to accelerate equivariant neural networks in drug discovery and other domains. diff --git a/python/cugraph-equivariant/cugraph_equivariant/VERSION b/python/cugraph-equivariant/cugraph_equivariant/VERSION new file mode 120000 index 00000000000..d62dc733efd --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/VERSION @@ -0,0 +1 @@ +../../../VERSION \ No newline at end of file diff --git a/python/cugraph-equivariant/cugraph_equivariant/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/__init__.py new file mode 100644 index 00000000000..20507bd9329 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
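Another recurring change across these test files is swapping `torch.rand`/`torch.rand_like` (uniform on [0, 1)) for `torch.randn`/`torch.randn_like` (standard normal), presumably so that inputs and upstream gradients are zero-mean and mixed-sign rather than strictly non-negative. A quick illustration of the difference:

```python
# Sketch: uniform vs. normal random tensors as used in the tests above.
import torch

torch.manual_seed(0)
u = torch.rand(1000)   # uniform on [0, 1): strictly non-negative
n = torch.randn(1000)  # standard normal: zero-mean, both signs occur

print(u.min().item() >= 0.0)    # True
print((n < 0).float().mean())   # roughly half of the entries are negative
```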
+ +from cugraph_equivariant._version import __git_commit__, __version__ diff --git a/python/cugraph-service/server/setup.py b/python/cugraph-equivariant/cugraph_equivariant/_version.py similarity index 51% rename from python/cugraph-service/server/setup.py rename to python/cugraph-equivariant/cugraph_equivariant/_version.py index 91864168e2c..31a707bb17e 100644 --- a/python/cugraph-service/server/setup.py +++ b/python/cugraph-equivariant/cugraph_equivariant/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,14 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -from setuptools import find_packages, setup +import importlib.resources -packages = find_packages(include=["cugraph_service_server*"]) -setup( - entry_points={ - "console_scripts": [ - "cugraph-service-server=cugraph_service_server.__main__:main" - ], - }, - package_data={key: ["VERSION"] for key in packages}, +# Read the VERSION file from the module; the file is symlinked to the +# repo-root VERSION file at build time, or copied into the module at +# installation. VERSION is kept as a separate file so that CI build-time +# scripts can update version info (including commit hashes) without +# modifying source files. +__version__ = ( + importlib.resources.files("cugraph_equivariant") + .joinpath("VERSION") + .read_text() + .strip() ) +__git_commit__ = "" diff --git a/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py b/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py new file mode 100644 index 00000000000..8f4d8de0042 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/nn/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence, Union + +import torch +from torch import nn +from e3nn import o3 +from e3nn.nn import BatchNorm + +from cugraph_equivariant.utils import scatter_reduce + +from pylibcugraphops.pytorch.operators import FusedFullyConnectedTensorProduct + + +class FullyConnectedTensorProductConv(nn.Module): + r"""Message passing layer for tensor products in DiffDock-like architectures. + The left operand of the tensor product is the spherical harmonic representation + of the edge vector; the right operand consists of node features in irreps. + + .. math:: + \sum_{b \in \mathcal{N}_a} Y\left(\hat{r}_{a b}\right) + \otimes_{\psi_{a b}} \mathbf{h}_b + + where the path weights :math:`\psi_{a b}` can be constructed from edge + embeddings and scalar features using an MLP: + + .. math:: + \psi_{a b} = \operatorname{MLP} + \left(e_{a b}, \mathbf{h}_a^0, \mathbf{h}_b^0\right) + + Users have the option to either input the weights directly or provide the + MLP parameters and scalar features from edges and nodes. + + Parameters + ---------- + in_irreps : e3nn.o3.Irreps + Irreps for the input node features. + + sh_irreps : e3nn.o3.Irreps + Irreps for the spherical harmonic representations of edge vectors. + + out_irreps : e3nn.o3.Irreps + Irreps for the output. + + batch_norm : bool, optional (default=True) + If true, batch normalization is applied. + + mlp_channels : sequence of ints, optional (default=None) + A sequence of integers defining the number of neurons in each layer of the + MLP before the output layer. If `None`, no MLP will be added. The input layer + contains edge embeddings and node scalar features. + + mlp_activation : nn.Module or sequence of nn.Module, optional (default=nn.GELU()) + A sequence of functions to be applied between the linear layers of the MLP, + e.g., `nn.Sequential(nn.ReLU(), nn.Dropout(0.4))`. + + e3nn_compat_mode: bool, optional (default=False) + cugraph-ops and e3nn use different memory layouts for Irreps-tensors. + The last (fastest moving) dimension is num_channels for cugraph-ops and + ir.dim for e3nn. When enabled, the input and output of this layer will + follow e3nn's memory layout. + + Examples + -------- + >>> # Case 1: MLP with the input layer having 6 channels and 2 hidden layers + >>> # having 16 channels. edge_emb.size(1) must match the size of + >>> # the input layer: 6 + >>> + >>> conv1 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda() + >>> out = conv1(src_features, edge_sh, edge_emb, graph) + >>> + >>> # Case 2: Same as case 1 but with the scalar features from edges, sources + >>> # and destinations passed in separately.
+ >>> + >>> conv2 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=[6, 16, 16], mlp_activation=nn.ReLU()).cuda() + >>> out = conv2(src_features, edge_sh, edge_scalars, graph, + >>> src_scalars=src_scalars, dst_scalars=dst_scalars) + >>> + >>> # Case 3: No MLP, edge_emb will be directly used as the tensor product weights + >>> + >>> conv3 = FullyConnectedTensorProductConv(in_irreps, sh_irreps, out_irreps, + >>> mlp_channels=None).cuda() + >>> out = conv3(src_features, edge_sh, edge_emb, graph) + + """ + + def __init__( + self, + in_irreps: o3.Irreps, + sh_irreps: o3.Irreps, + out_irreps: o3.Irreps, + batch_norm: bool = True, + mlp_channels: Optional[Sequence[int]] = None, + mlp_activation: Union[nn.Module, Sequence[nn.Module]] = nn.GELU(), + e3nn_compat_mode: bool = False, + ): + super().__init__() + self.in_irreps = in_irreps + self.out_irreps = out_irreps + self.sh_irreps = sh_irreps + self.e3nn_compat_mode = e3nn_compat_mode + + self.tp = FusedFullyConnectedTensorProduct( + in_irreps, sh_irreps, out_irreps, e3nn_compat_mode=e3nn_compat_mode + ) + + self.batch_norm = BatchNorm(out_irreps) if batch_norm else None + + if mlp_activation is None: + mlp_activation = [] + elif hasattr(mlp_activation, "__len__") and hasattr( + mlp_activation, "__getitem__" + ): + mlp_activation = list(mlp_activation) + else: + mlp_activation = [mlp_activation] + + if mlp_channels is not None: + dims = list(mlp_channels) + [self.tp.weight_numel] + mlp = [] + for i in range(len(dims) - 1): + mlp.append(nn.Linear(dims[i], dims[i + 1])) + if i != len(dims) - 2: + mlp.extend(mlp_activation) + self.mlp = nn.Sequential(*mlp) + else: + self.mlp = None + + def forward( + self, + src_features: torch.Tensor, + edge_sh: torch.Tensor, + edge_emb: torch.Tensor, + graph: tuple[torch.Tensor, tuple[int, int]], + src_scalars: Optional[torch.Tensor] = None, + dst_scalars: Optional[torch.Tensor] = None, + reduce: str = "mean", + edge_envelope: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass. + + Parameters + ---------- + src_features : torch.Tensor + Source node features. + Shape: (num_src_nodes, in_irreps.dim) + + edge_sh : torch.Tensor + The spherical harmonic representations of the edge vectors. + Shape: (num_edges, sh_irreps.dim) + + edge_emb: torch.Tensor + Edge embeddings that are fed into MLPs to generate tensor product weights. + Shape: (num_edges, dim), where `dim` should be: + - `tp.weight_numel` when the layer does not contain MLPs. + - num_edge_scalars, with the sum of num_[edge/src/dst]_scalars being + mlp_channels[0] + + graph : tuple + A tuple that stores the graph information, with the first element being + the adjacency matrix in COO, and the second element being its shape: + (num_src_nodes, num_dst_nodes). + + src_scalars: torch.Tensor, optional + Scalar features of source nodes. + Shape: (num_src_nodes, num_src_scalars) + + dst_scalars: torch.Tensor, optional + Scalar features of destination nodes. + Shape: (num_dst_nodes, num_dst_scalars) + + reduce : str, optional (default="mean") + Reduction operator. Choose between "mean" and "sum". + + edge_envelope: torch.Tensor, optional + Typically used as attenuation factors to fade out messages coming + from nodes close to the cutoff distance used to create the graph. + This is important for making the model smooth with respect to changes + in node coordinates. + Shape: (num_edges,) + + Returns + ------- + torch.Tensor + Output node features.
+ Shape: (num_dst_nodes, out_irreps.dim) + """ + edge_emb_size = edge_emb.size(-1) + src_scalars_size = 0 if src_scalars is None else src_scalars.size(-1) + dst_scalars_size = 0 if dst_scalars is None else dst_scalars.size(-1) + + if self.mlp is None: + if self.tp.weight_numel != edge_emb_size: + raise RuntimeError( + f"When MLP is not present, edge_emb's last dimension must " + f"equal tp.weight_numel (but got {edge_emb_size} and " + f"{self.tp.weight_numel})" + ) + else: + total_size = edge_emb_size + src_scalars_size + dst_scalars_size + if self.mlp[0].in_features != total_size: + raise RuntimeError( + f"The size of MLP's input layer ({self.mlp[0].in_features}) " + f"does not match the total number of scalar features from " + f"edge_emb, src_scalars and dst_scalars ({total_size})" + ) + + if reduce not in ["mean", "sum"]: + raise RuntimeError( + f"reduce argument must be either 'mean' or 'sum', got {reduce}." + ) + + (src, dst), (num_src_nodes, num_dst_nodes) = graph + + if self.mlp is not None: + if src_scalars is None and dst_scalars is None: + tp_weights = self.mlp(edge_emb) + else: + w_edge, w_src, w_dst = torch.split( + self.mlp[0].weight, + (edge_emb_size, src_scalars_size, dst_scalars_size), + dim=-1, + ) + tp_weights = edge_emb @ w_edge.T + self.mlp[0].bias + + if src_scalars is not None: + tp_weights += (src_scalars @ w_src.T)[src] + + if dst_scalars is not None: + tp_weights += (dst_scalars @ w_dst.T)[dst] + + tp_weights = self.mlp[1:](tp_weights) + else: + tp_weights = edge_emb + + out = self.tp(src_features[src], edge_sh, tp_weights) + + if edge_envelope is not None: + out = out * edge_envelope.view(-1, 1) + + out = scatter_reduce(out, dst, dim=0, dim_size=num_dst_nodes, reduce=reduce) + + if self.batch_norm: + out = self.batch_norm(out) + + return out diff --git a/ci/utils/is_pascal.py b/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py similarity index 50% rename from ci/utils/is_pascal.py rename to python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py index e716f59422f..c7c6bad07db 100644 --- a/ci/utils/is_pascal.py +++ b/python/cugraph-equivariant/cugraph_equivariant/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,27 +11,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re -import sys -import glob +import pytest +import torch -from numba import cuda -# FIXME: consolidate this code with ci/gpu/notebook_list.py +@pytest.fixture +def example_scatter_data(): + src_feat = torch.Tensor([3, 1, 0, 1, 1, 2]) + dst_indices = torch.Tensor([0, 1, 2, 2, 3, 1]) -# -# Not strictly true... 
however what we mean is -# Pascal or earlier -# -pascal = False - -device = cuda.get_current_device() -cc = device.compute_capability -if (cc[0] < 7): - pascal = True + results = { + "sum": torch.Tensor([3.0, 3.0, 1.0, 1.0]), + "mean": torch.Tensor([3.0, 1.5, 0.5, 1.0]), + "prod": torch.Tensor([3.0, 2.0, 0.0, 1.0]), + "amax": torch.Tensor([3.0, 2.0, 1.0, 1.0]), + "amin": torch.Tensor([3.0, 1.0, 0.0, 1.0]), + } -# Return zero (success) if pascal is True -if pascal: - sys.exit(0) -else: - sys.exit(1) + return src_feat, dst_indices, results diff --git a/python/cugraph-dgl/tests/nn/common.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py similarity index 51% rename from python/cugraph-dgl/tests/nn/common.py rename to python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py index 34787d20c9a..ff8048468ee 100644 --- a/python/cugraph-dgl/tests/nn/common.py +++ b/python/cugraph-equivariant/cugraph_equivariant/tests/test_scatter.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,14 +10,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.utilities.utils import import_optional -th = import_optional("torch") -dgl = import_optional("dgl") +import pytest +import torch +from cugraph_equivariant.utils import scatter_reduce -def create_graph1(): - u = th.tensor([0, 1, 0, 2, 3, 0, 4, 0, 5, 0, 6, 7, 0, 8, 9]) - v = th.tensor([1, 9, 2, 9, 9, 4, 9, 5, 9, 6, 9, 9, 8, 9, 0]) - g = dgl.graph((u, v)) - return g +@pytest.mark.parametrize("reduce", ["sum", "mean", "prod", "amax", "amin"]) +def test_scatter_reduce(example_scatter_data, reduce): + device = torch.device("cuda:0") + src, index, out_true = example_scatter_data + src = src.to(device) + index = index.to(device) + + out = scatter_reduce(src, index, dim=0, dim_size=None, reduce=reduce) + + assert torch.allclose(out.cpu(), out_true[reduce]) diff --git a/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py b/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py new file mode 100644 index 00000000000..a2a13b32cd2 --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/tests/test_tensor_product_conv.py @@ -0,0 +1,115 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
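As a sanity check of the `example_scatter_data` fixture above, the expected rows can be reproduced on the CPU with `Tensor.scatter_reduce_` directly (the real test runs the GPU path through `scatter_reduce`, which also handles the cast to int64 indices). Bucket `i` collects the `src_feat` entries whose `dst_indices` value is `i`:

```python
# Sketch: reproducing the fixture's expected "sum" and "mean" rows on CPU.
import torch

src = torch.tensor([3.0, 1.0, 0.0, 1.0, 1.0, 2.0])
index = torch.tensor([0, 1, 2, 2, 3, 1])  # scatter_reduce_ needs int64 indices

sums = torch.zeros(4).scatter_reduce_(0, index, src, "sum", include_self=False)
means = torch.zeros(4).scatter_reduce_(0, index, src, "mean", include_self=False)

print(sums)   # tensor([3., 3., 1., 1.])   e.g. bucket 1 collects 1 + 2
print(means)  # tensor([3.0000, 1.5000, 0.5000, 1.0000])
```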
+ +import pytest + +import torch +from torch import nn +from e3nn import o3 +from cugraph_equivariant.nn import FullyConnectedTensorProductConv + +device = torch.device("cuda:0") + + +@pytest.mark.parametrize("e3nn_compat_mode", [True, False]) +@pytest.mark.parametrize("batch_norm", [True, False]) +@pytest.mark.parametrize( + "mlp_channels, mlp_activation, scalar_sizes", + [ + [(30, 8, 8), nn.Sequential(nn.Dropout(0.3), nn.ReLU()), (15, 15, 0)], + [(7,), nn.GELU(), (2, 3, 2)], + [None, None, None], + ], +) +def test_tensor_product_conv_equivariance( + mlp_channels, mlp_activation, scalar_sizes, batch_norm, e3nn_compat_mode +): + torch.manual_seed(12345) + + in_irreps = o3.Irreps("10x0e + 10x1e") + out_irreps = o3.Irreps("20x0e + 10x1e") + sh_irreps = o3.Irreps.spherical_harmonics(lmax=2) + + tp_conv = FullyConnectedTensorProductConv( + in_irreps=in_irreps, + sh_irreps=sh_irreps, + out_irreps=out_irreps, + mlp_channels=mlp_channels, + mlp_activation=mlp_activation, + batch_norm=batch_norm, + e3nn_compat_mode=e3nn_compat_mode, + ).to(device) + + num_src_nodes, num_dst_nodes = 9, 7 + num_edges = 40 + src = torch.randint(num_src_nodes, (num_edges,), device=device) + dst = torch.randint(num_dst_nodes, (num_edges,), device=device) + edge_index = torch.vstack((src, dst)) + + src_pos = torch.randn(num_src_nodes, 3, device=device) + dst_pos = torch.randn(num_dst_nodes, 3, device=device) + edge_vec = dst_pos[dst] - src_pos[src] + edge_sh = o3.spherical_harmonics( + tp_conv.sh_irreps, edge_vec, normalize=True, normalization="component" + ).to(device) + src_features = torch.randn(num_src_nodes, in_irreps.dim, device=device) + + rot = o3.rand_matrix() + D_in = tp_conv.in_irreps.D_from_matrix(rot).to(device) + D_sh = tp_conv.sh_irreps.D_from_matrix(rot).to(device) + D_out = tp_conv.out_irreps.D_from_matrix(rot).to(device) + + if mlp_channels is None: + edge_emb = torch.randn(num_edges, tp_conv.tp.weight_numel, device=device) + src_scalars = dst_scalars = None + else: + if scalar_sizes: + edge_emb = torch.randn(num_edges, scalar_sizes[0], device=device) + src_scalars = ( + None + if scalar_sizes[1] == 0 + else torch.randn(num_src_nodes, scalar_sizes[1], device=device) + ) + dst_scalars = ( + None + if scalar_sizes[2] == 0 + else torch.randn(num_dst_nodes, scalar_sizes[2], device=device) + ) + else: + edge_emb = torch.randn(num_edges, tp_conv.mlp[0].in_features, device=device) + src_scalars = dst_scalars = None + + # rotate before + out_before = tp_conv( + src_features=src_features @ D_in.T, + edge_sh=edge_sh @ D_sh.T, + edge_emb=edge_emb, + graph=(edge_index, (num_src_nodes, num_dst_nodes)), + src_scalars=src_scalars, + dst_scalars=dst_scalars, + ) + + # rotate after + out_after = ( + tp_conv( + src_features=src_features, + edge_sh=edge_sh, + edge_emb=edge_emb, + graph=(edge_index, (num_src_nodes, num_dst_nodes)), + src_scalars=src_scalars, + dst_scalars=dst_scalars, + ) + @ D_out.T + ) + + assert torch.allclose(out_before, out_after, rtol=1e-4, atol=1e-4) diff --git a/python/cugraph-dgl/setup.py b/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py similarity index 72% rename from python/cugraph-dgl/setup.py rename to python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py index afb8002af42..b4acfe8d090 100644 --- a/python/cugraph-dgl/setup.py +++ b/python/cugraph-equivariant/cugraph_equivariant/utils/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,9 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from setuptools import find_packages, setup +from .scatter import scatter_reduce -packages = find_packages(include=["cugraph_dgl*"]) -setup( - package_data={key: ["VERSION"] for key in packages}, -) +__all__ = [ + "scatter_reduce", +] diff --git a/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py b/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py new file mode 100644 index 00000000000..45cc541fc7b --- /dev/null +++ b/python/cugraph-equivariant/cugraph_equivariant/utils/scatter.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import torch + + +def broadcast(src: torch.Tensor, ref: torch.Tensor, dim: int) -> torch.Tensor: + size = ((1,) * dim) + (-1,) + ((1,) * (ref.dim() - dim - 1)) + return src.view(size).expand_as(ref) + + +def scatter_reduce( + src: torch.Tensor, + index: torch.Tensor, + dim: int = 0, + dim_size: Optional[int] = None, # value of out.size(dim) + reduce: str = "sum", # "sum", "prod", "mean", "amax", "amin" +): + # scatter() expects index to be int64 + index = broadcast(index, src, dim).to(torch.int64) + + size = list(src.size()) + + if dim_size is not None: + assert dim_size >= int(index.max()) + 1 + size[dim] = dim_size + else: + size[dim] = int(index.max()) + 1 + + out = torch.zeros(size, dtype=src.dtype, device=src.device) + return out.scatter_reduce_(dim, index, src, reduce, include_self=False) diff --git a/python/cugraph-equivariant/pyproject.toml b/python/cugraph-equivariant/pyproject.toml new file mode 100644 index 00000000000..f261b0e3535 --- /dev/null +++ b/python/cugraph-equivariant/pyproject.toml @@ -0,0 +1,64 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[build-system] +requires = [ + "setuptools>=61.0.0", + "wheel", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project] +name = "cugraph-equivariant" +dynamic = ["version"] +description = "Fast GPU-based equivariant operations and convolutional layers." 
+readme = { file = "README.md", content-type = "text/markdown" } +authors = [ + { name = "NVIDIA Corporation" }, +] +license = { text = "Apache 2.0" } +requires-python = ">=3.9" +classifiers = [ + "Intended Audience :: Developers", + "Programming Language :: Python", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies = [ + "pylibcugraphops==24.2.*", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[project.urls] +Homepage = "https://github.com/rapidsai/cugraph" +Documentation = "https://docs.rapids.ai/api/cugraph/stable/api_docs/cugraph-ops/" + +[project.optional-dependencies] +test = [ + "pandas", + "pytest", + "pytest-benchmark", + "pytest-cov", + "pytest-xdist", + "scipy", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + +[tool.setuptools] +license-files = ["LICENSE"] + +[tool.setuptools.dynamic] +version = {file = "cugraph_equivariant/VERSION"} + +[tool.setuptools.packages.find] +include = [ + "cugraph_equivariant*", + "cugraph_equivariant.*", +] diff --git a/python/cugraph-service/client/setup.py b/python/cugraph-equivariant/setup.py similarity index 73% rename from python/cugraph-service/client/setup.py rename to python/cugraph-equivariant/setup.py index 61c758cef4a..acd0df3f717 100644 --- a/python/cugraph-service/client/setup.py +++ b/python/cugraph-equivariant/setup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,7 +13,8 @@ from setuptools import find_packages, setup -packages = find_packages(include=["cugraph_service_client*"]) -setup( - package_data={key: ["VERSION"] for key in packages}, -) +if __name__ == "__main__": + packages = find_packages(include=["cugraph_equivariant*"]) + setup( + package_data={key: ["VERSION"] for key in packages}, + ) diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 71d1c7e389c..0c63dc9d80e 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -10,11 +10,11 @@ channels: - conda-forge - nvidia dependencies: -- cugraph==23.12.* +- cugraph==24.2.* - pandas - pre-commit - pyg>=2.4.0 -- pylibcugraphops==23.12.* +- pylibcugraphops==24.2.* - pytest - pytest-benchmark - pytest-cov diff --git a/python/cugraph-pyg/cugraph_pyg/data/__init__.py b/python/cugraph-pyg/cugraph_pyg/data/__init__.py index 0567b69ecf2..66a9843c047 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/data/__init__.py @@ -11,8 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
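For reference, the `package_data={key: ["VERSION"] for key in packages}` idiom used in the new cugraph-equivariant setup.py above simply attaches the VERSION data-file pattern to every discovered package. A small illustration of what the comprehension expands to (package names below are hypothetical):

```python
# Sketch: what the package_data dict comprehension produces.
packages = ["cugraph_equivariant", "cugraph_equivariant.nn"]  # hypothetical
package_data = {key: ["VERSION"] for key in packages}
print(package_data)
# {'cugraph_equivariant': ['VERSION'], 'cugraph_equivariant.nn': ['VERSION']}
```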
-from cugraph.utilities.api_tools import experimental_warning_wrapper - -from cugraph_pyg.data.cugraph_store import EXPERIMENTAL__CuGraphStore - -CuGraphStore = experimental_warning_wrapper(EXPERIMENTAL__CuGraphStore) +from cugraph_pyg.data.cugraph_store import CuGraphStore diff --git a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py index 14dc5d84f90..05d540b7c45 100644 --- a/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py @@ -199,7 +199,7 @@ def cast(cls, *args, **kwargs): return cls(*args, **kwargs) -class EXPERIMENTAL__CuGraphStore: +class CuGraphStore: """ Duck-typed version of PyG's GraphStore and FeatureStore. """ diff --git a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py index 0682c598fdf..2c3d7eff89e 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/__init__.py @@ -11,14 +11,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.utilities.api_tools import experimental_warning_wrapper +from cugraph_pyg.loader.cugraph_node_loader import CuGraphNeighborLoader -from cugraph_pyg.loader.cugraph_node_loader import EXPERIMENTAL__CuGraphNeighborLoader - -CuGraphNeighborLoader = experimental_warning_wrapper( - EXPERIMENTAL__CuGraphNeighborLoader -) - -from cugraph_pyg.loader.cugraph_node_loader import EXPERIMENTAL__BulkSampleLoader - -BulkSampleLoader = experimental_warning_wrapper(EXPERIMENTAL__BulkSampleLoader) +from cugraph_pyg.loader.cugraph_node_loader import BulkSampleLoader diff --git a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py index 200a82b460b..bcfaf579820 100644 --- a/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py +++ b/python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -20,7 +20,7 @@ import cupy import cudf -from cugraph.experimental.gnn import BulkSampler +from cugraph.gnn import BulkSampler from cugraph.utilities.utils import import_optional, MissingModule from cugraph_pyg.data import CuGraphStore @@ -42,7 +42,7 @@ ) -class EXPERIMENTAL__BulkSampleLoader: +class BulkSampleLoader: __ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet") @@ -151,9 +151,25 @@ def __init__( self.__input_files = iter(input_files) return - input_type, input_nodes = torch_geometric.loader.utils.get_input_nodes( - (feature_store, graph_store), input_nodes + # To accommodate DLFW/PyG 2.5 + get_input_nodes = torch_geometric.loader.utils.get_input_nodes + get_input_nodes_kwargs = {} + if "input_id" in get_input_nodes.__annotations__: + get_input_nodes_kwargs["input_id"] = None + input_node_info = get_input_nodes( + (feature_store, graph_store), input_nodes, **get_input_nodes_kwargs ) + + # PyG 2.4 + if len(input_node_info) == 2: + input_type, input_nodes = input_node_info + # PyG 2.5 + elif len(input_node_info) == 3: + input_type, input_nodes, input_id = input_node_info + # Invalid + else: + raise ValueError("Invalid output from get_input_nodes") + if input_type is not None: input_nodes = graph_store._get_sample_from_vertex_groups( {input_type: input_nodes} @@ -439,7 +455,12 @@ def __next__(self): start_time_feature = perf_counter() # Create a PyG HeteroData object, loading the required features if self.__coo: - out = torch_geometric.loader.utils.filter_custom_store( + pyg_filter_fn = ( + torch_geometric.loader.utils.filter_custom_hetero_store + if hasattr(torch_geometric.loader.utils, "filter_custom_hetero_store") + else torch_geometric.loader.utils.filter_custom_store + ) + out = pyg_filter_fn( self.__feature_store, self.__graph_store, sampler_output.node, @@ -478,7 +499,7 @@ def __iter__(self): return self -class EXPERIMENTAL__CuGraphNeighborLoader: +class CuGraphNeighborLoader: def __init__( self, data: Union[CuGraphStore, Tuple[CuGraphStore, CuGraphStore]], @@ -527,7 +548,7 @@ def batch_size(self) -> int: return self.__batch_size def __iter__(self): - self.current_loader = EXPERIMENTAL__BulkSampleLoader( + self.current_loader = BulkSampleLoader( self.__feature_store, self.__graph_store, self.__input_nodes, diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py index 9c9dcdb43bb..bef3a023b93 100644 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py @@ -13,6 +13,7 @@ from .gat_conv import GATConv from .gatv2_conv import GATv2Conv +from .hetero_gat_conv import HeteroGATConv from .rgcn_conv import RGCNConv from .sage_conv import SAGEConv from .transformer_conv import TransformerConv @@ -20,6 +21,7 @@ __all__ = [ "GATConv", "GATv2Conv", + "HeteroGATConv", "RGCNConv", "SAGEConv", "TransformerConv", diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py new file mode 100644 index 00000000000..3b717552a96 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py @@ -0,0 +1,265 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
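The loader change above feature-detects the installed PyG version twice: once by checking `get_input_nodes.__annotations__` for the newer `input_id` parameter, and once by the arity of the returned tuple. A generic sketch of that version-tolerant calling pattern (the wrapper name is hypothetical; it mirrors the loader code, not a real PyG API):

```python
# Sketch: tolerating an upstream function whose signature and return arity
# changed between releases, as the BulkSampleLoader does above.
def call_get_input_nodes(fn, *args):
    kwargs = {}
    if "input_id" in getattr(fn, "__annotations__", {}):
        kwargs["input_id"] = None        # newer releases grew this argument
    info = fn(*args, **kwargs)
    if len(info) == 2:                   # older API: (input_type, input_nodes)
        input_type, input_nodes = info
        input_id = None
    elif len(info) == 3:                 # newer API also returns input_id
        input_type, input_nodes, input_id = info
    else:
        raise ValueError("Invalid output from get_input_nodes")
    return input_type, input_nodes, input_id
```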
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union +from collections import defaultdict + +from cugraph.utilities.utils import import_optional +from pylibcugraphops.pytorch.operators import mha_gat_n2n + +from .base import BaseConv + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + + +class HeteroGATConv(BaseConv): + r"""The graph attentional operator on heterogeneous graphs, where a separate + `GATConv` is applied on the homogeneous graph for each edge type. Compared + with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses + all the linear transformations associated with each node type into a single + GEMM call to improve performance on GPUs. + + Parameters + ---------- + in_channels : int or Dict[str, int] + Size of each input sample of every node type. + + out_channels : int + Size of each output sample. + + node_types : List[str] + List of node types. + + edge_types : List[Tuple[str, str, str]] + List of edge types. + + heads : int, optional (default=1) + Number of multi-head-attentions. + + concat : bool, optional (default=True) + If set to :obj:`False`, the multi-head attentions are averaged instead + of concatenated. + + negative_slope : float, optional (default=0.2) + LeakyReLU angle of the negative slope. + + bias : bool, optional (default=True) + If set to :obj:`False`, the layer will not learn an additive bias. + + aggr : str, optional (default="sum") + The aggregation scheme to use for grouping node embeddings generated by + different relations. Choose from "sum", "mean", "min", "max".
+ """ + + def __init__( + self, + in_channels: Union[int, dict[str, int]], + out_channels: int, + node_types: list[str], + edge_types: list[tuple[str, str, str]], + heads: int = 1, + concat: bool = True, + negative_slope: float = 0.2, + bias: bool = True, + aggr: str = "sum", + ): + major, minor, patch = torch_geometric.__version__.split(".")[:3] + pyg_version = tuple(map(int, [major, minor, patch])) + if pyg_version < (2, 4, 0): + raise RuntimeError(f"{self.__class__.__name__} requires pyg >= 2.4.0.") + + super().__init__() + + if isinstance(in_channels, int): + in_channels = dict.fromkeys(node_types, in_channels) + self.in_channels = in_channels + self.out_channels = out_channels + + self.node_types = node_types + self.edge_types = edge_types + self.num_heads = heads + self.concat_heads = concat + + self.negative_slope = negative_slope + self.aggr = aggr + + self.relations_per_ntype = defaultdict(lambda: ([], [])) + + lin_weights = dict.fromkeys(self.node_types) + attn_weights = dict.fromkeys(self.edge_types) + biases = dict.fromkeys(self.edge_types) + + ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict + + for edge_type in self.edge_types: + src_type, _, dst_type = edge_type + self.relations_per_ntype[src_type][0].append(edge_type) + if src_type != dst_type: + self.relations_per_ntype[dst_type][1].append(edge_type) + + attn_weights[edge_type] = torch.empty( + 2 * self.num_heads * self.out_channels + ) + + if bias and concat: + biases[edge_type] = torch.empty(self.num_heads * out_channels) + elif bias: + biases[edge_type] = torch.empty(out_channels) + else: + biases[edge_type] = None + + for ntype in self.node_types: + n_src_rel = len(self.relations_per_ntype[ntype][0]) + n_dst_rel = len(self.relations_per_ntype[ntype][1]) + n_rel = n_src_rel + n_dst_rel + + lin_weights[ntype] = torch.empty( + (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype]) + ) + + self.lin_weights = ParameterDict(lin_weights) + self.attn_weights = ParameterDict(attn_weights) + + if bias: + self.bias = ParameterDict(biases) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def split_tensors( + self, x_fused_dict: dict[str, torch.Tensor], dim: int + ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: + """Split fused tensors into chunks based on edge types. + + Parameters + ---------- + x_fused_dict : dict[str, torch.Tensor] + A dictionary to hold node feature for each node type. The key is + node type; the value is a fused tensor that account for all + relations for that node type. + + dim : int + Dimension along which to split the fused tensor. + + Returns + ------- + x_src_dict : dict[str, torch.Tensor] + A dictionary to hold source node feature for each relation graph. + + x_dst_dict : dict[str, torch.Tensor] + A dictionary to hold destination node feature for each relation graph. 
+ """ + x_src_dict = dict.fromkeys(self.edge_types) + x_dst_dict = dict.fromkeys(self.edge_types) + + for ntype, t in x_fused_dict.items(): + n_src_rel = len(self.relations_per_ntype[ntype][0]) + n_dst_rel = len(self.relations_per_ntype[ntype][1]) + n_rel = n_src_rel + n_dst_rel + t_list = torch.chunk(t, chunks=n_rel, dim=dim) + + for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]): + x_src_dict[src_rel] = t_list[i] + + for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]): + x_dst_dict[dst_rel] = t_list[i + n_src_rel] + + return x_src_dict, x_dst_dict + + def reset_parameters(self, seed: Optional[int] = None): + if seed is not None: + torch.manual_seed(seed) + + w_src, w_dst = self.split_tensors(self.lin_weights, dim=0) + + for edge_type in self.edge_types: + src_type, _, dst_type = edge_type + + # lin_src + torch_geometric.nn.inits.glorot(w_src[edge_type]) + + # lin_dst + if src_type != dst_type: + torch_geometric.nn.inits.glorot(w_dst[edge_type]) + + # attn_weights + torch_geometric.nn.inits.glorot( + self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels) + ) + + # bias + if self.bias is not None: + torch_geometric.nn.inits.zeros(self.bias[edge_type]) + + def forward( + self, + x_dict: dict[str, torch.Tensor], + edge_index_dict: dict[tuple[str, str, str], torch.Tensor], + ) -> dict[str, torch.Tensor]: + feat_dict = dict.fromkeys(x_dict.keys()) + + for ntype, x in x_dict.items(): + feat_dict[ntype] = x @ self.lin_weights[ntype].T + + x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1) + + out_dict = defaultdict(list) + + for edge_type, edge_index in edge_index_dict.items(): + src_type, _, dst_type = edge_type + + csc = BaseConv.to_csc( + edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0)) + ) + + if src_type == dst_type: + graph = self.get_cugraph( + csc, + bipartite=False, + ) + out = mha_gat_n2n( + x_src_dict[edge_type], + self.attn_weights[edge_type], + graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat_heads, + ) + + else: + graph = self.get_cugraph( + csc, + bipartite=True, + ) + out = mha_gat_n2n( + (x_src_dict[edge_type], x_dst_dict[edge_type]), + self.attn_weights[edge_type], + graph, + num_heads=self.num_heads, + activation="LeakyReLU", + negative_slope=self.negative_slope, + concat_heads=self.concat_heads, + ) + + if self.bias is not None: + out = out + self.bias[edge_type] + + out_dict[dst_type].append(out) + + for key, value in out_dict.items(): + out_dict[key] = torch_geometric.nn.conv.hetero_conv.group(value, self.aggr) + + return out_dict diff --git a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py index 300ca9beb5a..65cb63d25e0 100644 --- a/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py +++ b/python/cugraph-pyg/cugraph_pyg/sampler/cugraph_sampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -216,7 +216,6 @@ def _sampler_output_from_sampling_results_homogeneous_csr( if renumber_map is None: raise ValueError("Renumbered input is expected for homogeneous graphs") - node_type = graph_store.node_types[0] edge_type = graph_store.edge_types[0] diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 1512901822a..30994289f9c 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -284,3 +284,32 @@ def basic_pyg_graph_2(): ) size = (10, 10) return edge_index, size + + +@pytest.fixture +def sample_pyg_hetero_data(): + torch.manual_seed(12345) + raw_data_dict = { + "v0": torch.randn(6, 3), + "v1": torch.randn(7, 2), + "v2": torch.randn(5, 4), + ("v2", "e0", "v1"): torch.tensor([[0, 2, 2, 4, 4], [4, 3, 6, 0, 1]]), + ("v1", "e1", "v1"): torch.tensor( + [[0, 2, 2, 2, 3, 5, 5], [4, 0, 4, 5, 3, 0, 1]] + ), + ("v0", "e2", "v0"): torch.tensor([[0, 2, 2, 3, 5, 5], [1, 1, 5, 1, 1, 2]]), + ("v1", "e3", "v2"): torch.tensor( + [[0, 1, 1, 2, 4, 5, 6], [1, 2, 3, 1, 2, 2, 2]] + ), + ("v0", "e4", "v2"): torch.tensor([[1, 1, 3, 3, 4, 4], [1, 4, 1, 4, 0, 3]]), + } + + # create a nested dictionary to facilitate PyG's HeteroData construction + hetero_data_dict = {} + for key, value in raw_data_dict.items(): + if isinstance(key, tuple): + hetero_data_dict[key] = {"edge_index": value} + else: + hetero_data_dict[key] = {"x": value} + + return hetero_data_dict diff --git a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py index be8f8245807..7047c62250b 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/mg/test_mg_cugraph_store.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -385,7 +385,7 @@ def test_get_input_nodes(karate_gnn, dask_client): def test_mg_frame_handle(graph, dask_client): F, G, N = graph cugraph_store = CuGraphStore(F, G, N, multi_gpu=True) - assert isinstance(cugraph_store._EXPERIMENTAL__CuGraphStore__graph._plc_graph, dict) + assert isinstance(cugraph_store._CuGraphStore__graph._plc_graph, dict) @pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py new file mode 100644 index 00000000000..1c841a17df7 --- /dev/null +++ b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py @@ -0,0 +1,132 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
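The `sample_pyg_hetero_data` fixture above builds exactly the nested-dictionary shape that PyG's `HeteroData` constructor accepts: node-type keys map to `{"x": ...}` and edge-type triples map to `{"edge_index": ...}`. A minimal sketch with toy sizes (assumes `torch_geometric` is installed):

```python
# Sketch: constructing HeteroData from a nested dict, as the fixture does.
import torch
from torch_geometric.data import HeteroData

data = HeteroData(
    {
        "v0": {"x": torch.randn(6, 3)},
        "v1": {"x": torch.randn(7, 2)},
        ("v0", "e2", "v0"): {"edge_index": torch.tensor([[0, 2], [1, 1]])},
    }
)
print(data["v0"].x.shape)                       # torch.Size([6, 3])
print(data["v0", "e2", "v0"].edge_index.shape)  # torch.Size([2, 2])
```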
+ +import pytest + +from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv +from cugraph.utilities.utils import import_optional, MissingModule + +torch = import_optional("torch") +torch_geometric = import_optional("torch_geometric") + +ATOL = 1e-6 + + +@pytest.mark.cugraph_ops +@pytest.mark.skipif(isinstance(torch, MissingModule), reason="torch not available") +@pytest.mark.skipif( + isinstance(torch_geometric, MissingModule), reason="torch_geometric not available" +) +@pytest.mark.parametrize("heads", [1, 3, 10]) +@pytest.mark.parametrize("aggr", ["sum", "mean"]) +def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads): + major, minor, patch = torch_geometric.__version__.split(".")[:3] + pyg_version = tuple(map(int, [major, minor, patch])) + if pyg_version < (2, 4, 0): + pytest.skip("Skipping HeteroGATConv test") + + from torch_geometric.data import HeteroData + from torch_geometric.nn import HeteroConv, GATConv + + device = torch.device("cuda:0") + data = HeteroData(sample_pyg_hetero_data).to(device) + + in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()} + out_channels = 2 + + convs_dict = {} + kwargs1 = dict(heads=heads, add_self_loops=False, bias=False) + for edge_type in data.edge_types: + src_t, _, dst_t = edge_type + in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[ + dst_t + ].size(-1) + if src_t == dst_t: + convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1) + else: + convs_dict[edge_type] = GATConv( + (in_channels_src, in_channels_dst), out_channels, **kwargs1 + ) + + conv1 = HeteroConv(convs_dict, aggr=aggr).to(device) + kwargs2 = dict( + heads=heads, + aggr=aggr, + node_types=data.node_types, + edge_types=data.edge_types, + bias=False, + ) + conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device) + + # copy over linear and attention weights + w_src, w_dst = conv2.split_tensors(conv2.lin_weights, dim=0) + with torch.no_grad(): + for edge_type in conv2.edge_types: + src_t, _, dst_t = edge_type + w_src[edge_type][:, :] = conv1.convs[edge_type].lin_src.weight[:, :] + if w_dst[edge_type] is not None: + w_dst[edge_type][:, :] = conv1.convs[edge_type].lin_dst.weight[:, :] + + conv2.attn_weights[edge_type][: heads * out_channels] = conv1.convs[ + edge_type + ].att_src.data.flatten() + conv2.attn_weights[edge_type][heads * out_channels :] = conv1.convs[ + edge_type + ].att_dst.data.flatten() + + out1 = conv1(data.x_dict, data.edge_index_dict) + out2 = conv2(data.x_dict, data.edge_index_dict) + + for node_type in data.node_types: + assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL) + + loss1 = 0 + loss2 = 0 + for node_type in data.node_types: + loss1 += out1[node_type].mean() + loss2 += out2[node_type].mean() + + loss1.backward() + loss2.backward() + + # check gradient w.r.t attention weights + out_dim = heads * out_channels + for edge_type in conv2.edge_types: + assert torch.allclose( + conv1.convs[edge_type].att_src.grad.flatten(), + conv2.attn_weights[edge_type].grad[:out_dim], + atol=ATOL, + ) + assert torch.allclose( + conv1.convs[edge_type].att_dst.grad.flatten(), + conv2.attn_weights[edge_type].grad[out_dim:], + atol=ATOL, + ) + + # check gradient w.r.t linear weights + grad_lin_weights_ref = dict.fromkeys(out1.keys()) + for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items(): + grad_list = [] + for rel_t in rels_as_src: + grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone()) + for rel_t in rels_as_dst: + 
grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone()) + assert len(grad_list) > 0 + grad_lin_weights_ref[node_t] = torch.vstack(grad_list) + + for node_type in conv2.lin_weights: + assert torch.allclose( + grad_lin_weights_ref[node_type], + conv2.lin_weights[node_type].grad, + atol=ATOL, + ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py index b39ebad8254..c99fd447aa0 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/test_cugraph_store.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -365,10 +365,20 @@ def test_get_input_nodes(karate_gnn): F, G, N = karate_gnn cugraph_store = CuGraphStore(F, G, N) - node_type, input_nodes = torch_geometric.loader.utils.get_input_nodes( + input_node_info = torch_geometric.loader.utils.get_input_nodes( (cugraph_store, cugraph_store), "type0" ) + # PyG 2.4 + if len(input_node_info) == 2: + node_type, input_nodes = input_node_info + # PyG 2.5 + elif len(input_node_info) == 3: + node_type, input_nodes, input_id = input_node_info + # Invalid + else: + raise ValueError("Invalid output from get_input_nodes") + assert node_type == "type0" assert input_nodes.tolist() == torch.arange(17, dtype=torch.int32).tolist() diff --git a/python/cugraph-pyg/pyproject.toml b/python/cugraph-pyg/pyproject.toml index 95b1fa27402..b8666c0d806 100644 --- a/python/cugraph-pyg/pyproject.toml +++ b/python/cugraph-pyg/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. [build-system] @@ -11,9 +11,9 @@ requires = [ testpaths = ["cugraph_pyg/tests"] [project] -name = "cugraph_pyg" +name = "cugraph-pyg" dynamic = ["version"] -description = "cugraph_pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics." +description = "cugraph-pyg - PyG support for cuGraph massive-scale, ultra-fast GPU graph analytics." authors = [ { name = "NVIDIA Corporation" }, ] @@ -26,15 +26,26 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] dependencies = [ - "cugraph==23.12.*", + "cugraph==24.2.*", "numba>=0.57", "numpy>=1.21", + "pylibcugraphops==24.2.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cugraph" Documentation = "https://docs.rapids.ai/api/cugraph/stable/" +[project.optional-dependencies] +test = [ + "pandas", + "pytest", + "pytest-benchmark", + "pytest-cov", + "pytest-xdist", + "scipy", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. + [tool.setuptools] license-files = ["LICENSE"] diff --git a/python/cugraph-pyg/setup.py b/python/cugraph-pyg/setup.py deleted file mode 100644 index 50f023050bf..00000000000 --- a/python/cugraph-pyg/setup.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil - -from setuptools import Command, find_packages, setup - -from setuputils import get_environment_option - - -CUDA_HOME = get_environment_option("CUDA_HOME") - -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." - ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError("Invalid CUDA_HOME: " "directory does not exist: {CUDA_HOME}") - - -class CleanCommand(Command): - """Custom clean command to tidy up the project root.""" - - user_options = [ - ("all", None, None), - ] - - def initialize_options(self): - self.all = None - - def finalize_options(self): - pass - - def run(self): - setupFileDir = os.path.dirname(os.path.abspath(__file__)) - os.chdir(setupFileDir) - os.system("rm -rf build") - os.system("rm -rf dist") - os.system("rm -rf dask-worker-space") - os.system('find . -name "__pycache__" -type d -exec rm -rf {} +') - os.system("rm -rf *.egg-info") - - -packages = find_packages(include=["cugraph_pyg*"]) -setup( - cmdclass={"clean": CleanCommand}, - package_data={key: ["VERSION"] for key in packages}, -) diff --git a/python/cugraph-pyg/setuputils.py b/python/cugraph-pyg/setuputils.py deleted file mode 100644 index 48b169ea3b9..00000000000 --- a/python/cugraph-pyg/setuputils.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright (c) 2018-2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import glob -import os -import re -import shutil -import sys - - -def get_environment_option(name): - ENV_VARIABLE = os.environ.get(name, False) - - if not ENV_VARIABLE: - print("-- " + name + " environment variable not set.") - - else: - print("-- " + name + " detected with value: " + str(ENV_VARIABLE)) - - return ENV_VARIABLE - - -def get_cli_option(name): - if name in sys.argv: - print("-- Detected " + str(name) + " build option.") - return True - - else: - return False - - -def clean_folder(path): - """ - Function to clean all Cython and Python artifacts and cache folders. It - clean the folder as well as its direct children recursively. - - Parameters - ---------- - path : String - Path to the folder to be cleaned. 
- """ - shutil.rmtree(path + "/__pycache__", ignore_errors=True) - - folders = glob.glob(path + "/*/") - for folder in folders: - shutil.rmtree(folder + "/__pycache__", ignore_errors=True) - - clean_folder(folder) - - cython_exts = glob.glob(folder + "/*.cpp") - cython_exts.extend(glob.glob(folder + "/*.cpython*")) - for file in cython_exts: - os.remove(file) - - -def get_cuda_version_from_header(cuda_include_dir, delimiter=""): - - cuda_version = None - - with open(os.path.join(cuda_include_dir, "cuda.h"), encoding="utf-8") as f: - for line in f.readlines(): - if re.search(r"#define CUDA_VERSION ", line) is not None: - cuda_version = line - break - - if cuda_version is None: - raise TypeError("CUDA_VERSION not found in cuda.h") - cuda_version = int(cuda_version.split()[2]) - return "%d%s%d" % ( - cuda_version // 1000, - delimiter, - (cuda_version % 1000) // 10, - ) diff --git a/python/cugraph-service/server/pyproject.toml b/python/cugraph-service/server/pyproject.toml index d68f8055ded..84c0358668b 100644 --- a/python/cugraph-service/server/pyproject.toml +++ b/python/cugraph-service/server/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [project] name = "cugraph-service-server" -dynamic = ["version", "entry-points"] +dynamic = ["version"] description = "cuGraph Service server" readme = { file = "README.md", content-type = "text/markdown" } authors = [ @@ -19,18 +19,18 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.12.*", - "cugraph-service-client==23.12.*", - "cugraph==23.12.*", + "cudf==24.2.*", + "cugraph-service-client==24.2.*", + "cugraph==24.2.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==23.12.*", - "dask-cudf==23.12.*", + "dask-cuda==24.2.*", + "dask-cudf==24.2.*", "numba>=0.57", "numpy>=1.21", - "rapids-dask-dependency==23.12.*", - "rmm==23.12.*", + "rapids-dask-dependency==24.2.*", + "rmm==24.2.*", "thriftpy2", - "ucx-py==0.35.*", + "ucx-py==0.36.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", @@ -39,6 +39,9 @@ classifiers = [ "Programming Language :: Python :: 3.10", ] +[project.scripts] +cugraph-service-server = "cugraph_service_server.__main__:main" + [project.optional-dependencies] test = [ "networkx>=2.5.1", diff --git a/python/cugraph/CMakeLists.txt b/python/cugraph/CMakeLists.txt index 8693c0e9e1f..92345a324e4 100644 --- a/python/cugraph/CMakeLists.txt +++ b/python/cugraph/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cugraph_version 23.12.00) +set(cugraph_version 24.02.00) include(../../fetch_rapids.cmake) @@ -26,11 +26,7 @@ rapids_cuda_init_architectures(cugraph-python) project( cugraph-python VERSION ${cugraph_version} - LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C - # language to be enabled here. The test project that is built in scikit-build to verify - # various linking options for the python library is hardcoded to build with C, so until - # that is fixed we need to keep C. - C CXX CUDA + LANGUAGES CXX CUDA ) ################################################################################ @@ -38,7 +34,6 @@ project( option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting to local files" OFF ) -option(CUGRAPH_BUILD_WHEELS "Whether this build is generating a Python wheel." 
OFF) option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON) if(NOT USE_CUGRAPH_OPS) @@ -53,28 +48,20 @@ else() set(cugraph_FOUND OFF) endif() -include(rapids-cython) +include(rapids-cython-core) if(NOT cugraph_FOUND) set(BUILD_TESTS OFF) set(BUILD_CUGRAPH_MG_TESTS OFF) set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF) - - set(_exclude_from_all "") - if(CUGRAPH_BUILD_WHEELS) - # Statically link dependencies if building wheels - set(CUDA_STATIC_RUNTIME ON) - set(USE_RAFT_STATIC ON) - set(CUGRAPH_COMPILE_RAFT_LIB ON) - set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) - set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) - set(ALLOW_CLONE_CUGRAPH_OPS ON) - - # Don't install the cuML C++ targets into wheels - set(_exclude_from_all EXCLUDE_FROM_ALL) - endif() - - add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all}) + set(CUDA_STATIC_RUNTIME ON) + set(USE_RAFT_STATIC ON) + set(CUGRAPH_COMPILE_RAFT_LIB ON) + set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON) + set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON) + set(ALLOW_CLONE_CUGRAPH_OPS ON) + + add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL) set(cython_lib_dir cugraph) install(TARGETS cugraph DESTINATION ${cython_lib_dir}) diff --git a/python/cugraph/cugraph/dask/common/mg_utils.py b/python/cugraph/cugraph/dask/common/mg_utils.py index 6acda48c9da..b04f293dc0e 100644 --- a/python/cugraph/cugraph/dask/common/mg_utils.py +++ b/python/cugraph/cugraph/dask/common/mg_utils.py @@ -12,7 +12,7 @@ # limitations under the License. import os - +import gc import numba.cuda @@ -68,3 +68,8 @@ def get_visible_devices(): else: visible_devices = _visible_devices.strip().split(",") return visible_devices + + +def run_gc_on_dask_cluster(client): + gc.collect() + client.run(gc.collect) diff --git a/python/cugraph/cugraph/datasets/dataset.py b/python/cugraph/cugraph/datasets/dataset.py index dd7aa0df00a..9817d15dacb 100644 --- a/python/cugraph/cugraph/datasets/dataset.py +++ b/python/cugraph/cugraph/datasets/dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,10 +12,13 @@ # limitations under the License. import cudf +import dask_cudf import yaml import os import pandas as pd +import cugraph.dask as dcg from pathlib import Path +import urllib.request from cugraph.structure.graph_classes import Graph @@ -138,9 +141,8 @@ def __download_csv(self, url): filename = self.metadata["name"] + self.metadata["file_type"] if self._dl_path.path.is_dir(): - df = cudf.read_csv(url) self._path = self._dl_path.path / filename - df.to_csv(self._path, index=False) + urllib.request.urlretrieve(url, str(self._path)) else: raise RuntimeError( @@ -149,7 +151,6 @@ def __download_csv(self, url): return self._path def unload(self): - """ Remove all saved internal objects, forcing them to be re-created when accessed. @@ -162,7 +163,7 @@ def unload(self): def get_edgelist(self, download=False, reader="cudf"): """ - Return an Edgelist + Return an Edgelist. Parameters ---------- @@ -212,6 +213,47 @@ def get_edgelist(self, download=False, reader="cudf"): return self._edgelist.copy() + def get_dask_edgelist(self, download=False): + """ + Return a distributed Edgelist. + + Parameters + ---------- + download : Boolean (default=False) + Automatically download the dataset from the 'url' location within + the YAML file. 
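+
+        Examples
+        --------
+        >>> # Illustrative sketch added in editing, not from the original
+        >>> # patch; it assumes a Dask cluster and client are already
+        >>> # initialized (as in the MG test fixtures).
+        >>> from cugraph.datasets import karate
+        >>> ddf = karate.get_dask_edgelist(download=True)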
+ """ + if self._edgelist is None: + full_path = self.get_path() + if not full_path.is_file(): + if download: + full_path = self.__download_csv(self.metadata["url"]) + else: + raise RuntimeError( + f"The datafile {full_path} does not" + " exist. Try setting download=True" + " to download the datafile" + ) + + header = None + if isinstance(self.metadata["header"], int): + header = self.metadata["header"] + + blocksize = dcg.get_chunksize(full_path) + self._edgelist = dask_cudf.read_csv( + path=full_path, + blocksize=blocksize, + delimiter=self.metadata["delim"], + names=self.metadata["col_names"], + dtype={ + self.metadata["col_names"][i]: self.metadata["col_types"][i] + for i in range(len(self.metadata["col_types"])) + }, + header=header, + ) + + return self._edgelist.copy() + def get_graph( self, download=False, @@ -249,10 +291,10 @@ def get_graph( if create_using is None: G = Graph() elif isinstance(create_using, Graph): - # what about BFS if trnaposed is True + # what about BFS if transposed is True attrs = {"directed": create_using.is_directed()} G = type(create_using)(**attrs) - elif type(create_using) is type: + elif issubclass(create_using, Graph): G = create_using() else: raise TypeError( @@ -277,9 +319,74 @@ def get_graph( ) return G + def get_dask_graph( + self, + download=False, + create_using=Graph, + ignore_weights=False, + store_transposed=False, + ): + """ + Return a distributed Graph object. + + Parameters + ---------- + download : Boolean (default=False) + Downloads the dataset from the web. + + create_using: cugraph.Graph (instance or class), optional + (default=Graph) + Specify the type of Graph to create. Can pass in an instance to + create a Graph instance with specified 'directed' attribute. + + ignore_weights : Boolean (default=False) + Ignores weights in the dataset if True, resulting in an + unweighted Graph. If False (the default), weights from the + dataset -if present- will be applied to the Graph. If the + dataset does not contain weights, the Graph returned will + be unweighted regardless of ignore_weights. + + store_transposed : bool, optional (default=False) + If True, stores the transpose of the adjacency matrix. Required + for certain algorithms. + """ + if self._edgelist is None: + self.get_dask_edgelist(download) + + if create_using is None: + G = Graph() + elif isinstance(create_using, Graph): + attrs = {"directed": create_using.is_directed()} + G = type(create_using)(**attrs) + elif issubclass(create_using, Graph): + G = create_using() + else: + raise TypeError( + "create_using must be a cugraph.Graph " + "(or subclass) type or instance, got: " + f"{type(create_using)}" + ) + + if len(self.metadata["col_names"]) > 2 and not (ignore_weights): + G.from_dask_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + edge_attr=self.metadata["col_names"][2], + store_transposed=store_transposed, + ) + else: + G.from_dask_cudf_edgelist( + self._edgelist, + source=self.metadata["col_names"][0], + destination=self.metadata["col_names"][1], + store_transposed=store_transposed, + ) + return G + def get_path(self): """ - Returns the location of the stored dataset file + Returns the location of the stored dataset file. 
""" if self._path is None: self._path = self._dl_path.path / ( @@ -347,8 +454,7 @@ def download_all(force=False): filename = meta["name"] + meta["file_type"] save_to = default_download_dir.path / filename if not save_to.is_file() or force: - df = cudf.read_csv(meta["url"]) - df.to_csv(save_to, index=False) + urllib.request.urlretrieve(meta["url"], str(save_to)) def set_download_dir(path): diff --git a/python/cugraph/cugraph/experimental/__init__.py b/python/cugraph/cugraph/experimental/__init__.py index 2309a529047..7e8fd666972 100644 --- a/python/cugraph/cugraph/experimental/__init__.py +++ b/python/cugraph/cugraph/experimental/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -48,9 +48,9 @@ experimental_warning_wrapper(EXPERIMENTAL__find_bicliques) ) -from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler +from cugraph.gnn.data_loading import BulkSampler -BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) +BulkSampler = promoted_experimental_warning_wrapper(BulkSampler) from cugraph.link_prediction.jaccard import jaccard, jaccard_coefficient diff --git a/python/cugraph/cugraph/experimental/gnn/__init__.py b/python/cugraph/cugraph/experimental/gnn/__init__.py index 2f06bb20abe..9c366a2ee28 100644 --- a/python/cugraph/cugraph/experimental/gnn/__init__.py +++ b/python/cugraph/cugraph/experimental/gnn/__init__.py @@ -11,7 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.gnn.data_loading import EXPERIMENTAL__BulkSampler -from cugraph.utilities.api_tools import experimental_warning_wrapper +from cugraph.gnn.data_loading import BulkSampler +from cugraph.utilities.api_tools import promoted_experimental_warning_wrapper -BulkSampler = experimental_warning_wrapper(EXPERIMENTAL__BulkSampler) +BulkSampler = promoted_experimental_warning_wrapper(BulkSampler) diff --git a/python/cugraph/cugraph/gnn/__init__.py b/python/cugraph/cugraph/gnn/__init__.py index a62e0cbd242..f8a3035440b 100644 --- a/python/cugraph/cugraph/gnn/__init__.py +++ b/python/cugraph/cugraph/gnn/__init__.py @@ -12,3 +12,4 @@ # limitations under the License. from .feature_storage.feat_storage import FeatureStore +from .data_loading.bulk_sampler import BulkSampler diff --git a/python/cugraph/cugraph/gnn/data_loading/__init__.py b/python/cugraph/cugraph/gnn/data_loading/__init__.py index 6150bf5b422..4b725fba75a 100644 --- a/python/cugraph/cugraph/gnn/data_loading/__init__.py +++ b/python/cugraph/cugraph/gnn/data_loading/__init__.py @@ -11,4 +11,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from cugraph.gnn.data_loading.bulk_sampler import EXPERIMENTAL__BulkSampler +from cugraph.gnn.data_loading.bulk_sampler import BulkSampler diff --git a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py index dbfcb124ce5..ff72e0ea2d6 100644 --- a/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py +++ b/python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py @@ -31,7 +31,7 @@ import time -class EXPERIMENTAL__BulkSampler: +class BulkSampler: """ Performs sampling based on input seeds grouped into batches by a batch id. 
Writes the output minibatches to parquet, with @@ -158,7 +158,7 @@ def add_batches( Examples -------- >>> import cudf - >>> from cugraph.experimental.gnn import BulkSampler + >>> from cugraph.gnn import BulkSampler >>> from cugraph.datasets import karate >>> import tempfile >>> df = cudf.DataFrame({ diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py index f666900b226..cdf1e937e67 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -35,12 +35,11 @@ from cugraph.structure.number_map import NumberMap from cugraph.structure.symmetrize import symmetrize from cugraph.dask.common.part_utils import ( - get_persisted_df_worker_map, persist_dask_df_equal_parts_per_worker, - _chunk_lst, ) -from cugraph.dask import get_n_workers +from cugraph.dask.common.mg_utils import run_gc_on_dask_cluster import cugraph.dask.comms.comms as Comms +from cugraph.structure.symmetrize import _memory_efficient_drop_duplicates class simpleDistributedGraphImpl: @@ -97,6 +96,7 @@ def _make_plc_graph( weight_type, edge_id_type, edge_type_id, + drop_multi_edges, ): weights = None edge_ids = None @@ -151,6 +151,7 @@ def _make_plc_graph( num_arrays=num_arrays, store_transposed=store_transposed, do_expensive_check=False, + drop_multi_edges=drop_multi_edges, ) del edata_x gc.collect() @@ -171,7 +172,6 @@ def __from_edgelist( store_transposed=False, legacy_renum_only=False, ): - if not isinstance(input_ddf, dask_cudf.DataFrame): raise TypeError("input should be a dask_cudf dataFrame") @@ -270,18 +270,17 @@ def __from_edgelist( input_ddf, source, destination, - multi=self.properties.multi_edge, + multi=True, # Deprecated parameter symmetrize=not self.properties.directed, ) value_col = None else: - source_col, dest_col, value_col = symmetrize( input_ddf, source, destination, value_col_names, - multi=self.properties.multi_edge, + multi=True, # Deprecated parameter symmetrize=not self.properties.directed, ) @@ -350,9 +349,11 @@ def __from_edgelist( is_symmetric=not self.properties.directed, ) ddf = ddf.repartition(npartitions=len(workers) * 2) - ddf_keys = ddf.to_delayed() workers = _client.scheduler_info()["workers"].keys() - ddf_keys_ls = _chunk_lst(ddf_keys, len(workers)) + persisted_keys_d = persist_dask_df_equal_parts_per_worker( + ddf, _client, return_type="dict" + ) + del ddf delayed_tasks_d = { w: delayed(simpleDistributedGraphImpl._make_plc_graph)( @@ -366,20 +367,21 @@ def __from_edgelist( self.weight_type, self.edge_id_type, self.edge_type_id_type, + not self.properties.multi_edge, ) - for w, edata in zip(workers, ddf_keys_ls) + for w, edata in persisted_keys_d.items() } + del persisted_keys_d self._plc_graph = { w: _client.compute( delayed_task, workers=w, allow_other_workers=False, pure=False ) for w, delayed_task in delayed_tasks_d.items() } - wait(list(self._plc_graph.values())) - del ddf_keys del delayed_tasks_d - gc.collect() - _client.run(gc.collect) + run_gc_on_dask_cluster(_client) + wait(list(self._plc_graph.values())) + run_gc_on_dask_cluster(_client) @property def 
renumbered(self): @@ -457,6 +459,15 @@ def view_edge_list(self): else: is_multi_column = True + if not self.properties.multi_edge: + # Drop parallel edges for non MultiGraph + # FIXME: Drop multi edges with the CAPI instead. + _client = default_client() + workers = _client.scheduler_info()["workers"] + edgelist_df = _memory_efficient_drop_duplicates( + edgelist_df, [srcCol, dstCol], len(workers) + ) + edgelist_df[srcCol], edgelist_df[dstCol] = edgelist_df[ [srcCol, dstCol] ].min(axis=1), edgelist_df[[srcCol, dstCol]].max(axis=1) @@ -825,12 +836,13 @@ def get_two_hop_neighbors(self, start_vertices=None): _client = default_client() def _call_plc_two_hop_neighbors(sID, mg_graph_x, start_vertices): - return pylibcugraph_get_two_hop_neighbors( + results_ = pylibcugraph_get_two_hop_neighbors( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), graph=mg_graph_x, start_vertices=start_vertices, do_expensive_check=False, ) + return results_ if isinstance(start_vertices, int): start_vertices = [start_vertices] @@ -845,31 +857,31 @@ def _call_plc_two_hop_neighbors(sID, mg_graph_x, start_vertices): else: start_vertices_type = self.input_df.dtypes[0] - if not isinstance(start_vertices, (dask_cudf.Series)): - start_vertices = dask_cudf.from_cudf( + start_vertices = start_vertices.astype(start_vertices_type) + + def create_iterable_args( + session_id, input_graph, start_vertices=None, npartitions=None + ): + session_id_it = [session_id] * npartitions + graph_it = input_graph.values() + start_vertices = cp.array_split(start_vertices.values, npartitions) + return [ + session_id_it, + graph_it, start_vertices, - npartitions=min(self._npartitions, len(start_vertices)), - ) - start_vertices = start_vertices.astype(start_vertices_type) + ] - n_workers = get_n_workers() - start_vertices = start_vertices.repartition(npartitions=n_workers) - start_vertices = persist_dask_df_equal_parts_per_worker( - start_vertices, _client + result = _client.map( + _call_plc_two_hop_neighbors, + *create_iterable_args( + Comms.get_session_id(), + self._plc_graph, + start_vertices, + self._npartitions, + ), + pure=False, ) - start_vertices = get_persisted_df_worker_map(start_vertices, _client) - result = [ - _client.submit( - _call_plc_two_hop_neighbors, - Comms.get_session_id(), - self._plc_graph[w], - start_vertices[w][0], - workers=[w], - allow_other_workers=False, - ) - for w in start_vertices.keys() - ] else: result = [ _client.submit( @@ -896,7 +908,8 @@ def convert_to_cudf(cp_arrays): return df cudf_result = [ - _client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result + _client.submit(convert_to_cudf, cp_arrays, pure=False) + for cp_arrays in result ] wait(cudf_result) @@ -945,7 +958,6 @@ def convert_to_cudf(cp_arrays: cp.ndarray) -> cudf.Series: def _call_plc_select_random_vertices( mg_graph_x, sID: bytes, random_state: int, num_vertices: int ) -> cudf.Series: - cp_arrays = pylibcugraph_select_random_vertices( graph=mg_graph_x, resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -961,7 +973,6 @@ def _mg_call_plc_select_random_vertices( random_state: int, num_vertices: int, ) -> dask_cudf.Series: - result = [ client.submit( _call_plc_select_random_vertices, diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 22d82eb1796..121a4c6245a 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ 
b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -264,7 +264,7 @@ def __from_edgelist(
                 source,
                 destination,
                 edge_attr,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
@@ -279,7 +279,7 @@ def __from_edgelist(
                 elist,
                 source,
                 destination,
-                multi=self.properties.multi_edge,
+                multi=self.properties.multi_edge,  # Deprecated parameter
                 symmetrize=not self.properties.directed,
             )
@@ -298,7 +298,10 @@ def __from_edgelist(
             self._replicate_edgelist()

         self._make_plc_graph(
-            value_col=value_col, store_transposed=store_transposed, renumber=renumber
+            value_col=value_col,
+            store_transposed=store_transposed,
+            renumber=renumber,
+            drop_multi_edges=not self.properties.multi_edge,
         )

     def to_pandas_edgelist(
@@ -477,6 +480,7 @@ def view_edge_list(self):
                 edgelist_df[simpleGraphImpl.srcCol]
                 <= edgelist_df[simpleGraphImpl.dstCol]
             ]
+
         elif not use_initial_input_df and self.properties.renumbered:
             # Do not unrenumber the vertices if the initial input df was used
             if not self.properties.directed:
@@ -484,6 +488,7 @@ def view_edge_list(self):
                     edgelist_df[simpleGraphImpl.srcCol]
                     <= edgelist_df[simpleGraphImpl.dstCol]
                 ]
+
             edgelist_df = self.renumber_map.unrenumber(
                 edgelist_df, simpleGraphImpl.srcCol
             )
@@ -1084,6 +1089,7 @@ def _make_plc_graph(
         value_col: Dict[str, cudf.DataFrame] = None,
         store_transposed: bool = False,
         renumber: bool = True,
+        drop_multi_edges: bool = False,
     ):
         """
         Parameters
@@ -1100,6 +1106,8 @@ def _make_plc_graph(
             Whether to renumber the vertices of the graph. Required
             if inputted vertex ids are not of int32 or int64 type.
+
+        drop_multi_edges: bool (default=False)
+            Whether to drop multi edges.
         """

         if value_col is None:
@@ -1163,6 +1171,7 @@ def _make_plc_graph(
             renumber=renumber,
             do_expensive_check=True,
             input_array_format=input_array_format,
+            drop_multi_edges=drop_multi_edges,
         )

     def to_directed(self, DiG, store_transposed=False):
diff --git a/python/cugraph/cugraph/structure/symmetrize.py b/python/cugraph/cugraph/structure/symmetrize.py
index b324ff65834..30c6394ade9 100644
--- a/python/cugraph/cugraph/structure/symmetrize.py
+++ b/python/cugraph/cugraph/structure/symmetrize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,6 +15,7 @@
 import cudf
 import dask_cudf
 from dask.distributed import default_client
+import warnings


 def symmetrize_df(
@@ -54,6 +55,11 @@ def symmetrize_df(
         Name of the column in the data frame containing the weight ids

     multi : bool, optional (default=False)
+        [Deprecated. The 'multi' parameter will be removed in a future
+        version, and 'symmetrize' will no longer remove multi edges.
+        Multi edges will instead be removed when the graph instance is
+        created, based on whether it is a `cugraph.MultiGraph` or `cugraph.Graph`.]
+
         Set to True if graph is a Multi(Di)Graph. This allows multiple
         edges instead of dropping them.
@@ -84,6 +90,12 @@ def symmetrize_df(
     if multi:
         return result
     else:
+        warnings.warn(
+            "The 'multi' parameter is deprecated and 'symmetrize' will no "
+            "longer remove multi edges. Multi edges will instead be removed "
+            "when the graph instance is created.",
+            FutureWarning,
+        )
         vertex_col_name = src_name + dst_name
         result = result.groupby(by=[*vertex_col_name], as_index=False).min()
         return result
@@ -128,6 +140,11 @@ def symmetrize_ddf(
         Name of the column in the data frame containing the weight ids

     multi : bool, optional (default=False)
+        [Deprecated. The 'multi' parameter will be removed in a future
+        version, and 'symmetrize' will no longer remove multi edges.
+        Multi edges will instead be removed when the graph instance is
+        created, based on whether it is a `cugraph.MultiGraph` or `cugraph.Graph`.]
+
         Set to True if graph is a Multi(Di)Graph. This allows multiple
         edges instead of dropping them.

@@ -165,8 +182,15 @@ def symmetrize_ddf(
     else:
         result = ddf
     if multi:
+        result = result.reset_index(drop=True).repartition(npartitions=len(workers) * 2)
         return result
     else:
+        warnings.warn(
+            "The 'multi' parameter is deprecated and 'symmetrize' will no "
+            "longer remove multi edges. Multi edges will instead be removed "
+            "when the graph instance is created.",
+            FutureWarning,
+        )
         vertex_col_name = src_name + dst_name
         result = _memory_efficient_drop_duplicates(
             result, vertex_col_name, len(workers)
@@ -181,6 +205,7 @@ def symmetrize(
     value_col_name=None,
     multi=False,
     symmetrize=True,
+    do_expensive_check=False,
 ):
     """
     Take a dataframe of source destination pairs along with associated
@@ -208,6 +233,11 @@ def symmetrize(
     weights column name.

     multi : bool, optional (default=False)
+        [Deprecated. The 'multi' parameter will be removed in a future
+        version, and 'symmetrize' will no longer remove multi edges.
+        Multi edges will instead be removed when the graph instance is
+        created, based on whether it is a `cugraph.MultiGraph` or `cugraph.Graph`.]
+
        Set to True if graph is a Multi(Di)Graph. This allows multiple
        edges instead of dropping them.
@@ -234,8 +264,9 @@ def symmetrize( if "edge_id" in input_df.columns and symmetrize: raise ValueError("Edge IDs are not supported on undirected graphs") - csg.null_check(input_df[source_col_name]) - csg.null_check(input_df[dest_col_name]) + if do_expensive_check: # FIXME: Optimize this check as it is currently expensive + csg.null_check(input_df[source_col_name]) + csg.null_check(input_df[dest_col_name]) if isinstance(input_df, dask_cudf.DataFrame): output_df = symmetrize_ddf( diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py index a945881394b..943681fb6ff 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler.py @@ -17,7 +17,7 @@ import cupy import cugraph from cugraph.datasets import karate, email_Eu_core -from cugraph.experimental.gnn import BulkSampler +from cugraph.gnn import BulkSampler from cugraph.utilities.utils import create_directory_with_overwrite import os diff --git a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py index aee81e5ffed..1f7c4277773 100644 --- a/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_bulk_sampler_mg.py @@ -22,7 +22,7 @@ import cugraph import dask_cudf from cugraph.datasets import karate, email_Eu_core -from cugraph.experimental import BulkSampler +from cugraph.gnn import BulkSampler from cugraph.utilities.utils import create_directory_with_overwrite diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py index 460a25cbd14..371410b8bd5 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_neighbor_sample_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -26,6 +26,7 @@ from cugraph.testing import UNDIRECTED_DATASETS from cugraph.dask import uniform_neighbor_sample from cugraph.dask.common.mg_utils import is_single_gpu +from cugraph.structure.symmetrize import _memory_efficient_drop_duplicates from cugraph.datasets import email_Eu_core, small_tree from pylibcugraph.testing.utils import gen_fixture_params_product @@ -135,6 +136,14 @@ def test_mg_uniform_neighbor_sample_simple(dask_client, input_combo): dg = input_combo["MGGraph"] input_df = dg.input_df + # Drop parallel edges for non MultiGraph + # FIXME: Drop multi edges with the CAPI instead. + vertex_col_name = ["src", "dst"] + workers = dask_client.scheduler_info()["workers"] + input_df = _memory_efficient_drop_duplicates( + input_df, vertex_col_name, len(workers) + ) + result_nbr = uniform_neighbor_sample( dg, input_combo["start_list"], diff --git a/python/cugraph/cugraph/tests/utils/test_dataset.py b/python/cugraph/cugraph/tests/utils/test_dataset.py index 60bc6dbb45a..39f7ed8850b 100644 --- a/python/cugraph/cugraph/tests/utils/test_dataset.py +++ b/python/cugraph/cugraph/tests/utils/test_dataset.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,6 +20,7 @@ import pytest import cudf +import dask_cudf from cugraph.structure import Graph from cugraph.testing import ( RAPIDS_DATASET_ROOT_DIR_PATH, @@ -29,6 +30,7 @@ BENCHMARKING_DATASETS, ) from cugraph import datasets +from cugraph.dask.common.mg_utils import is_single_gpu # Add the sg marker to all tests in this module. pytestmark = pytest.mark.sg @@ -37,6 +39,7 @@ ############################################################################### # Fixtures + # module fixture - called once for this module @pytest.fixture(scope="module") def tmpdir(): @@ -77,6 +80,7 @@ def setup(tmpdir): ############################################################################### # Helpers + # check if there is a row where src == dst def has_selfloop(dataset): if not dataset.metadata["is_directed"]: @@ -115,6 +119,7 @@ def is_symmetric(dataset): ############################################################################### # Tests + # setting download_dir to None effectively re-initialized the default def test_env_var(): os.environ["RAPIDS_DATASET_ROOT_DIR"] = "custom_storage_location" @@ -150,9 +155,19 @@ def test_download(dataset): assert dataset.get_path().is_file() +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", ALL_DATASETS) +def test_download_dask(dask_client, dataset): + E = dataset.get_dask_edgelist(download=True) + + assert E is not None + assert dataset.get_path().is_file() + + @pytest.mark.parametrize("dataset", SMALL_DATASETS) def test_reader(dataset): - # defaults to using cudf.read_csv + # defaults to using cudf E = dataset.get_edgelist(download=True) assert E is not None @@ -171,18 +186,46 @@ def test_reader(dataset): dataset.get_edgelist(reader=None) +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", SMALL_DATASETS) +def test_reader_dask(dask_client, dataset): + # using dask_cudf + E = dataset.get_dask_edgelist(download=True) + + assert E is not None + assert isinstance(E, dask_cudf.core.DataFrame) + dataset.unload() + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_get_edgelist(dataset): E = dataset.get_edgelist(download=True) assert E is not None +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", ALL_DATASETS) +def test_get_dask_edgelist(dask_client, dataset): + E = dataset.get_dask_edgelist(download=True) + assert E is not None + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_get_graph(dataset): G = dataset.get_graph(download=True) assert G is not None +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", ALL_DATASETS) +def test_get_dask_graph(dask_client, dataset): + G = dataset.get_dask_graph(download=True) + assert G is not None + + @pytest.mark.parametrize("dataset", ALL_DATASETS) def test_metadata(dataset): M = dataset.metadata @@ -207,6 +250,16 @@ def test_weights(dataset): assert not G.is_weighted() +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on 
Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", WEIGHTED_DATASETS) +def test_weights_dask(dask_client, dataset): + G = dataset.get_dask_graph(download=True) + assert G.is_weighted() + G = dataset.get_dask_graph(download=True, ignore_weights=True) + assert not G.is_weighted() + + @pytest.mark.parametrize("dataset", SMALL_DATASETS) def test_create_using(dataset): G = dataset.get_graph(download=True) @@ -216,6 +269,26 @@ def test_create_using(dataset): G = dataset.get_graph(download=True, create_using=Graph(directed=True)) assert G.is_directed() + # using a non-Graph type should raise an error + with pytest.raises(TypeError): + dataset.get_graph(download=True, create_using=set) + + +@pytest.mark.skipif(is_single_gpu(), reason="skipping MG testing on Single GPU system") +@pytest.mark.skip(reason="MG not supported on CI") +@pytest.mark.parametrize("dataset", SMALL_DATASETS) +def test_create_using_dask(dask_client, dataset): + G = dataset.get_dask_graph(download=True) + assert not G.is_directed() + G = dataset.get_dask_graph(download=True, create_using=Graph) + assert not G.is_directed() + G = dataset.get_dask_graph(download=True, create_using=Graph(directed=True)) + assert G.is_directed() + + # using a non-Graph type should raise an error + with pytest.raises(TypeError): + dataset.get_dask_graph(download=True, create_using=set) + def test_ctor_with_datafile(): from cugraph.datasets import karate diff --git a/python/cugraph/pyproject.toml b/python/cugraph/pyproject.toml index bd426291c8d..6a9d88bf5c8 100644 --- a/python/cugraph/pyproject.toml +++ b/python/cugraph/pyproject.toml @@ -6,14 +6,14 @@ requires = [ "cmake>=3.26.4", "cython>=3.0.0", "ninja", - "pylibcugraph==23.12.*", - "pylibraft==23.12.*", - "rmm==23.12.*", - "scikit-build>=0.13.1", + "pylibcugraph==24.2.*", + "pylibraft==24.2.*", + "rmm==24.2.*", + "scikit-build-core[pyproject]>=0.7.0", "setuptools>=61.0.0", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -build-backend = "setuptools.build_meta" +build-backend = "scikit_build_core.build" [tool.pytest.ini_options] testpaths = ["cugraph/tests"] @@ -29,18 +29,18 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cudf==23.12.*", + "cudf==24.2.*", "cupy-cuda11x>=12.0.0", - "dask-cuda==23.12.*", - "dask-cudf==23.12.*", + "dask-cuda==24.2.*", + "dask-cudf==24.2.*", "fsspec[http]>=0.6.0", "numba>=0.57", "numpy>=1.21", - "pylibcugraph==23.12.*", - "raft-dask==23.12.*", - "rapids-dask-dependency==23.12.*", - "rmm==23.12.*", - "ucx-py==0.35.*", + "pylibcugraph==24.2.*", + "raft-dask==24.2.*", + "rapids-dask-dependency==24.2.*", + "rmm==24.2.*", + "ucx-py==0.36.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [
    "Intended Audience :: Developers",
@@ -67,8 +67,15 @@ test = [
 Homepage = "https://github.com/rapidsai/cugraph"
 Documentation = "https://docs.rapids.ai/api/cugraph/stable/"

-[tool.setuptools]
-license-files = ["LICENSE"]
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.minimum-version = "3.26.4"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["cugraph"]

-[tool.setuptools.dynamic]
-version = {file = "cugraph/VERSION"}
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "cugraph/VERSION"
+regex = "(?P<value>.*)"
diff --git a/python/cugraph/setup.py b/python/cugraph/setup.py
deleted file mode 100644
index 81916444cfd..00000000000
--- a/python/cugraph/setup.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from setuptools import find_packages, Command
-from skbuild import setup
-
-
-class CleanCommand(Command):
-    """Custom clean command to tidy up the project root."""
-
-    user_options = [
-        ("all", None, None),
-    ]
-
-    def initialize_options(self):
-        self.all = None
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        setupFileDir = os.path.dirname(os.path.abspath(__file__))
-        os.chdir(setupFileDir)
-        os.system("rm -rf build")
-        os.system("rm -rf dist")
-        os.system("rm -rf dask-worker-space")
-        os.system('find . -name "__pycache__" -type d -exec rm -rf {} +')
-        os.system("rm -rf *.egg-info")
-        os.system('find . -name "*.cpp" -type f -delete')
-        os.system('find . -name "*.cpython*.so" -type f -delete')
-        os.system("rm -rf _skbuild")
-
-
-packages = find_packages(include=["cugraph*"])
-setup(
-    packages=packages,
-    package_data={key: ["VERSION", "*.pxd", "*.yaml"] for key in packages},
-    cmdclass={"clean": CleanCommand},
-    zip_safe=False,
-)
diff --git a/python/nx-cugraph/README.md b/python/nx-cugraph/README.md
index f6a9aac1088..5d0554734a8 100644
--- a/python/nx-cugraph/README.md
+++ b/python/nx-cugraph/README.md
@@ -89,48 +89,146 @@ interface to its CUDA-based graph analytics library) and
 [CuPy](https://cupy.dev/) (a GPU-accelerated array library) to NetworkX's
 familiar and easy-to-use API.

-Below is the list of algorithms (many listed using pylibcugraph names),
-available today in pylibcugraph or implemented using CuPy, that are or will be
-supported in nx-cugraph.
-
-| feature/algo | release/target version |
-| ----- | ----- |
-| analyze_clustering_edge_cut | ? |
-| analyze_clustering_modularity | ? |
-| analyze_clustering_ratio_cut | ? |
-| balanced_cut_clustering | ? |
-| betweenness_centrality | 23.10 |
-| bfs | ? |
-| connected_components | 23.12 |
-| core_number | ? |
-| degree_centrality | 23.12 |
-| ecg | ? |
-| edge_betweenness_centrality | 23.10 |
-| ego_graph | ? |
-| eigenvector_centrality | 23.12 |
-| get_two_hop_neighbors | ? |
-| hits | 23.12 |
-| in_degree_centrality | 23.12 |
-| induced_subgraph | ? |
-| jaccard_coefficients | ?
| -| katz_centrality | 23.12 | -| k_core | ? | -| k_truss_subgraph | 23.12 | -| leiden | ? | -| louvain | 23.10 | -| node2vec | ? | -| out_degree_centrality | 23.12 | -| overlap_coefficients | ? | -| pagerank | 23.12 | -| personalized_pagerank | ? | -| sorensen_coefficients | ? | -| spectral_modularity_maximization | ? | -| sssp | 23.12 | -| strongly_connected_components | ? | -| triangle_count | ? | -| uniform_neighbor_sample | ? | -| uniform_random_walks | ? | -| weakly_connected_components | ? | +Below is the list of algorithms that are currently supported in nx-cugraph. + +### Algorithms + +``` +bipartite + ├─ basic + │ └─ is_bipartite + └─ generators + └─ complete_bipartite_graph +centrality + ├─ betweenness + │ ├─ betweenness_centrality + │ └─ edge_betweenness_centrality + ├─ degree_alg + │ ├─ degree_centrality + │ ├─ in_degree_centrality + │ └─ out_degree_centrality + ├─ eigenvector + │ └─ eigenvector_centrality + └─ katz + └─ katz_centrality +cluster + ├─ average_clustering + ├─ clustering + ├─ transitivity + └─ triangles +community + └─ louvain + └─ louvain_communities +components + ├─ connected + │ ├─ connected_components + │ ├─ is_connected + │ ├─ node_connected_component + │ └─ number_connected_components + └─ weakly_connected + ├─ is_weakly_connected + ├─ number_weakly_connected_components + └─ weakly_connected_components +core + ├─ core_number + └─ k_truss +dag + ├─ ancestors + └─ descendants +isolate + ├─ is_isolate + ├─ isolates + └─ number_of_isolates +link_analysis + ├─ hits_alg + │ └─ hits + └─ pagerank_alg + └─ pagerank +operators + └─ unary + ├─ complement + └─ reverse +reciprocity + ├─ overall_reciprocity + └─ reciprocity +shortest_paths + └─ unweighted + ├─ single_source_shortest_path_length + └─ single_target_shortest_path_length +traversal + └─ breadth_first_search + ├─ bfs_edges + ├─ bfs_layers + ├─ bfs_predecessors + ├─ bfs_successors + ├─ bfs_tree + ├─ descendants_at_distance + └─ generic_bfs_edges +tree + └─ recognition + ├─ is_arborescence + ├─ is_branching + ├─ is_forest + └─ is_tree +``` + +### Generators + +``` +classic + ├─ barbell_graph + ├─ circular_ladder_graph + ├─ complete_graph + ├─ complete_multipartite_graph + ├─ cycle_graph + ├─ empty_graph + ├─ ladder_graph + ├─ lollipop_graph + ├─ null_graph + ├─ path_graph + ├─ star_graph + ├─ tadpole_graph + ├─ trivial_graph + ├─ turan_graph + └─ wheel_graph +community + └─ caveman_graph +small + ├─ bull_graph + ├─ chvatal_graph + ├─ cubical_graph + ├─ desargues_graph + ├─ diamond_graph + ├─ dodecahedral_graph + ├─ frucht_graph + ├─ heawood_graph + ├─ house_graph + ├─ house_x_graph + ├─ icosahedral_graph + ├─ krackhardt_kite_graph + ├─ moebius_kantor_graph + ├─ octahedral_graph + ├─ pappus_graph + ├─ petersen_graph + ├─ sedgewick_maze_graph + ├─ tetrahedral_graph + ├─ truncated_cube_graph + ├─ truncated_tetrahedron_graph + └─ tutte_graph +social + ├─ davis_southern_women_graph + ├─ florentine_families_graph + ├─ karate_club_graph + └─ les_miserables_graph +``` + +### Other + +``` +convert_matrix + ├─ from_pandas_edgelist + └─ from_scipy_sparse_array +``` To request nx-cugraph backend support for a NetworkX API that is not listed above, visit the [cuGraph GitHub repo](https://github.com/rapidsai/cugraph). diff --git a/python/nx-cugraph/_nx_cugraph/__init__.py b/python/nx-cugraph/_nx_cugraph/__init__.py index ef5f8f3fc23..8b5c87a63f9 100644 --- a/python/nx-cugraph/_nx_cugraph/__init__.py +++ b/python/nx-cugraph/_nx_cugraph/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. 
+# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,7 +12,11 @@ # limitations under the License. """Tell NetworkX about the cugraph backend. This file can update itself: -$ make plugin-info # Recommended method for development +$ make plugin-info + +or + +$ make all # Recommended - runs 'plugin-info' followed by 'lint' or @@ -24,26 +28,38 @@ "backend_name": "cugraph", "project": "nx-cugraph", "package": "nx_cugraph", - "url": "https://github.com/rapidsai/cugraph/tree/branch-23.12/python/nx-cugraph", + "url": "https://github.com/rapidsai/cugraph/tree/branch-24.02/python/nx-cugraph", "short_summary": "GPU-accelerated backend.", # "description": "TODO", "functions": { # BEGIN: functions + "ancestors", + "average_clustering", "barbell_graph", "betweenness_centrality", + "bfs_edges", + "bfs_layers", + "bfs_predecessors", + "bfs_successors", + "bfs_tree", "bull_graph", "caveman_graph", "chvatal_graph", "circular_ladder_graph", + "clustering", + "complement", "complete_bipartite_graph", "complete_graph", "complete_multipartite_graph", "connected_components", + "core_number", "cubical_graph", "cycle_graph", "davis_southern_women_graph", "degree_centrality", "desargues_graph", + "descendants", + "descendants_at_distance", "diamond_graph", "dodecahedral_graph", "edge_betweenness_centrality", @@ -53,14 +69,21 @@ "from_pandas_edgelist", "from_scipy_sparse_array", "frucht_graph", + "generic_bfs_edges", "heawood_graph", "hits", "house_graph", "house_x_graph", "icosahedral_graph", "in_degree_centrality", + "is_arborescence", + "is_bipartite", + "is_branching", "is_connected", + "is_forest", "is_isolate", + "is_tree", + "is_weakly_connected", "isolates", "k_truss", "karate_club_graph", @@ -76,39 +99,55 @@ "number_connected_components", "number_of_isolates", "number_of_selfloops", + "number_weakly_connected_components", "octahedral_graph", "out_degree_centrality", + "overall_reciprocity", "pagerank", "pappus_graph", "path_graph", "petersen_graph", + "reciprocity", + "reverse", "sedgewick_maze_graph", "single_source_shortest_path_length", "single_target_shortest_path_length", "star_graph", "tadpole_graph", "tetrahedral_graph", + "transitivity", + "triangles", "trivial_graph", "truncated_cube_graph", "truncated_tetrahedron_graph", "turan_graph", "tutte_graph", + "weakly_connected_components", "wheel_graph", # END: functions }, "extra_docstrings": { # BEGIN: extra_docstrings - "betweenness_centrality": "`weight` parameter is not yet supported.", - "edge_betweenness_centrality": "`weight` parameter is not yet supported.", + "average_clustering": "Directed graphs and `weight` parameter are not yet supported.", + "betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.", + "bfs_edges": "`sort_neighbors` parameter is not yet supported.", + "bfs_predecessors": "`sort_neighbors` parameter is not yet supported.", + "bfs_successors": "`sort_neighbors` parameter is not yet supported.", + "bfs_tree": "`sort_neighbors` parameter is not yet supported.", + "clustering": "Directed graphs and `weight` parameter are not yet supported.", + "core_number": "Directed graphs are not yet supported.", + "edge_betweenness_centrality": "`weight` parameter is not yet supported, and RNG with seed may be different.", "eigenvector_centrality": "`nstart` parameter is not used, but it is checked for validity.", - 
"from_pandas_edgelist": "cudf.DataFrame inputs also supported.", + "from_pandas_edgelist": "cudf.DataFrame inputs also supported; value columns with str is unsuppported.", + "generic_bfs_edges": "`neighbors` and `sort_neighbors` parameters are not yet supported.", "k_truss": ( "Currently raises `NotImplementedError` for graphs with more than one connected\n" "component when k >= 3. We expect to fix this soon." ), "katz_centrality": "`nstart` isn't used (but is checked), and `normalized=False` is not supported.", - "louvain_communities": "`seed` parameter is currently ignored.", + "louvain_communities": "`seed` parameter is currently ignored, and self-loops are not yet supported.", "pagerank": "`dangling` parameter is not supported, but it is checked for validity.", + "transitivity": "Directed graphs are not yet supported.", # END: extra_docstrings }, "extra_parameters": { @@ -160,7 +199,7 @@ def get_info(): # FIXME: can this use the standard VERSION file and update mechanism? -__version__ = "23.12.00" +__version__ = "24.02.00" if __name__ == "__main__": from pathlib import Path diff --git a/python/nx-cugraph/lint.yaml b/python/nx-cugraph/lint.yaml index a94aa9f0448..8e87fc23592 100644 --- a/python/nx-cugraph/lint.yaml +++ b/python/nx-cugraph/lint.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # https://pre-commit.com/ # @@ -26,7 +26,7 @@ repos: - id: mixed-line-ending - id: trailing-whitespace - repo: https://github.com/abravalheri/validate-pyproject - rev: v0.15 + rev: v0.16 hooks: - id: validate-pyproject name: Validate pyproject.toml @@ -36,7 +36,7 @@ repos: - id: autoflake args: [--in-place] - repo: https://github.com/pycqa/isort - rev: 5.12.0 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/asottile/pyupgrade @@ -45,24 +45,24 @@ repos: - id: pyupgrade args: [--py39-plus] - repo: https://github.com/psf/black - rev: 23.10.1 + rev: 23.12.1 hooks: - id: black # - id: black-jupyter - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.3 + rev: v0.1.14 hooks: - id: ruff args: [--fix-only, --show-fixes] # --unsafe-fixes] - repo: https://github.com/PyCQA/flake8 - rev: 6.1.0 + rev: 7.0.0 hooks: - id: flake8 args: ['--per-file-ignores=_nx_cugraph/__init__.py:E501', '--extend-ignore=SIM105'] # Why is this necessary? additional_dependencies: &flake8_dependencies # These versions need updated manually - - flake8==6.1.0 - - flake8-bugbear==23.9.16 + - flake8==7.0.0 + - flake8-bugbear==24.1.17 - flake8-simplify==0.21.0 - repo: https://github.com/asottile/yesqa rev: v1.5.0 @@ -77,7 +77,7 @@ repos: additional_dependencies: [tomli] files: ^(nx_cugraph|docs)/ - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.3 + rev: v0.1.14 hooks: - id: ruff - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/python/nx-cugraph/nx_cugraph/_version.py b/python/nx-cugraph/nx_cugraph/_version.py index 868a2e19475..a528a3bfe1b 100644 --- a/python/nx-cugraph/nx_cugraph/_version.py +++ b/python/nx-cugraph/nx_cugraph/_version.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,12 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# - import importlib.resources # Read VERSION file from the module that is symlinked to VERSION file -# in the root of the repo at build time or copied to the moudle at +# in the root of the repo at build time or copied to the module at # installation. VERSION is a separate file that allows CI build-time scripts # to update version info (including commit hashes) without modifying # source files. diff --git a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py index 63841b15bd5..7aafa85f5b7 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,15 +13,25 @@ from . import ( bipartite, centrality, + cluster, community, components, - shortest_paths, link_analysis, + operators, + shortest_paths, + traversal, + tree, ) -from .bipartite import complete_bipartite_graph +from .bipartite import complete_bipartite_graph, is_bipartite from .centrality import * +from .cluster import * from .components import * from .core import * +from .dag import * from .isolate import * -from .shortest_paths import * from .link_analysis import * +from .operators import * +from .reciprocity import * +from .shortest_paths import * +from .traversal import * +from .tree.recognition import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py index 062be973d55..e028299c675 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,4 +10,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .basic import * from .generators import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py new file mode 100644 index 00000000000..46c6b54075b --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/basic.py @@ -0,0 +1,31 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+import cupy as cp + +from nx_cugraph.algorithms.cluster import _triangles +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import networkx_algorithm + +__all__ = [ + "is_bipartite", +] + + +@networkx_algorithm(version_added="24.02", _plc="triangle_count") +def is_bipartite(G): + G = _to_graph(G) + # Counting triangles may not be the fastest way to do this, but it is simple. + node_ids, triangles, is_single_node = _triangles( + G, None, symmetrize="union" if G.is_directed() else None + ) + return int(cp.count_nonzero(triangles)) == 0 diff --git a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py index 1d3e762b4fd..5a0c970c984 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/bipartite/generators.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -17,15 +17,14 @@ import numpy as np from nx_cugraph.generators._utils import _create_using_class, _number_and_nodes -from nx_cugraph.utils import index_dtype, networkx_algorithm, nodes_or_number +from nx_cugraph.utils import index_dtype, networkx_algorithm __all__ = [ "complete_bipartite_graph", ] -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12") def complete_bipartite_graph(n1, n2, create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py index 210e1f0a2b2..f6bb142cded 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/betweenness.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
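The new `is_bipartite` above leans on triangle counting: a bipartite graph has no odd cycles, so it can have no triangles. Note that the converse does not hold in general (a 5-cycle is triangle-free yet not bipartite), so the absence of triangles is only a necessary condition. A small usage sketch, assuming the backend is installed:

```
import networkx as nx

# K(3,4) is bipartite and triangle-free; K3 is a single triangle.
assert nx.is_bipartite(nx.complete_bipartite_graph(3, 4), backend="cugraph")
assert not nx.is_bipartite(nx.complete_graph(3), backend="cugraph")
```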
# You may obtain a copy of the License at @@ -18,11 +18,16 @@ __all__ = ["betweenness_centrality", "edge_betweenness_centrality"] -@networkx_algorithm +@networkx_algorithm( + is_incomplete=True, # weight not supported + is_different=True, # RNG with seed is different + version_added="23.10", + _plc="betweenness_centrality", +) def betweenness_centrality( G, k=None, normalized=True, weight=None, endpoints=False, seed=None ): - """`weight` parameter is not yet supported.""" + """`weight` parameter is not yet supported, and RNG with seed may be different.""" if weight is not None: raise NotImplementedError( "Weighted implementation of betweenness centrality not currently supported" @@ -46,9 +51,14 @@ def _(G, k=None, normalized=True, weight=None, endpoints=False, seed=None): return weight is None -@networkx_algorithm +@networkx_algorithm( + is_incomplete=True, # weight not supported + is_different=True, # RNG with seed is different + version_added="23.10", + _plc="edge_betweenness_centrality", +) def edge_betweenness_centrality(G, k=None, normalized=True, weight=None, seed=None): - """`weight` parameter is not yet supported.""" + """`weight` parameter is not yet supported, and RNG with seed may be different.""" if weight is not None: raise NotImplementedError( "Weighted implementation of betweenness centrality not currently supported" diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py index 0b2fd24af79..a319eb3a12c 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/degree_alg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -16,7 +16,7 @@ __all__ = ["degree_centrality", "in_degree_centrality", "out_degree_centrality"] -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def degree_centrality(G): G = _to_graph(G) if len(G) <= 1: @@ -27,7 +27,7 @@ def degree_centrality(G): @not_implemented_for("undirected") -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def in_degree_centrality(G): G = _to_directed_graph(G) if len(G) <= 1: @@ -38,7 +38,7 @@ def in_degree_centrality(G): @not_implemented_for("undirected") -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def out_degree_centrality(G): G = _to_directed_graph(G) if len(G) <= 1: diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py index c0f02a6258e..65a8633667a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/eigenvector.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
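The `is_incomplete` and `is_different` flags added above encode the docstring caveats in machine-readable form: `weight` is rejected, and seeded sampling need not reproduce NetworkX's RNG stream. For example (a sketch, assuming the backend is installed):

```
import networkx as nx

G = nx.les_miserables_graph()
# Runs on the GPU, but with k and seed the sampled pivots may differ from
# NetworkX's, so compare results qualitatively rather than value-for-value.
bc = nx.betweenness_centrality(G, k=10, seed=42, backend="cugraph")
```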
# You may obtain a copy of the License at @@ -26,7 +26,12 @@ @not_implemented_for("multigraph") -@networkx_algorithm(extra_params=_dtype_param) +@networkx_algorithm( + extra_params=_dtype_param, + is_incomplete=True, # nstart not supported + version_added="23.12", + _plc="eigenvector_centrality", +) def eigenvector_centrality( G, max_iter=100, tol=1.0e-6, nstart=None, weight=None, *, dtype=None ): diff --git a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py index b61b811b8fa..4a0684f72ee 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/centrality/katz.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -26,7 +26,12 @@ @not_implemented_for("multigraph") -@networkx_algorithm(extra_params=_dtype_param) +@networkx_algorithm( + extra_params=_dtype_param, + is_incomplete=True, # nstart and normalized=False not supported + version_added="23.12", + _plc="katz_centrality", +) def katz_centrality( G, alpha=0.1, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/cluster.py b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py new file mode 100644 index 00000000000..a458e6c04db --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/cluster.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
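`extra_params=_dtype_param` in the eigenvector and katz decorators above registers a backend-only `dtype` keyword with the dispatcher. Assuming the dispatcher forwards extra keywords to the backend (an assumption here, not confirmed by this diff), usage would look like:

```
import networkx as nx
import numpy as np

G = nx.karate_club_graph()
# 'dtype' is an nx-cugraph extra parameter, not part of NetworkX's signature.
kc = nx.katz_centrality(G, backend="cugraph", dtype=np.float64)
```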
+import cupy as cp +import pylibcugraph as plc + +from nx_cugraph.convert import _to_undirected_graph +from nx_cugraph.utils import networkx_algorithm, not_implemented_for + +__all__ = [ + "triangles", + "average_clustering", + "clustering", + "transitivity", +] + + +def _triangles(G, nodes, symmetrize=None): + if nodes is not None: + if is_single_node := (nodes in G): + nodes = [nodes if G.key_to_id is None else G.key_to_id[nodes]] + else: + nodes = list(nodes) + nodes = G._list_to_nodearray(nodes) + else: + is_single_node = False + if len(G) == 0: + return None, None, is_single_node + node_ids, triangles = plc.triangle_count( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(symmetrize=symmetrize), + start_list=nodes, + do_expensive_check=False, + ) + return node_ids, triangles, is_single_node + + +@not_implemented_for("directed") +@networkx_algorithm(version_added="24.02", _plc="triangle_count") +def triangles(G, nodes=None): + G = _to_undirected_graph(G) + node_ids, triangles, is_single_node = _triangles(G, nodes) + if len(G) == 0: + return {} + if is_single_node: + return int(triangles[0]) + return G._nodearrays_to_dict(node_ids, triangles) + + +@not_implemented_for("directed") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") +def clustering(G, nodes=None, weight=None): + """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of clustering not currently supported" + ) + G = _to_undirected_graph(G) + node_ids, triangles, is_single_node = _triangles(G, nodes) + if len(G) == 0: + return {} + if is_single_node: + numer = int(triangles[0]) + if numer == 0: + return 0 + degree = int((G.src_indices == nodes).sum()) + return 2 * numer / (degree * (degree - 1)) + degrees = G._degrees_array(ignore_selfloops=True)[node_ids] + denom = degrees * (degrees - 1) + results = 2 * triangles / denom + results = cp.where(denom, results, 0) # 0 where we divided by 0 + return G._nodearrays_to_dict(node_ids, results) + + +@clustering._can_run +def _(G, nodes=None, weight=None): + return weight is None and not G.is_directed() + + +@not_implemented_for("directed") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") +def average_clustering(G, nodes=None, weight=None, count_zeros=True): + """Directed graphs and `weight` parameter are not yet supported.""" + if weight is not None: + raise NotImplementedError( + "Weighted implementation of average_clustering not currently supported" + ) + G = _to_undirected_graph(G) + node_ids, triangles, is_single_node = _triangles(G, nodes) + if len(G) == 0: + raise ZeroDivisionError + degrees = G._degrees_array(ignore_selfloops=True)[node_ids] + if not count_zeros: + mask = triangles != 0 + triangles = triangles[mask] + if triangles.size == 0: + raise ZeroDivisionError + degrees = degrees[mask] + denom = degrees * (degrees - 1) + results = 2 * triangles / denom + if count_zeros: + results = cp.where(denom, results, 0) # 0 where we divided by 0 + return float(results.mean()) + + +@average_clustering._can_run +def _(G, nodes=None, weight=None, count_zeros=True): + return weight is None and not G.is_directed() + + +@not_implemented_for("directed") +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="triangle_count") +def transitivity(G): + """Directed graphs are not yet supported.""" + G = _to_undirected_graph(G) + if len(G) == 0: + return 0 + node_ids, triangles = plc.triangle_count( 
+ resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + start_list=None, + do_expensive_check=False, + ) + numer = int(triangles.sum()) + if numer == 0: + return 0 + degrees = G._degrees_array(ignore_selfloops=True)[node_ids] + denom = int((degrees * (degrees - 1)).sum()) + return 2 * numer / denom + + +@transitivity._can_run +def _(G): + # Is transitivity supposed to work on directed graphs? + return not G.is_directed() diff --git a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py index 936d837dacd..f58f1000fc4 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/community/louvain.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -23,8 +23,6 @@ not_implemented_for, ) -from ..isolate import _isolates - __all__ = ["louvain_communities"] @@ -35,7 +33,11 @@ "Upper limit of the number of macro-iterations (max: 500)." ), **_dtype_param, - } + }, + is_incomplete=True, # seed not supported; self-loops not supported + is_different=True, # RNG different + version_added="23.10", + _plc="louvain", ) def louvain_communities( G, @@ -47,12 +49,11 @@ def louvain_communities( max_level=None, dtype=None, ): - """`seed` parameter is currently ignored.""" + """`seed` parameter is currently ignored, and self-loops are not yet supported.""" # NetworkX allows both directed and undirected, but cugraph only allows undirected. seed = _seed_to_int(seed) # Unused, but ensure it's valid for future compatibility G = _to_undirected_graph(G, weight) if G.src_indices.size == 0: - # TODO: PLC doesn't handle empty graphs gracefully! return [{key} for key in G._nodeiter_to_iter(range(len(G)))] if max_level is None: max_level = 500 @@ -72,14 +73,7 @@ def louvain_communities( do_expensive_check=False, ) groups = _groupby(clusters, node_ids, groups_are_canonical=True) - rv = [set(G._nodearray_to_list(ids)) for ids in groups.values()] - # TODO: PLC doesn't handle isolated node_ids yet, so this is a temporary fix - isolates = _isolates(G) - if isolates.size > 0: - isolates = isolates[isolates > node_ids.max()] - if isolates.size > 0: - rv.extend({node} for node in G._nodearray_to_list(isolates)) - return rv + return [set(G._nodearray_to_list(ids)) for ids in groups.values()] @louvain_communities._can_run diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py index 26816ef3692..12a09b535c0 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -11,3 +11,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
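The new `cluster` module above routes `triangles`, `clustering`, `average_clustering`, and `transitivity` through a single `plc.triangle_count` helper. A sketch on a graph whose answers are easy to verify by hand (integer node labels assumed):

```
import networkx as nx

G = nx.complete_graph(4)  # every vertex lies in 3 triangles
print(nx.triangles(G, backend="cugraph"))           # {0: 3, 1: 3, 2: 3, 3: 3}
print(nx.transitivity(G, backend="cugraph"))        # 1.0
print(nx.average_clustering(G, backend="cugraph"))  # 1.0
```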
from .connected import * +from .strongly_connected import * +from .weakly_connected import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py index 41f3457d542..24955e3eac8 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/connected.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,8 +10,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import itertools - import cupy as cp import networkx as nx import pylibcugraph as plc @@ -19,8 +17,6 @@ from nx_cugraph.convert import _to_undirected_graph from nx_cugraph.utils import _groupby, networkx_algorithm, not_implemented_for -from ..isolate import _isolates - __all__ = [ "number_connected_components", "connected_components", @@ -30,42 +26,46 @@ @not_implemented_for("directed") -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def number_connected_components(G): - return sum(1 for _ in connected_components(G)) - # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL - # G = _to_undirected_graph(G) - # unused_node_ids, labels = plc.weakly_connected_components( - # resource_handle=plc.ResourceHandle(), - # graph=G._get_plc_graph(), - # offsets=None, - # indices=None, - # weights=None, - # labels=None, - # do_expensive_check=False, - # ) - # return cp.unique(labels).size + G = _to_undirected_graph(G) + return _number_connected_components(G) + + +def _number_connected_components(G, symmetrize=None): + if G.src_indices.size == 0: + return len(G) + unused_node_ids, labels = plc.weakly_connected_components( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(symmetrize=symmetrize), + offsets=None, + indices=None, + weights=None, + labels=None, + do_expensive_check=False, + ) + return cp.unique(labels).size @number_connected_components._can_run def _(G): # NetworkX <= 3.2.1 does not check directedness for us - try: - return not G.is_directed() - except Exception: - return False + return not G.is_directed() @not_implemented_for("directed") -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def connected_components(G): G = _to_undirected_graph(G) + return _connected_components(G) + + +def _connected_components(G, symmetrize=None): if G.src_indices.size == 0: - # TODO: PLC doesn't handle empty graphs (or isolated nodes) gracefully! 
return [{key} for key in G._nodeiter_to_iter(range(len(G)))] node_ids, labels = plc.weakly_connected_components( resource_handle=plc.ResourceHandle(), - graph=G._get_plc_graph(), + graph=G._get_plc_graph(symmetrize=symmetrize), offsets=None, indices=None, weights=None, @@ -73,44 +73,37 @@ def connected_components(G): do_expensive_check=False, ) groups = _groupby(labels, node_ids) - it = (G._nodearray_to_set(connected_ids) for connected_ids in groups.values()) - # TODO: PLC doesn't handle isolated vertices yet, so this is a temporary fix - isolates = _isolates(G) - if isolates.size > 0: - isolates = isolates[isolates > node_ids.max()] - if isolates.size > 0: - it = itertools.chain( - it, ({node} for node in G._nodearray_to_list(isolates)) - ) - return it + return (G._nodearray_to_set(connected_ids) for connected_ids in groups.values()) @not_implemented_for("directed") -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def is_connected(G): G = _to_undirected_graph(G) + return _is_connected(G) + + +def _is_connected(G, symmetrize=None): if len(G) == 0: raise nx.NetworkXPointlessConcept( "Connectivity is undefined for the null graph." ) - for community in connected_components(G): - return len(community) == len(G) - raise RuntimeError # pragma: no cover - # PREFERRED IMPLEMENTATION, BUT PLC DOES NOT HANDLE ISOLATED VERTICES WELL - # unused_node_ids, labels = plc.weakly_connected_components( - # resource_handle=plc.ResourceHandle(), - # graph=G._get_plc_graph(), - # offsets=None, - # indices=None, - # weights=None, - # labels=None, - # do_expensive_check=False, - # ) - # return labels.size == len(G) and cp.unique(labels).size == 1 + if G.src_indices.size == 0: + return len(G) == 1 + unused_node_ids, labels = plc.weakly_connected_components( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(symmetrize=symmetrize), + offsets=None, + indices=None, + weights=None, + labels=None, + do_expensive_check=False, + ) + return bool((labels == labels[0]).all()) @not_implemented_for("directed") -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="weakly_connected_components") def node_connected_component(G, n): # We could also do plain BFS from n G = _to_undirected_graph(G) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py new file mode 100644 index 00000000000..a63b3237dfc --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/strongly_connected.py @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
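The rewritten connected-components functions above drop the old isolated-vertex workaround because `plc.weakly_connected_components` now labels isolates itself. A quick check (sketch, assuming the backend is installed):

```
import networkx as nx

G = nx.Graph([(0, 1), (1, 2)])
G.add_node(3)  # isolated node, previously patched in by hand
print(nx.number_connected_components(G, backend="cugraph"))  # 2
print(nx.is_connected(G, backend="cugraph"))                 # False
```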
+import cupy as cp +import networkx as nx +import pylibcugraph as plc + +from nx_cugraph.convert import _to_directed_graph +from nx_cugraph.utils import _groupby, index_dtype, not_implemented_for + +__all__ = [ + "number_strongly_connected_components", + "strongly_connected_components", + "is_strongly_connected", +] + + +def _strongly_connected_components(G): + # TODO: create utility function to convert just the indices to CSR + # TODO: this uses a legacy PLC function (strongly_connected_components) + N = len(G) + indices = cp.lexsort(cp.vstack((G.dst_indices, G.src_indices))) + dst_indices = G.dst_indices[indices] + offsets = cp.searchsorted( + G.src_indices, cp.arange(N + 1, dtype=index_dtype), sorter=indices + ).astype(index_dtype) + labels = cp.zeros(N, dtype=index_dtype) + plc.strongly_connected_components( + offsets=offsets, + indices=dst_indices, + weights=None, + num_verts=N, + num_edges=dst_indices.size, + labels=labels, + ) + return labels + + +# The networkx_algorithm decorator is (temporarily) removed to disable +# dispatching for this function. The current cugraph +# strongly_connected_components is a legacy implementation with known issues, +# and in most cases should not be used until the cugraph team can provide an +# update. +# +# Users can still call this via the nx_cugraph module directly: +# >>> import nx_cugraph as nxcg +# >>> nxcg.strongly_connected_components(...) + + +@not_implemented_for("undirected") +# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") +def strongly_connected_components(G): + G = _to_directed_graph(G) + if G.src_indices.size == 0: + return [{key} for key in G._nodeiter_to_iter(range(len(G)))] + labels = _strongly_connected_components(G) + groups = _groupby(labels, cp.arange(len(G), dtype=index_dtype)) + return (G._nodearray_to_set(connected_ids) for connected_ids in groups.values()) + + +@not_implemented_for("undirected") +# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") +def number_strongly_connected_components(G): + G = _to_directed_graph(G) + if G.src_indices.size == 0: + return len(G) + labels = _strongly_connected_components(G) + return cp.unique(labels).size + + +@not_implemented_for("undirected") +# @networkx_algorithm(version_added="24.02", _plc="strongly_connected_components") +def is_strongly_connected(G): + G = _to_directed_graph(G) + if len(G) == 0: + raise nx.NetworkXPointlessConcept( + "Connectivity is undefined for the null graph." + ) + if G.src_indices.size == 0: + return len(G) == 1 + labels = _strongly_connected_components(G) + return bool((labels == labels[0]).all()) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py new file mode 100644 index 00000000000..e42acdd3d84 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/components/weakly_connected.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
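As the comment block above explains, the strongly-connected functions are not dispatched and must be called on the module directly, while the weakly-connected wrappers defined in the next file dispatch normally. A sketch contrasting the two (results hold modulo the known issues of the legacy SCC kernel noted above):

```
import networkx as nx
import nx_cugraph as nxcg

DG = nx.DiGraph([(0, 1), (1, 0), (1, 2)])
print(nx.number_weakly_connected_components(DG, backend="cugraph"))  # 1
comps = [frozenset(c) for c in nxcg.strongly_connected_components(DG)]
assert set(comps) == {frozenset({0, 1}), frozenset({2})}
```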
+from nx_cugraph.convert import _to_directed_graph +from nx_cugraph.utils import networkx_algorithm, not_implemented_for + +from .connected import ( + _connected_components, + _is_connected, + _number_connected_components, +) + +__all__ = [ + "number_weakly_connected_components", + "weakly_connected_components", + "is_weakly_connected", +] + + +@not_implemented_for("undirected") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") +def weakly_connected_components(G): + G = _to_directed_graph(G) + return _connected_components(G, symmetrize="union") + + +@not_implemented_for("undirected") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") +def number_weakly_connected_components(G): + G = _to_directed_graph(G) + return _number_connected_components(G, symmetrize="union") + + +@not_implemented_for("undirected") +@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components") +def is_weakly_connected(G): + G = _to_directed_graph(G) + return _is_connected(G, symmetrize="union") diff --git a/python/nx-cugraph/nx_cugraph/algorithms/core.py b/python/nx-cugraph/nx_cugraph/algorithms/core.py index 2219388bc58..71f61abf45b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/core.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -15,14 +15,47 @@ import pylibcugraph as plc import nx_cugraph as nxcg -from nx_cugraph.utils import _get_int_dtype, networkx_algorithm, not_implemented_for +from nx_cugraph.convert import _to_undirected_graph +from nx_cugraph.utils import ( + _get_int_dtype, + index_dtype, + networkx_algorithm, + not_implemented_for, +) -__all__ = ["k_truss"] +__all__ = ["core_number", "k_truss"] @not_implemented_for("directed") @not_implemented_for("multigraph") -@networkx_algorithm +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="core_number") +def core_number(G): + """Directed graphs are not yet supported.""" + G = _to_undirected_graph(G) + if len(G) == 0: + return {} + if nxcg.number_of_selfloops(G) > 0: + raise nx.NetworkXNotImplemented( + "Input graph has self loops which is not permitted; " + "Consider using G.remove_edges_from(nx.selfloop_edges(G))." + ) + node_ids, core_numbers = plc.core_number( + resource_handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + degree_type="bidirectional", + do_expensive_check=False, + ) + return G._nodearrays_to_dict(node_ids, core_numbers) + + +@core_number._can_run +def _(G): + return not G.is_directed() + + +@not_implemented_for("directed") +@not_implemented_for("multigraph") +@networkx_algorithm(is_incomplete=True, version_added="23.12", _plc="k_truss_subgraph") def k_truss(G, k): """ Currently raises `NotImplementedError` for graphs with more than one connected @@ -31,7 +64,11 @@ def k_truss(G, k): if is_nx := isinstance(G, nx.Graph): G = nxcg.from_networkx(G, preserve_all_attrs=True) if nxcg.number_of_selfloops(G) > 0: - raise nx.NetworkXError( + if nx.__version__[:3] <= "3.2": + exc_class = nx.NetworkXError + else: + exc_class = nx.NetworkXNotImplemented + raise exc_class( "Input graph has self loops which is not permitted; " "Consider using G.remove_edges_from(nx.selfloop_edges(G))." 
) @@ -77,10 +114,8 @@ def k_truss(G, k): edge_values = {key: val[edge_indices] for key, val in G.edge_values.items()} edge_masks = {key: val[edge_indices] for key, val in G.edge_masks.items()} # Renumber step 2: edge indices - mapper = cp.zeros(len(G), src_indices.dtype) - mapper[node_indices] = cp.arange(node_indices.size, dtype=mapper.dtype) - src_indices = mapper[src_indices] - dst_indices = mapper[dst_indices] + src_indices = cp.searchsorted(node_indices, src_indices).astype(index_dtype) + dst_indices = cp.searchsorted(node_indices, dst_indices).astype(index_dtype) # Renumber step 3: node values node_values = {key: val[node_indices] for key, val in G.node_values.items()} node_masks = {key: val[node_indices] for key, val in G.node_masks.items()} diff --git a/python/nx-cugraph/nx_cugraph/algorithms/dag.py b/python/nx-cugraph/nx_cugraph/algorithms/dag.py new file mode 100644 index 00000000000..64be0a58105 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/dag.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import networkx as nx +import numpy as np +import pylibcugraph as plc + +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import index_dtype, networkx_algorithm + +__all__ = [ + "descendants", + "ancestors", +] + + +def _ancestors_and_descendants(G, source, *, is_ancestors): + G = _to_graph(G) + if source not in G: + hash(source) # To raise TypeError if appropriate + raise nx.NetworkXError( + f"The node {source} is not in the {G.__class__.__name__.lower()}." + ) + src_index = source if G.key_to_id is None else G.key_to_id[source] + distances, predecessors, node_ids = plc.bfs( + handle=plc.ResourceHandle(), + graph=G._get_plc_graph(switch_indices=is_ancestors), + sources=cp.array([src_index], dtype=index_dtype), + direction_optimizing=False, + depth_limit=-1, + compute_predecessors=False, + do_expensive_check=False, + ) + mask = (distances != np.iinfo(distances.dtype).max) & (distances != 0) + return G._nodearray_to_set(node_ids[mask]) + + +@networkx_algorithm(version_added="24.02", _plc="bfs") +def descendants(G, source): + return _ancestors_and_descendants(G, source, is_ancestors=False) + + +@networkx_algorithm(version_added="24.02", _plc="bfs") +def ancestors(G, source): + return _ancestors_and_descendants(G, source, is_ancestors=True) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/isolate.py b/python/nx-cugraph/nx_cugraph/algorithms/isolate.py index d32223fb3ed..9621fbeaa9d 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/isolate.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/isolate.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
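The new `dag` module above implements `descendants` and `ancestors` as a single BFS, flipping edge direction via `switch_indices` for the ancestor case and masking out the source and unreachable vertices. Sketch:

```
import networkx as nx

DG = nx.DiGraph([(0, 1), (1, 2), (3, 2)])
print(nx.descendants(DG, 0, backend="cugraph"))  # {1, 2}
print(nx.ancestors(DG, 2, backend="cugraph"))    # {0, 1, 3}
```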
# You may obtain a copy of the License at @@ -15,9 +15,10 @@ from typing import TYPE_CHECKING import cupy as cp +import numpy as np from nx_cugraph.convert import _to_graph -from nx_cugraph.utils import networkx_algorithm +from nx_cugraph.utils import index_dtype, networkx_algorithm if TYPE_CHECKING: # pragma: no cover from nx_cugraph.typing import IndexValue @@ -25,7 +26,7 @@ __all__ = ["is_isolate", "isolates", "number_of_isolates"] -@networkx_algorithm +@networkx_algorithm(version_added="23.10") def is_isolate(G, n): G = _to_graph(G) index = n if G.key_to_id is None else G.key_to_id[n] @@ -36,28 +37,37 @@ def is_isolate(G, n): ) -def _mark_isolates(G) -> cp.ndarray[bool]: +def _mark_isolates(G, symmetrize=None) -> cp.ndarray[bool]: """Return a boolean mask array indicating indices of isolated nodes.""" mark_isolates = cp.ones(len(G), bool) - mark_isolates[G.src_indices] = False - if G.is_directed(): - mark_isolates[G.dst_indices] = False + if G.is_directed() and symmetrize == "intersection": + N = G._N + # Upcast to int64 so indices don't overflow + src_dst = N * G.src_indices.astype(np.int64) + G.dst_indices + src_dst_T = G.src_indices + N * G.dst_indices.astype(np.int64) + src_dst_new = cp.intersect1d(src_dst, src_dst_T) + new_indices = cp.floor_divide(src_dst_new, N, dtype=index_dtype) + mark_isolates[new_indices] = False + else: + mark_isolates[G.src_indices] = False + if G.is_directed(): + mark_isolates[G.dst_indices] = False return mark_isolates -def _isolates(G) -> cp.ndarray[IndexValue]: +def _isolates(G, symmetrize=None) -> cp.ndarray[IndexValue]: """Like isolates, but return an array of indices instead of an iterator of nodes.""" G = _to_graph(G) - return cp.nonzero(_mark_isolates(G))[0] + return cp.nonzero(_mark_isolates(G, symmetrize=symmetrize))[0] -@networkx_algorithm +@networkx_algorithm(version_added="23.10") def isolates(G): G = _to_graph(G) return G._nodeiter_to_iter(iter(_isolates(G).tolist())) -@networkx_algorithm +@networkx_algorithm(version_added="23.10") def number_of_isolates(G): G = _to_graph(G) - return _mark_isolates(G).sum().tolist() + return int(cp.count_nonzero(_mark_isolates(G))) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py index 1c8a47c24b1..9e723624a3b 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/hits_alg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -32,7 +32,9 @@ "The edge attribute to use as the edge weight." ), **_dtype_param, - } + }, + version_added="23.12", + _plc="hits", ) def hits( G, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py index 63f6e89c33a..55fcc3e520a 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/link_analysis/pagerank_alg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
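The `symmetrize="intersection"` branch added to `_mark_isolates` above flattens each edge (u, v) into a single int64 key, u * N + v, so that reciprocal pairs can be matched with one set intersection; the same encoding reappears in the `reciprocity` and `complement` code later in this diff. A standalone sketch of the trick:

```
import cupy as cp

N = 4  # node count; the edge list below is illustrative
src = cp.array([0, 1, 2], dtype=cp.int32)
dst = cp.array([1, 0, 3], dtype=cp.int32)

forward = N * src.astype(cp.int64) + dst    # key of u -> v
backward = src + N * dst.astype(cp.int64)   # key the reverse edge v -> u would get
mutual = cp.intersect1d(forward, backward)  # keys of edges present in both directions
print(cp.floor_divide(mutual, N))           # sources of reciprocated edges: [0 1]
```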
# You may obtain a copy of the License at @@ -26,7 +26,12 @@ __all__ = ["pagerank"] -@networkx_algorithm(extra_params=_dtype_param) +@networkx_algorithm( + extra_params=_dtype_param, + is_incomplete=True, # dangling not supported + version_added="23.12", + _plc={"pagerank", "personalized_pagerank"}, +) def pagerank( G, alpha=0.85, @@ -97,7 +102,7 @@ def pagerank( @pagerank._can_run -def pagerank( +def _( G, alpha=0.85, personalization=None, diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py new file mode 100644 index 00000000000..32fd45f5726 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .unary import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py new file mode 100644 index 00000000000..08abc9f2872 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/operators/unary.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import networkx as nx +import numpy as np + +import nx_cugraph as nxcg +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import index_dtype, networkx_algorithm + +__all__ = ["complement", "reverse"] + + +@networkx_algorithm(version_added="24.02") +def complement(G): + G = _to_graph(G) + N = G._N + # Upcast to int64 so indices don't overflow. 
+ edges_a_b = N * G.src_indices.astype(np.int64) + G.dst_indices + # Now compute flattened indices for all edges except self-loops + # Alt (slower): + # edges_full = np.arange(N * N) + # edges_full = edges_full[(edges_full % (N + 1)).astype(bool)] + edges_full = cp.arange(1, N * (N - 1) + 1) + cp.repeat(cp.arange(N - 1), N) + edges_comp = cp.setdiff1d( + edges_full, + edges_a_b, + assume_unique=not G.is_multigraph(), + ) + src_indices, dst_indices = cp.divmod(edges_comp, N) + return G.__class__.from_coo( + N, + src_indices.astype(index_dtype), + dst_indices.astype(index_dtype), + key_to_id=G.key_to_id, + ) + + +@networkx_algorithm(version_added="24.02") +def reverse(G, copy=True): + if not G.is_directed(): + raise nx.NetworkXError("Cannot reverse an undirected graph.") + if isinstance(G, nx.Graph): + G = nxcg.from_networkx(G, preserve_all_attrs=True) + return G.reverse(copy=copy) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py b/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py new file mode 100644 index 00000000000..c87abdf9fa7 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/reciprocity.py @@ -0,0 +1,93 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import cupy as cp +import networkx as nx +import numpy as np + +from nx_cugraph.convert import _to_directed_graph +from nx_cugraph.utils import networkx_algorithm, not_implemented_for + +__all__ = ["reciprocity", "overall_reciprocity"] + + +@not_implemented_for("undirected", "multigraph") +@networkx_algorithm(version_added="24.02") +def reciprocity(G, nodes=None): + if nodes is None: + return overall_reciprocity(G) + G = _to_directed_graph(G) + N = G._N + # 'nodes' can also be a single node identifier + if nodes in G: + index = nodes if G.key_to_id is None else G.key_to_id[nodes] + mask = (G.src_indices == index) | (G.dst_indices == index) + src_indices = G.src_indices[mask] + if src_indices.size == 0: + raise nx.NetworkXError("Not defined for isolated nodes.") + dst_indices = G.dst_indices[mask] + # Create two lists of edge identifiers, one for each direction. + # Edge identifiers can be created from a pair of node + # identifiers. Simply adding src IDs to dst IDs is not adequate, so + # make one set of values (either src or dst depending on direction) + # unique by multiplying values by N. + # Upcast to int64 so indices don't overflow. + edges_a_b = N * src_indices.astype(np.int64) + dst_indices + edges_b_a = src_indices + N * dst_indices.astype(np.int64) + # Find the matching edge identifiers in each list. 
The edge identifier + # generation ensures the ID for A->B == the ID for B->A + recip_indices = cp.intersect1d( + edges_a_b, + edges_b_a, + # assume_unique=True, # cupy <= 12.2.0 also assumes sorted + ) + num_selfloops = (src_indices == dst_indices).sum().tolist() + return (recip_indices.size - num_selfloops) / edges_a_b.size + + # Don't include self-loops + mask = G.src_indices != G.dst_indices + src_indices = G.src_indices[mask] + dst_indices = G.dst_indices[mask] + # Create two lists of edges, one for each direction, and find the matching + # IDs in each list (see description above). + edges_a_b = N * src_indices.astype(np.int64) + dst_indices + edges_b_a = src_indices + N * dst_indices.astype(np.int64) + recip_indices = cp.intersect1d( + edges_a_b, + edges_b_a, + # assume_unique=True, # cupy <= 12.2.0 also assumes sorted + ) + numer = cp.bincount(recip_indices // N, minlength=N) + denom = cp.bincount(src_indices, minlength=N) + denom += cp.bincount(dst_indices, minlength=N) + recip = 2 * numer / denom + node_ids = G._nodekeys_to_nodearray(nodes) + return G._nodearrays_to_dict(node_ids, recip[node_ids]) + + +@not_implemented_for("undirected", "multigraph") +@networkx_algorithm(version_added="24.02") +def overall_reciprocity(G): + G = _to_directed_graph(G) + if G.number_of_edges() == 0: + raise nx.NetworkXError("Not defined for empty graphs") + # Create two lists of edges, one for each direction, and find the matching + # IDs in each list (see description in reciprocity()). + edges_a_b = G._N * G.src_indices.astype(np.int64) + G.dst_indices + edges_b_a = G.src_indices + G._N * G.dst_indices.astype(np.int64) + recip_indices = cp.intersect1d( + edges_a_b, + edges_b_a, + # assume_unique=True, # cupy <= 12.2.0 also assumes sorted + ) + num_selfloops = (G.src_indices == G.dst_indices).sum().tolist() + return (recip_indices.size - num_selfloops) / edges_a_b.size diff --git a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py index 3413a637b32..2012495953e 100644 --- a/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/shortest_paths/unweighted.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -21,12 +21,12 @@ __all__ = ["single_source_shortest_path_length", "single_target_shortest_path_length"] -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_source_shortest_path_length(G, source, cutoff=None): return _single_shortest_path_length(G, source, cutoff, "Source") -@networkx_algorithm +@networkx_algorithm(version_added="23.12", _plc="bfs") def single_target_shortest_path_length(G, target, cutoff=None): return _single_shortest_path_length(G, target, cutoff, "Target") diff --git a/python/cugraph-dgl/tests/nn/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py similarity index 94% rename from python/cugraph-dgl/tests/nn/__init__.py rename to python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py index a1dd01f33d4..1751cd46919 100644 --- a/python/cugraph-dgl/tests/nn/__init__.py +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/__init__.py @@ -10,3 +10,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
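A worked example for the reciprocity functions above, where the counts are easy to confirm by hand:

```
import networkx as nx

DG = nx.DiGraph([(0, 1), (1, 0), (1, 2)])
# 2 of the 3 directed edges are reciprocated.
print(nx.overall_reciprocity(DG, backend="cugraph"))  # 0.666...
# Node 1: 2 * (1 mutual pair) / (3 incident edges) = 0.666...
print(nx.reciprocity(DG, 1, backend="cugraph"))
```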
# See the License for the specific language governing permissions and # limitations under the License. +from .breadth_first_search import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py new file mode 100644 index 00000000000..f5d5e2a995d --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/traversal/breadth_first_search.py @@ -0,0 +1,274 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from itertools import repeat + +import cupy as cp +import networkx as nx +import numpy as np +import pylibcugraph as plc + +import nx_cugraph as nxcg +from nx_cugraph.convert import _to_graph +from nx_cugraph.utils import _groupby, index_dtype, networkx_algorithm + +__all__ = [ + "bfs_edges", + "bfs_tree", + "bfs_predecessors", + "bfs_successors", + "descendants_at_distance", + "bfs_layers", + "generic_bfs_edges", +] + + +def _check_G_and_source(G, source): + G = _to_graph(G) + if source not in G: + hash(source) # To raise TypeError if appropriate + raise nx.NetworkXError( + f"The node {source} is not in the {G.__class__.__name__.lower()}." + ) + return G + + +def _bfs(G, source, *, depth_limit=None, reverse=False): + src_index = source if G.key_to_id is None else G.key_to_id[source] + distances, predecessors, node_ids = plc.bfs( + handle=plc.ResourceHandle(), + graph=G._get_plc_graph(switch_indices=reverse), + sources=cp.array([src_index], dtype=index_dtype), + direction_optimizing=False, + depth_limit=-1 if depth_limit is None else depth_limit, + compute_predecessors=True, + do_expensive_check=False, + ) + mask = predecessors >= 0 + return distances[mask], predecessors[mask], node_ids[mask] + + +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") +def generic_bfs_edges(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): + """`neighbors` and `sort_neighbors` parameters are not yet supported.""" + if neighbors is not None: + raise NotImplementedError( + "neighbors argument in generic_bfs_edges is not currently supported" + ) + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in generic_bfs_edges is not currently supported" + ) + return bfs_edges(G, source, depth_limit=depth_limit) + + +@generic_bfs_edges._can_run +def _(G, source, neighbors=None, depth_limit=None, sort_neighbors=None): + return neighbors is None and sort_neighbors is None + + +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") +def bfs_edges(G, source, reverse=False, depth_limit=None, sort_neighbors=None): + """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_edges is not currently supported" + ) + G = _check_G_and_source(G, source) + if depth_limit is not None and depth_limit < 1: + return + distances, predecessors, node_ids = _bfs( + G, source, depth_limit=depth_limit, 
reverse=reverse + ) + # Using groupby like this is similar to bfs_predecessors + groups = _groupby([distances, predecessors], node_ids) + id_to_key = G.id_to_key + for key in sorted(groups): + children_ids = groups[key] + parent_id = key[1] + parent = id_to_key[parent_id] if id_to_key is not None else parent_id + yield from zip( + repeat(parent, children_ids.size), + G._nodeiter_to_iter(children_ids.tolist()), + ) + + +@bfs_edges._can_run +def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): + return sort_neighbors is None + + +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") +def bfs_tree(G, source, reverse=False, depth_limit=None, sort_neighbors=None): + """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_tree is not currently supported" + ) + G = _check_G_and_source(G, source) + if depth_limit is not None and depth_limit < 1: + return nxcg.DiGraph.from_coo( + 1, + cp.array([], dtype=index_dtype), + cp.array([], dtype=index_dtype), + id_to_key=[source], + ) + + distances, predecessors, node_ids = _bfs( + G, + source, + depth_limit=depth_limit, + reverse=reverse, + ) + if predecessors.size == 0: + return nxcg.DiGraph.from_coo( + 1, + cp.array([], dtype=index_dtype), + cp.array([], dtype=index_dtype), + id_to_key=[source], + ) + # TODO: create renumbering helper function(s) + unique_node_ids = cp.unique(cp.hstack((predecessors, node_ids))) + # Renumber edges + src_indices = cp.searchsorted(unique_node_ids, predecessors).astype(index_dtype) + dst_indices = cp.searchsorted(unique_node_ids, node_ids).astype(index_dtype) + # Renumber nodes + if (id_to_key := G.id_to_key) is not None: + key_to_id = { + id_to_key[old_index]: new_index + for new_index, old_index in enumerate(unique_node_ids.tolist()) + } + else: + key_to_id = { + old_index: new_index + for new_index, old_index in enumerate(unique_node_ids.tolist()) + } + return nxcg.DiGraph.from_coo( + unique_node_ids.size, + src_indices, + dst_indices, + key_to_id=key_to_id, + ) + + +@bfs_tree._can_run +def _(G, source, reverse=False, depth_limit=None, sort_neighbors=None): + return sort_neighbors is None + + +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") +def bfs_successors(G, source, depth_limit=None, sort_neighbors=None): + """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_successors is not currently supported" + ) + G = _check_G_and_source(G, source) + if depth_limit is not None and depth_limit < 1: + yield (source, []) + return + + distances, predecessors, node_ids = _bfs(G, source, depth_limit=depth_limit) + groups = _groupby([distances, predecessors], node_ids) + id_to_key = G.id_to_key + for key in sorted(groups): + children_ids = groups[key] + parent_id = key[1] + parent = id_to_key[parent_id] if id_to_key is not None else parent_id + children = G._nodearray_to_list(children_ids) + yield (parent, children) + + +@bfs_successors._can_run +def _(G, source, depth_limit=None, sort_neighbors=None): + return sort_neighbors is None + + +@networkx_algorithm(version_added="24.02", _plc="bfs") +def bfs_layers(G, sources): + G = _to_graph(G) + if sources in G: + sources = [sources] + else: + sources = set(sources) + if not all(source in G for source in sources): + node = next(source for source in sources if source not in G) + raise nx.NetworkXError(f"The node {node} is not in the 
graph.") + sources = list(sources) + source_ids = G._list_to_nodearray(sources) + distances, predecessors, node_ids = plc.bfs( + handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + sources=source_ids, + direction_optimizing=False, + depth_limit=-1, + compute_predecessors=False, + do_expensive_check=False, + ) + mask = distances != np.iinfo(distances.dtype).max + distances = distances[mask] + node_ids = node_ids[mask] + groups = _groupby(distances, node_ids) + return (G._nodearray_to_list(groups[key]) for key in range(len(groups))) + + +@networkx_algorithm(is_incomplete=True, version_added="24.02", _plc="bfs") +def bfs_predecessors(G, source, depth_limit=None, sort_neighbors=None): + """`sort_neighbors` parameter is not yet supported.""" + if sort_neighbors is not None: + raise NotImplementedError( + "sort_neighbors argument in bfs_predecessors is not currently supported" + ) + G = _check_G_and_source(G, source) + if depth_limit is not None and depth_limit < 1: + return + + distances, predecessors, node_ids = _bfs(G, source, depth_limit=depth_limit) + # We include `predecessors` in the groupby for "nicer" iteration order + groups = _groupby([distances, predecessors], node_ids) + id_to_key = G.id_to_key + for key in sorted(groups): + children_ids = groups[key] + parent_id = key[1] + parent = id_to_key[parent_id] if id_to_key is not None else parent_id + yield from zip( + G._nodeiter_to_iter(children_ids.tolist()), + repeat(parent, children_ids.size), + ) + + +@bfs_predecessors._can_run +def _(G, source, depth_limit=None, sort_neighbors=None): + return sort_neighbors is None + + +@networkx_algorithm(version_added="24.02", _plc="bfs") +def descendants_at_distance(G, source, distance): + G = _check_G_and_source(G, source) + if distance is None or distance < 0: + return set() + if distance == 0: + return {source} + + src_index = source if G.key_to_id is None else G.key_to_id[source] + distances, predecessors, node_ids = plc.bfs( + handle=plc.ResourceHandle(), + graph=G._get_plc_graph(), + sources=cp.array([src_index], dtype=index_dtype), + direction_optimizing=False, + depth_limit=distance, + compute_predecessors=False, + do_expensive_check=False, + ) + mask = distances == distance + node_ids = node_ids[mask] + return G._nodearray_to_set(node_ids) diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py new file mode 100644 index 00000000000..91bf72417be --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/tree/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .recognition import * diff --git a/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py new file mode 100644 index 00000000000..74f57b5ea5a --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/algorithms/tree/recognition.py @@ -0,0 +1,74 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cupy as cp
+import networkx as nx
+
+import nx_cugraph as nxcg
+from nx_cugraph.convert import _to_directed_graph, _to_graph
+from nx_cugraph.utils import networkx_algorithm, not_implemented_for
+
+__all__ = ["is_arborescence", "is_branching", "is_forest", "is_tree"]
+
+
+@not_implemented_for("undirected")
+@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
+def is_arborescence(G):
+    G = _to_directed_graph(G)
+    return is_tree(G) and int(G._in_degrees_array().max()) <= 1
+
+
+@not_implemented_for("undirected")
+@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
+def is_branching(G):
+    G = _to_directed_graph(G)
+    return is_forest(G) and int(G._in_degrees_array().max()) <= 1
+
+
+@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
+def is_forest(G):
+    G = _to_graph(G)
+    if len(G) == 0:
+        raise nx.NetworkXPointlessConcept("G has no nodes.")
+    if is_directed := G.is_directed():
+        connected_components = nxcg.weakly_connected_components
+    else:
+        connected_components = nxcg.connected_components
+    for components in connected_components(G):
+        node_ids = G._list_to_nodearray(list(components))
+        # TODO: create utilities for creating subgraphs
+        mask = cp.isin(G.src_indices, node_ids) & cp.isin(G.dst_indices, node_ids)
+        # A tree component has an edge count equal to its node count minus one:
+        # every node except the root contributes exactly one (parent) edge.
+        if is_directed:
+            if int(cp.count_nonzero(mask)) != len(components) - 1:
+                return False
+        else:
+            src_indices = G.src_indices[mask]
+            dst_indices = G.dst_indices[mask]
+            if int(cp.count_nonzero(src_indices <= dst_indices)) != len(components) - 1:
+                return False
+    return True
+
+
+@networkx_algorithm(version_added="24.02", _plc="weakly_connected_components")
+def is_tree(G):
+    G = _to_graph(G)
+    if len(G) == 0:
+        raise nx.NetworkXPointlessConcept("G has no nodes.")
+    if G.is_directed():
+        is_connected = nxcg.is_weakly_connected
+    else:
+        is_connected = nxcg.is_connected
+    # A tree has an edge count equal to its node count minus one:
+    # every node except the root contributes exactly one (parent) edge.
+    return len(G) - 1 == G.number_of_edges() and is_connected(G)
diff --git a/python/nx-cugraph/nx_cugraph/classes/digraph.py b/python/nx-cugraph/nx_cugraph/classes/digraph.py
index 52ea2334c85..e5cfb8f6815 100644
--- a/python/nx-cugraph/nx_cugraph/classes/digraph.py
+++ b/python/nx-cugraph/nx_cugraph/classes/digraph.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -12,17 +12,20 @@
 # limitations under the License.
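# Aside: a minimal, CPU-only sketch (not part of the diff) of the criterion the
# is_tree/is_forest checks above rely on -- a tree on n nodes is connected and
# has exactly n - 1 edges, since every node except the root has one parent edge.
# This sketch uses plain NetworkX, not the GPU-backed classes from the diff.
import networkx as nx

def is_tree_sketch(G):
    if len(G) == 0:
        raise nx.NetworkXPointlessConcept("G has no nodes.")
    connected = nx.is_weakly_connected(G) if G.is_directed() else nx.is_connected(G)
    # Edge count must be node count minus one; connectivity rules out forests.
    return G.number_of_edges() == len(G) - 1 and connected

assert is_tree_sketch(nx.path_graph(4))       # 4 nodes, 3 edges, connected
assert not is_tree_sketch(nx.cycle_graph(4))  # 4 nodes, 4 edges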
from __future__ import annotations +from copy import deepcopy from typing import TYPE_CHECKING import cupy as cp import networkx as nx +import numpy as np import nx_cugraph as nxcg +from ..utils import index_dtype from .graph import Graph if TYPE_CHECKING: # pragma: no cover - from nx_cugraph.typing import NodeKey + from nx_cugraph.typing import AttrKey __all__ = ["DiGraph"] @@ -44,10 +47,8 @@ def to_networkx_class(cls) -> type[nx.DiGraph]: return nx.DiGraph @networkx_api - def number_of_edges( - self, u: NodeKey | None = None, v: NodeKey | None = None - ) -> int: - if u is not None or v is not None: + def size(self, weight: AttrKey | None = None) -> int: + if weight is not None: raise NotImplementedError return self.src_indices.size @@ -59,14 +60,135 @@ def number_of_edges( def reverse(self, copy: bool = True) -> DiGraph: return self._copy(not copy, self.__class__, reverse=True) + @networkx_api + def to_undirected(self, reciprocal=False, as_view=False): + N = self._N + # Upcast to int64 so indices don't overflow + src_dst_indices_old = N * self.src_indices.astype(np.int64) + self.dst_indices + if reciprocal: + src_dst_indices_new = cp.intersect1d( + src_dst_indices_old, + self.src_indices + N * self.dst_indices.astype(np.int64), + # assume_unique=True, # cupy <= 12.2.0 also assumes sorted + ) + if self.edge_values: + sorter = cp.argsort(src_dst_indices_old) + idx = cp.searchsorted( + src_dst_indices_old, src_dst_indices_new, sorter=sorter + ) + indices = sorter[idx] + src_indices = self.src_indices[indices].copy() + dst_indices = self.dst_indices[indices].copy() + edge_values = { + key: val[indices].copy() for key, val in self.edge_values.items() + } + edge_masks = { + key: val[indices].copy() for key, val in self.edge_masks.items() + } + else: + src_indices, dst_indices = cp.divmod(src_dst_indices_new, N) + src_indices = src_indices.astype(index_dtype) + dst_indices = dst_indices.astype(index_dtype) + else: + src_dst_indices_old_T = self.src_indices + N * self.dst_indices.astype( + np.int64 + ) + if self.edge_values: + src_dst_extra = cp.setdiff1d( + src_dst_indices_old_T, src_dst_indices_old, assume_unique=True + ) + sorter = cp.argsort(src_dst_indices_old_T) + idx = cp.searchsorted( + src_dst_indices_old_T, src_dst_extra, sorter=sorter + ) + indices = sorter[idx] + src_indices = cp.hstack((self.src_indices, self.dst_indices[indices])) + dst_indices = cp.hstack((self.dst_indices, self.src_indices[indices])) + edge_values = { + key: cp.hstack((val, val[indices])) + for key, val in self.edge_values.items() + } + edge_masks = { + key: cp.hstack((val, val[indices])) + for key, val in self.edge_masks.items() + } + else: + src_dst_indices_new = cp.union1d( + src_dst_indices_old, src_dst_indices_old_T + ) + src_indices, dst_indices = cp.divmod(src_dst_indices_new, N) + src_indices = src_indices.astype(index_dtype) + dst_indices = dst_indices.astype(index_dtype) + + if self.edge_values: + recip_indices = cp.lexsort(cp.vstack((src_indices, dst_indices))) + for key, mask in edge_masks.items(): + # Make sure we choose a value that isn't masked out + val = edge_values[key] + rmask = mask[recip_indices] + recip_only = rmask & ~mask + val[recip_only] = val[recip_indices[recip_only]] + only = mask & ~rmask + val[recip_indices[only]] = val[only] + mask |= mask[recip_indices] + # Arbitrarily choose to use value from (j > i) edge + mask = src_indices < dst_indices + left_idx = cp.nonzero(mask)[0] + right_idx = recip_indices[mask] + for val in edge_values.values(): + val[left_idx] = val[right_idx] + else: 
+ edge_values = {} + edge_masks = {} + + node_values = self.node_values + node_masks = self.node_masks + key_to_id = self.key_to_id + id_to_key = None if key_to_id is None else self._id_to_key + if not as_view: + node_values = {key: val.copy() for key, val in node_values.items()} + node_masks = {key: val.copy() for key, val in node_masks.items()} + if key_to_id is not None: + key_to_id = key_to_id.copy() + if id_to_key is not None: + id_to_key = id_to_key.copy() + rv = self.to_undirected_class().from_coo( + N, + src_indices, + dst_indices, + edge_values, + edge_masks, + node_values, + node_masks, + key_to_id=key_to_id, + id_to_key=id_to_key, + ) + if as_view: + rv.graph = self.graph + else: + rv.graph.update(deepcopy(self.graph)) + return rv + # Many more methods to implement... ################### # Private methods # ################### - def _in_degrees_array(self): - return cp.bincount(self.dst_indices, minlength=self._N) - - def _out_degrees_array(self): - return cp.bincount(self.src_indices, minlength=self._N) + def _in_degrees_array(self, *, ignore_selfloops=False): + dst_indices = self.dst_indices + if ignore_selfloops: + not_selfloops = self.src_indices != dst_indices + dst_indices = dst_indices[not_selfloops] + if dst_indices.size == 0: + return cp.zeros(self._N, dtype=np.int64) + return cp.bincount(dst_indices, minlength=self._N) + + def _out_degrees_array(self, *, ignore_selfloops=False): + src_indices = self.src_indices + if ignore_selfloops: + not_selfloops = src_indices != self.dst_indices + src_indices = src_indices[not_selfloops] + if src_indices.size == 0: + return cp.zeros(self._N, dtype=np.int64) + return cp.bincount(src_indices, minlength=self._N) diff --git a/python/nx-cugraph/nx_cugraph/classes/function.py b/python/nx-cugraph/nx_cugraph/classes/function.py index 633e4abd7f4..7212a4d2da9 100644 --- a/python/nx-cugraph/nx_cugraph/classes/function.py +++ b/python/nx-cugraph/nx_cugraph/classes/function.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,14 +10,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import cupy as cp + from nx_cugraph.convert import _to_graph from nx_cugraph.utils import networkx_algorithm __all__ = ["number_of_selfloops"] -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def number_of_selfloops(G): G = _to_graph(G) is_selfloop = G.src_indices == G.dst_indices - return is_selfloop.sum().tolist() + return int(cp.count_nonzero(is_selfloop)) diff --git a/python/nx-cugraph/nx_cugraph/classes/graph.py b/python/nx-cugraph/nx_cugraph/classes/graph.py index e32f93d8bfe..0951ee6b135 100644 --- a/python/nx-cugraph/nx_cugraph/classes/graph.py +++ b/python/nx-cugraph/nx_cugraph/classes/graph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
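# Aside: a NumPy sketch (not part of the diff) of the edge-encoding trick used
# by to_undirected above and by the symmetrize option further below -- each
# (src, dst) pair becomes the single int64 key N * src + dst, so whole edge sets
# can be unioned or intersected with 1-D set operations and decoded with divmod.
import numpy as np

N = 4  # number of nodes (illustrative)
src = np.array([0, 1, 3], dtype=np.int32)
dst = np.array([1, 0, 2], dtype=np.int32)

keys = N * src.astype(np.int64) + dst    # upcast so N * src cannot overflow int32
keys_T = src + N * dst.astype(np.int64)  # keys of the reversed edges
sym = np.union1d(keys, keys_T)           # "union" symmetrization keeps either direction
src2, dst2 = np.divmod(sym, N)           # decode keys back into endpoints
print(list(zip(src2.tolist(), dst2.tolist())))  # [(0, 1), (1, 0), (2, 3), (3, 2)]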
# You may obtain a copy of the License at @@ -65,6 +65,7 @@ class Graph: key_to_id: dict[NodeKey, IndexValue] | None _id_to_key: list[NodeKey] | None _N: int + _node_ids: cp.ndarray[IndexValue] | None # holds plc.SGGraph.vertices_array data # Used by graph._get_plc_graph _plc_type_map: ClassVar[dict[np.dtype, np.dtype]] = { @@ -116,6 +117,7 @@ def from_coo( new_graph.key_to_id = None if key_to_id is None else dict(key_to_id) new_graph._id_to_key = None if id_to_key is None else list(id_to_key) new_graph._N = op.index(N) # Ensure N is integral + new_graph._node_ids = None new_graph.graph = new_graph.graph_attr_dict_factory() new_graph.graph.update(attr) size = new_graph.src_indices.size @@ -157,6 +159,16 @@ def from_coo( f"(got {new_graph.dst_indices.dtype.name})." ) new_graph.dst_indices = dst_indices + + # If the graph contains isolates, plc.SGGraph() must be passed a value + # for vertices_array that contains every vertex ID, since the + # src/dst_indices arrays will not contain IDs for isolates. Create this + # only if needed. Like src/dst_indices, the _node_ids array must be + # maintained for the lifetime of the plc.SGGraph + isolates = nxcg.algorithms.isolate._isolates(new_graph) + if len(isolates) > 0: + new_graph._node_ids = cp.arange(new_graph._N, dtype=index_dtype) + return new_graph @classmethod @@ -405,6 +417,7 @@ def clear(self) -> None: self.src_indices = cp.empty(0, self.src_indices.dtype) self.dst_indices = cp.empty(0, self.dst_indices.dtype) self._N = 0 + self._node_ids = None self.key_to_id = None self._id_to_key = None @@ -458,6 +471,24 @@ def has_edge(self, u: NodeKey, v: NodeKey) -> bool: return False return bool(((self.src_indices == u) & (self.dst_indices == v)).any()) + def _neighbors(self, n: NodeKey) -> cp.ndarray[NodeValue]: + if n not in self: + hash(n) # To raise TypeError if appropriate + raise nx.NetworkXError( + f"The node {n} is not in the {self.__class__.__name__.lower()}." + ) + if self.key_to_id is not None: + n = self.key_to_id[n] + nbrs = self.dst_indices[self.src_indices == n] + if self.is_multigraph(): + nbrs = cp.unique(nbrs) + return nbrs + + @networkx_api + def neighbors(self, n: NodeKey) -> Iterator[NodeKey]: + nbrs = self._neighbors(n) + return iter(self._nodeiter_to_iter(nbrs.tolist())) + @networkx_api def has_node(self, n: NodeKey) -> bool: return n in self @@ -491,7 +522,7 @@ def size(self, weight: AttrKey | None = None) -> int: if weight is not None: raise NotImplementedError # If no self-edges, then `self.src_indices.size // 2` - return int((self.src_indices <= self.dst_indices).sum()) + return int(cp.count_nonzero(self.src_indices <= self.dst_indices)) @networkx_api def to_directed(self, as_view: bool = False) -> nxcg.DiGraph: @@ -500,7 +531,7 @@ def to_directed(self, as_view: bool = False) -> nxcg.DiGraph: @networkx_api def to_undirected(self, as_view: bool = False) -> Graph: # Does deep copy in networkx - return self.copy(as_view) + return self._copy(as_view, self.to_undirected_class()) # Not implemented... 
# adj, adjacency, add_edge, add_edges_from, add_node, @@ -561,6 +592,7 @@ def _get_plc_graph( store_transposed: bool = False, switch_indices: bool = False, edge_array: cp.ndarray[EdgeValue] | None = None, + symmetrize: str | None = None, ): if edge_array is not None or edge_attr is None: pass @@ -619,11 +651,32 @@ def _get_plc_graph( dst_indices = self.dst_indices if switch_indices: src_indices, dst_indices = dst_indices, src_indices + if symmetrize is not None: + if edge_array is not None: + raise NotImplementedError( + "edge_array must be None when symmetrizing the graph" + ) + N = self._N + # Upcast to int64 so indices don't overflow + src_dst = N * src_indices.astype(np.int64) + dst_indices + src_dst_T = src_indices + N * dst_indices.astype(np.int64) + if symmetrize == "union": + src_dst_new = cp.union1d(src_dst, src_dst_T) + elif symmetrize == "intersection": + src_dst_new = cp.intersect1d(src_dst, src_dst_T) + else: + raise ValueError( + f'symmetrize must be "union" or "intersection"; got "{symmetrize}"' + ) + src_indices, dst_indices = cp.divmod(src_dst_new, N) + src_indices = src_indices.astype(index_dtype) + dst_indices = dst_indices.astype(index_dtype) + return plc.SGGraph( resource_handle=plc.ResourceHandle(), graph_properties=plc.GraphProperties( - is_multigraph=self.is_multigraph(), - is_symmetric=not self.is_directed(), + is_multigraph=self.is_multigraph() and symmetrize is None, + is_symmetric=not self.is_directed() or symmetrize is not None, ), src_or_offset_array=src_indices, dst_or_index_array=dst_indices, @@ -631,6 +684,7 @@ def _get_plc_graph( store_transposed=store_transposed, renumber=False, do_expensive_check=False, + vertices_array=self._node_ids, ) def _sort_edge_indices(self, primary="src"): @@ -680,16 +734,30 @@ def _become(self, other: Graph): self.graph = graph return self - def _degrees_array(self): - degrees = cp.bincount(self.src_indices, minlength=self._N) + def _degrees_array(self, *, ignore_selfloops=False): + src_indices = self.src_indices + dst_indices = self.dst_indices + if ignore_selfloops: + not_selfloops = src_indices != dst_indices + src_indices = src_indices[not_selfloops] + if self.is_directed(): + dst_indices = dst_indices[not_selfloops] + if src_indices.size == 0: + return cp.zeros(self._N, dtype=np.int64) + degrees = cp.bincount(src_indices, minlength=self._N) if self.is_directed(): - degrees += cp.bincount(self.dst_indices, minlength=self._N) + degrees += cp.bincount(dst_indices, minlength=self._N) return degrees _in_degrees_array = _degrees_array _out_degrees_array = _degrees_array # Data conversions + def _nodekeys_to_nodearray(self, nodes: Iterable[NodeKey]) -> cp.array[IndexValue]: + if self.key_to_id is None: + return cp.fromiter(nodes, dtype=index_dtype) + return cp.fromiter(map(self.key_to_id.__getitem__, nodes), dtype=index_dtype) + def _nodeiter_to_iter(self, node_ids: Iterable[IndexValue]) -> Iterable[NodeKey]: """Convert an iterable of node IDs to an iterable of node keys.""" if (id_to_key := self.id_to_key) is not None: @@ -701,6 +769,11 @@ def _nodearray_to_list(self, node_ids: cp.ndarray[IndexValue]) -> list[NodeKey]: return node_ids.tolist() return list(self._nodeiter_to_iter(node_ids.tolist())) + def _list_to_nodearray(self, nodes: list[NodeKey]) -> cp.ndarray[IndexValue]: + if (key_to_id := self.key_to_id) is not None: + nodes = [key_to_id[node] for node in nodes] + return cp.array(nodes, dtype=index_dtype) + def _nodearray_to_set(self, node_ids: cp.ndarray[IndexValue]) -> set[NodeKey]: if self.key_to_id is None: return 
set(node_ids.tolist()) diff --git a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py b/python/nx-cugraph/nx_cugraph/classes/multidigraph.py index 2c7bfc00752..2e7a55a9eb1 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multidigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multidigraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -33,3 +33,11 @@ def is_directed(cls) -> bool: @classmethod def to_networkx_class(cls) -> type[nx.MultiDiGraph]: return nx.MultiDiGraph + + ########################## + # NetworkX graph methods # + ########################## + + @networkx_api + def to_undirected(self, reciprocal=False, as_view=False): + raise NotImplementedError diff --git a/python/nx-cugraph/nx_cugraph/classes/multigraph.py b/python/nx-cugraph/nx_cugraph/classes/multigraph.py index 23466dc7dd4..fb787369e58 100644 --- a/python/nx-cugraph/nx_cugraph/classes/multigraph.py +++ b/python/nx-cugraph/nx_cugraph/classes/multigraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -399,7 +399,7 @@ def to_directed(self, as_view: bool = False) -> nxcg.MultiDiGraph: @networkx_api def to_undirected(self, as_view: bool = False) -> MultiGraph: # Does deep copy in networkx - return self.copy(as_view) + return self._copy(as_view, self.to_undirected_class()) ################### # Private methods # diff --git a/python/nx-cugraph/nx_cugraph/convert.py b/python/nx-cugraph/nx_cugraph/convert.py index 3c0814370d3..f265540a161 100644 --- a/python/nx-cugraph/nx_cugraph/convert.py +++ b/python/nx-cugraph/nx_cugraph/convert.py @@ -39,6 +39,24 @@ REQUIRED = ... +def _iterate_values(graph, adj, is_dicts, func): + # Using `dict.values` is faster and is the common case, but it doesn't always work + if is_dicts is not False: + it = concat(map(dict.values, adj.values())) + if graph is not None and graph.is_multigraph(): + it = concat(map(dict.values, it)) + try: + return func(it), True + except TypeError: + if is_dicts is True: + raise + # May not be regular dicts + it = concat(x.values() for x in adj.values()) + if graph is not None and graph.is_multigraph(): + it = concat(x.values() for x in it) + return func(it), False + + def from_networkx( graph: nx.Graph, edge_attrs: AttrKey | dict[AttrKey, EdgeValue | None] | None = None, @@ -152,6 +170,7 @@ def from_networkx( if isinstance(adj, nx.classes.coreviews.FilterAdjacency): adj = {k: dict(v) for k, v in adj.items()} + is_dicts = None N = len(adj) if ( not preserve_edge_attrs @@ -162,12 +181,9 @@ def from_networkx( # Either we weren't asked to preserve edge attributes, or there are no edges edge_attrs = None elif preserve_edge_attrs: - # Using comprehensions should be just as fast starting in Python 3.11 - it = concat(map(dict.values, adj.values())) - if graph.is_multigraph(): - it = concat(map(dict.values, it)) - # PERF: should we add `filter(None, ...)` to remove empty data dicts? 
- attr_sets = set(map(frozenset, it)) + attr_sets, is_dicts = _iterate_values( + graph, adj, is_dicts, lambda it: set(map(frozenset, it)) + ) attrs = frozenset.union(*attr_sets) edge_attrs = dict.fromkeys(attrs, REQUIRED) if len(attr_sets) > 1: @@ -207,10 +223,9 @@ def from_networkx( del edge_attrs[attr] # Else some edges have attribute (default already None) else: - it = concat(map(dict.values, adj.values())) - if graph.is_multigraph(): - it = concat(map(dict.values, it)) - attr_sets = set(map(required.intersection, it)) + attr_sets, is_dicts = _iterate_values( + graph, adj, is_dicts, lambda it: set(map(required.intersection, it)) + ) for attr in required - frozenset.union(*attr_sets): # No edges have these attributes del edge_attrs[attr] @@ -269,17 +284,19 @@ def from_networkx( dst_iter = map(key_to_id.__getitem__, dst_iter) if graph.is_multigraph(): dst_indices = np.fromiter(dst_iter, index_dtype) - num_multiedges = np.fromiter( - map(len, concat(map(dict.values, adj.values()))), index_dtype + num_multiedges, is_dicts = _iterate_values( + None, adj, is_dicts, lambda it: np.fromiter(map(len, it), index_dtype) ) # cp.repeat is slow to use here, so use numpy instead dst_indices = cp.array(np.repeat(dst_indices, num_multiedges)) # Determine edge keys and edge ids for multigraphs - edge_keys = list(concat(concat(map(dict.values, adj.values())))) - edge_indices = cp.fromiter( - concat(map(range, map(len, concat(map(dict.values, adj.values()))))), - index_dtype, - ) + if is_dicts: + edge_keys = list(concat(concat(map(dict.values, adj.values())))) + it = concat(map(dict.values, adj.values())) + else: + edge_keys = list(concat(concat(x.values() for x in adj.values()))) + it = concat(x.values() for x in adj.values()) + edge_indices = cp.fromiter(concat(map(range, map(len, it))), index_dtype) if edge_keys == edge_indices.tolist(): edge_keys = None # Prefer edge_indices else: @@ -323,19 +340,21 @@ def from_networkx( edge_masks[edge_attr] = cp.fromiter(iter_mask, bool) edge_values[edge_attr] = cp.array(vals, dtype) # if vals.ndim > 1: ... + elif edge_default is REQUIRED: + if dtype is None: + + def func(it, edge_attr=edge_attr): + return cp.array(list(map(op.itemgetter(edge_attr), it))) + + else: + + def func(it, edge_attr=edge_attr, dtype=dtype): + return cp.fromiter(map(op.itemgetter(edge_attr), it), dtype) + + edge_value, is_dicts = _iterate_values(graph, adj, is_dicts, func) + edge_values[edge_attr] = edge_value else: - if edge_default is REQUIRED: - # Using comprehensions should be fast starting in Python 3.11 - # iter_values = ( - # edgedata[edge_attr] - # for rowdata in adj.values() - # for edgedata in rowdata.values() - # ) - it = concat(map(dict.values, adj.values())) - if graph.is_multigraph(): - it = concat(map(dict.values, it)) - iter_values = map(op.itemgetter(edge_attr), it) - elif graph.is_multigraph(): + if graph.is_multigraph(): iter_values = ( edgedata.get(edge_attr, edge_default) for rowdata in adj.values() @@ -352,7 +371,7 @@ def from_networkx( edge_values[edge_attr] = cp.array(list(iter_values)) else: edge_values[edge_attr] = cp.fromiter(iter_values, dtype) - # if vals.ndim > 1: ... + # if vals.ndim > 1: ... 
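# Aside: a small self-contained sketch (not part of the diff) of why
# _iterate_values above tries `map(dict.values, ...)` first and falls back to
# `x.values()`. The unbound dict.values is fast but rejects dict lookalikes
# with TypeError, which is exactly the signal used to switch code paths.
from collections import UserDict
from itertools import chain

plain = {0: {1: {"w": 1.0}}}
lookalike = {0: UserDict({1: {"w": 1.0}})}

assert list(chain.from_iterable(map(dict.values, plain.values())))
try:
    list(chain.from_iterable(map(dict.values, lookalike.values())))
except TypeError:
    # Not a real dict, so use the slower but generic protocol instead.
    vals = list(chain.from_iterable(x.values() for x in lookalike.values()))
    assert vals == [{"w": 1.0}]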
 # cp.repeat is slow to use here, so use numpy instead
 src_indices = np.repeat(
diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py
index 6c8b8fb4a1d..1a2ecde9b8c 100644
--- a/python/nx-cugraph/nx_cugraph/convert_matrix.py
+++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -23,7 +23,8 @@
 ]
 
 
-@networkx_algorithm
+# Value columns with string dtype are not supported
+@networkx_algorithm(is_incomplete=True, version_added="23.12")
 def from_pandas_edgelist(
     df,
     source="source",
@@ -32,10 +33,11 @@ def from_pandas_edgelist(
     create_using=None,
     edge_key=None,
 ):
-    """cudf.DataFrame inputs also supported."""
+    """cudf.DataFrame inputs also supported; value columns with str dtype are unsupported."""
     graph_class, inplace = _create_using_class(create_using)
     src_array = df[source].to_numpy()
     dst_array = df[target].to_numpy()
+    # TODO: create renumbering helper function(s)
     # Renumber step 0: node keys
     nodes = np.unique(np.concatenate([src_array, dst_array]))
     N = nodes.size
@@ -119,7 +121,7 @@ def from_pandas_edgelist(
     return G
 
 
-@networkx_algorithm
+@networkx_algorithm(version_added="23.12")
 def from_scipy_sparse_array(
     A, parallel_edges=False, create_using=None, edge_attribute="weight"
 ):
diff --git a/python/nx-cugraph/nx_cugraph/generators/classic.py b/python/nx-cugraph/nx_cugraph/generators/classic.py
index b196c232320..a548beea34f 100644
--- a/python/nx-cugraph/nx_cugraph/generators/classic.py
+++ b/python/nx-cugraph/nx_cugraph/generators/classic.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
# You may obtain a copy of the License at @@ -19,7 +19,7 @@ import nx_cugraph as nxcg -from ..utils import _get_int_dtype, index_dtype, networkx_algorithm, nodes_or_number +from ..utils import _get_int_dtype, index_dtype, networkx_algorithm from ._utils import ( _IS_NX32_OR_LESS, _common_small_graph, @@ -51,7 +51,7 @@ concat = itertools.chain.from_iterable -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def barbell_graph(m1, m2, create_using=None): # Like two complete graphs and a path_graph m1 = _ensure_nonnegative_int(m1) @@ -81,13 +81,12 @@ def barbell_graph(m1, m2, create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def circular_ladder_graph(n, create_using=None): return _ladder_graph(n, create_using, is_circular=True) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def complete_graph(n, create_using=None): n, nodes = _number_and_nodes(n) if n < 3: @@ -100,7 +99,7 @@ def complete_graph(n, create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def complete_multipartite_graph(*subset_sizes): if not subset_sizes: return nxcg.Graph() @@ -143,8 +142,7 @@ def complete_multipartite_graph(*subset_sizes): ) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def cycle_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) @@ -174,8 +172,7 @@ def cycle_graph(n, create_using=None): return G -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def empty_graph(n=0, create_using=None, default=nx.Graph): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using, default=default) @@ -237,13 +234,12 @@ def _ladder_graph(n, create_using, *, is_circular=False): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def ladder_graph(n, create_using=None): return _ladder_graph(n, create_using) -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12") def lollipop_graph(m, n, create_using=None): # Like complete_graph then path_graph orig_m, unused_nodes_m = m @@ -278,13 +274,12 @@ def lollipop_graph(m, n, create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def null_graph(create_using=None): return _common_small_graph(0, None, create_using) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def path_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) @@ -304,8 +299,7 @@ def path_graph(n, create_using=None): return G -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def star_graph(n, create_using=None): orig_n, orig_nodes = n n, nodes = _number_and_nodes(n) @@ -329,8 +323,7 @@ def star_graph(n, create_using=None): return G -@nodes_or_number([0, 1]) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=[0, 1], version_added="23.12") def tadpole_graph(m, n, create_using=None): orig_m, unused_nodes_m = m orig_n, unused_nodes_n = n @@ -368,12 +361,12 @@ def tadpole_graph(m, n, create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def trivial_graph(create_using=None): return _common_small_graph(1, None, create_using) -@networkx_algorithm 
+@networkx_algorithm(version_added="23.12") def turan_graph(n, r): if not 1 <= r <= n: raise nx.NetworkXError("Must satisfy 1 <= r <= n") @@ -382,8 +375,7 @@ def turan_graph(n, r): return complete_multipartite_graph(*partitions) -@nodes_or_number(0) -@networkx_algorithm +@networkx_algorithm(nodes_or_number=0, version_added="23.12") def wheel_graph(n, create_using=None): n, nodes = _number_and_nodes(n) graph_class, inplace = _create_using_class(create_using) diff --git a/python/nx-cugraph/nx_cugraph/generators/community.py b/python/nx-cugraph/nx_cugraph/generators/community.py index e5cb03e8cc0..9b0e0848de9 100644 --- a/python/nx-cugraph/nx_cugraph/generators/community.py +++ b/python/nx-cugraph/nx_cugraph/generators/community.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -27,7 +27,7 @@ ] -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def caveman_graph(l, k): # noqa: E741 l = _ensure_int(l) # noqa: E741 k = _ensure_int(k) diff --git a/python/nx-cugraph/nx_cugraph/generators/small.py b/python/nx-cugraph/nx_cugraph/generators/small.py index b9a189c31d5..45487571cda 100644 --- a/python/nx-cugraph/nx_cugraph/generators/small.py +++ b/python/nx-cugraph/nx_cugraph/generators/small.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -43,7 +43,7 @@ ] -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def bull_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -56,7 +56,7 @@ def bull_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def chvatal_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -85,7 +85,7 @@ def chvatal_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def cubical_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -105,7 +105,7 @@ def cubical_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def desargues_graph(create_using=None): # This can also be defined w.r.t. LCF_graph graph_class, inplace = _create_using_class(create_using) @@ -146,7 +146,7 @@ def desargues_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def diamond_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -159,7 +159,7 @@ def diamond_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def dodecahedral_graph(create_using=None): # This can also be defined w.r.t. 
LCF_graph graph_class, inplace = _create_using_class(create_using) @@ -200,7 +200,7 @@ def dodecahedral_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def frucht_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -235,7 +235,7 @@ def frucht_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def heawood_graph(create_using=None): # This can also be defined w.r.t. LCF_graph graph_class, inplace = _create_using_class(create_using) @@ -274,7 +274,7 @@ def heawood_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def house_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -287,7 +287,7 @@ def house_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def house_x_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -306,7 +306,7 @@ def house_x_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def icosahedral_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -337,7 +337,7 @@ def icosahedral_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def krackhardt_kite_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -366,7 +366,7 @@ def krackhardt_kite_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def moebius_kantor_graph(create_using=None): # This can also be defined w.r.t. LCF_graph graph_class, inplace = _create_using_class(create_using) @@ -407,7 +407,7 @@ def moebius_kantor_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def octahedral_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -428,7 +428,7 @@ def octahedral_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def pappus_graph(): # This can also be defined w.r.t. LCF_graph # fmt: off @@ -452,7 +452,7 @@ def pappus_graph(): return nxcg.Graph.from_coo(18, src_indices, dst_indices, name="Pappus Graph") -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def petersen_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -479,7 +479,7 @@ def petersen_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def sedgewick_maze_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -500,7 +500,7 @@ def sedgewick_maze_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def tetrahedral_graph(create_using=None): # This can also be defined w.r.t. 
complete_graph graph_class, inplace = _create_using_class(create_using) @@ -517,7 +517,7 @@ def tetrahedral_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def truncated_cube_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -548,7 +548,7 @@ def truncated_cube_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def truncated_tetrahedron_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): @@ -583,7 +583,7 @@ def truncated_tetrahedron_graph(create_using=None): return G -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def tutte_graph(create_using=None): graph_class, inplace = _create_using_class(create_using) if graph_class.is_directed(): diff --git a/python/nx-cugraph/nx_cugraph/generators/social.py b/python/nx-cugraph/nx_cugraph/generators/social.py index 3c936d07af3..07e82c63fbf 100644 --- a/python/nx-cugraph/nx_cugraph/generators/social.py +++ b/python/nx-cugraph/nx_cugraph/generators/social.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -25,7 +25,7 @@ ] -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def davis_southern_women_graph(): # fmt: off src_indices = cp.array( @@ -88,7 +88,7 @@ def davis_southern_women_graph(): ) -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def florentine_families_graph(): # fmt: off src_indices = cp.array( @@ -114,7 +114,7 @@ def florentine_families_graph(): return nxcg.Graph.from_coo(15, src_indices, dst_indices, id_to_key=nodes) -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def karate_club_graph(): # fmt: off src_indices = cp.array( @@ -175,7 +175,7 @@ def karate_club_graph(): ) -@networkx_algorithm +@networkx_algorithm(version_added="23.12") def les_miserables_graph(): # fmt: off src_indices = cp.array( diff --git a/python/nx-cugraph/nx_cugraph/interface.py b/python/nx-cugraph/nx_cugraph/interface.py index be6b3596030..46ea5831b0b 100644 --- a/python/nx-cugraph/nx_cugraph/interface.py +++ b/python/nx-cugraph/nx_cugraph/interface.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -12,6 +12,7 @@ # limitations under the License. from __future__ import annotations +import os import sys import networkx as nx @@ -67,25 +68,42 @@ def key(testpath): louvain_different = "Louvain may be different due to RNG" no_string_dtype = "string edge values not currently supported" - xfail = {} + xfail = { + # This is removed while strongly_connected_components() is not + # dispatchable. See algorithms/components/strongly_connected.py for + # details. 
+ # + # key( + # "test_strongly_connected.py:" + # "TestStronglyConnected.test_condensation_mapping_and_members" + # ): "Strongly connected groups in different iteration order", + } from packaging.version import parse nxver = parse(nx.__version__) if nxver.major == 3 and nxver.minor <= 2: - # Networkx versions prior to 3.2.1 have tests written to expect - # sp.sparse.linalg.ArpackNoConvergence exceptions raised on no - # convergence in HITS. Newer versions since the merge of - # https://github.com/networkx/networkx/pull/7084 expect - # nx.PowerIterationFailedConvergence, which is what nx_cugraph.hits - # raises, so we mark them as xfail for previous versions of NX. xfail.update( { + # NetworkX versions prior to 3.2.1 have tests written to + # expect sp.sparse.linalg.ArpackNoConvergence exceptions + # raised on no convergence in HITS. Newer versions since + # the merge of + # https://github.com/networkx/networkx/pull/7084 expect + # nx.PowerIterationFailedConvergence, which is what + # nx_cugraph.hits raises, so we mark them as xfail for + # previous versions of NX. key( "test_hits.py:TestHITS.test_hits_not_convergent" ): "nx_cugraph.hits raises updated exceptions not caught in " "these tests", + # NetworkX versions 3.2 and older contain tests that fail + # with pytest>=8. Assume pytest>=8 and mark xfail. + key( + "test_strongly_connected.py:" + "TestStronglyConnected.test_connected_raise" + ): "test is incompatible with pytest>=8", } ) @@ -241,12 +259,45 @@ def key(testpath): ) too_slow = "Too slow to run" - maybe_oom = "out of memory in CI" skip = { key("test_tree_isomorphism.py:test_positive"): too_slow, key("test_tree_isomorphism.py:test_negative"): too_slow, - key("test_efficiency.py:TestEfficiency.test_using_ego_graph"): maybe_oom, + # These repeatedly call `bfs_layers`, which converts the graph every call + key( + "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph2_different_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph3_same_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph3_different_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp.test_custom_graph4_same_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp." + "test_disconnected_graph_all_same_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp." + "test_disconnected_graph_all_different_labels" + ): too_slow, + key( + "test_vf2pp.py:TestGraphISOVF2pp." + "test_disconnected_graph_some_same_labels" + ): too_slow, + key( + "test_vf2pp.py:TestMultiGraphISOVF2pp." + "test_custom_multigraph3_same_labels" + ): too_slow, + key( + "test_vf2pp_helpers.py:TestNodeOrdering." + "test_matching_order_all_branches" + ): too_slow, } + if os.environ.get("PYTEST_NO_SKIP", False): + skip.clear() for item in items: kset = set(item.keywords) diff --git a/python/nx-cugraph/nx_cugraph/scripts/__init__.py b/python/nx-cugraph/nx_cugraph/scripts/__init__.py new file mode 100644 index 00000000000..aeae6078111 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/scripts/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/nx-cugraph/nx_cugraph/scripts/__main__.py b/python/nx-cugraph/nx_cugraph/scripts/__main__.py new file mode 100755 index 00000000000..c0963e64cc5 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/scripts/__main__.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +if __name__ == "__main__": + import argparse + + from nx_cugraph.scripts import print_table, print_tree + + parser = argparse.ArgumentParser( + parents=[ + print_table.get_argumentparser(add_help=False), + print_tree.get_argumentparser(add_help=False), + ], + description="Print info about functions implemented by nx-cugraph", + ) + parser.add_argument("action", choices=["print_table", "print_tree"]) + args = parser.parse_args() + if args.action == "print_table": + print_table.main() + else: + print_tree.main( + by=args.by, + networkx_path=args.networkx_path, + dispatch_name=args.dispatch_name or args.dispatch_name_always, + version_added=args.version_added, + plc=args.plc, + dispatch_name_if_different=not args.dispatch_name_always, + ) diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_table.py b/python/nx-cugraph/nx_cugraph/scripts/print_table.py new file mode 100755 index 00000000000..117a1444f48 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/scripts/print_table.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
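# Aside: a minimal sketch (not part of the diff) of the argparse `parents`
# pattern __main__.py above uses to merge the sub-tools' CLIs into one parser.
# The names here are illustrative, not the real print_table/print_tree parsers.
import argparse

def get_child_argumentparser(add_help=False):
    # Parents must be built with add_help=False to avoid duplicate -h options.
    parser = argparse.ArgumentParser(add_help=add_help)
    parser.add_argument("--verbose", action="store_true")
    return parser

parser = argparse.ArgumentParser(
    parents=[get_child_argumentparser()],  # inherits --verbose
    description="Composed CLI",
)
parser.add_argument("action", choices=["print_table", "print_tree"])
args = parser.parse_args(["print_table", "--verbose"])
assert args.action == "print_table" and args.verbose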
+import argparse +import sys +from collections import namedtuple + +from networkx.utils.backends import _registered_algorithms as algos + +from _nx_cugraph import get_info +from nx_cugraph.interface import BackendInterface + + +def get_funcpath(func): + return f"{func.__module__}.{func.__name__}" + + +def get_path_to_name(): + return { + get_funcpath(algos[funcname]): funcname + for funcname in get_info()["functions"].keys() & algos.keys() + } + + +Info = namedtuple( + "Info", + "networkx_path, dispatch_name, version_added, plc, is_incomplete, is_different", +) + + +def get_path_to_info(path_to_name=None, version_added_sep=".", plc_sep="/"): + if path_to_name is None: + path_to_name = get_path_to_name() + rv = {} + for funcpath in sorted(path_to_name): + funcname = path_to_name[funcpath] + cufunc = getattr(BackendInterface, funcname) + plc = plc_sep.join(sorted(cufunc._plc_names)) if cufunc._plc_names else "" + version_added = cufunc.version_added.replace(".", version_added_sep) + is_incomplete = cufunc.is_incomplete + is_different = cufunc.is_different + rv[funcpath] = Info( + funcpath, funcname, version_added, plc, is_incomplete, is_different + ) + return rv + + +def main(path_to_info=None, *, file=sys.stdout): + if path_to_info is None: + path_to_info = get_path_to_info(version_added_sep=".") + lines = ["networkx_path,dispatch_name,version_added,plc,is_incomplete,is_different"] + lines.extend(",".join(map(str, info)) for info in path_to_info.values()) + text = "\n".join(lines) + print(text, file=file) + return text + + +def get_argumentparser(add_help=True): + return argparse.ArgumentParser( + description="Print info about functions implemented by nx-cugraph as CSV", + add_help=add_help, + ) + + +if __name__ == "__main__": + parser = get_argumentparser() + args = parser.parse_args() + main() diff --git a/python/nx-cugraph/nx_cugraph/scripts/print_tree.py b/python/nx-cugraph/nx_cugraph/scripts/print_tree.py new file mode 100755 index 00000000000..485873a447d --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/scripts/print_tree.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
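# Aside: a self-contained sketch (not part of the diff) of how print_table.py
# above turns Info records into CSV -- each namedtuple is joined field by field
# under a fixed header. The sample row below is made up for illustration only.
from collections import namedtuple

Info = namedtuple(
    "Info",
    "networkx_path, dispatch_name, version_added, plc, is_incomplete, is_different",
)
rows = [Info("networkx.algorithms.example_func", "example_func", "24.02", "bfs", False, False)]
lines = ["networkx_path,dispatch_name,version_added,plc,is_incomplete,is_different"]
lines.extend(",".join(map(str, info)) for info in rows)
print("\n".join(lines))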
+import argparse +import re +import sys + +import networkx as nx + +from nx_cugraph.scripts.print_table import get_path_to_info + + +def add_branch(G, path, extra="", *, skip=0): + branch = path.split(".") + prev = ".".join(branch[: skip + 1]) + for i in range(skip + 2, len(branch)): + cur = ".".join(branch[:i]) + G.add_edge(prev, cur) + prev = cur + if extra: + if not isinstance(extra, str): + extra = ", ".join(extra) + path += f" ({extra})" + G.add_edge(prev, path) + + +def get_extra( + info, + *, + networkx_path=False, + dispatch_name=False, + version_added=False, + plc=False, + dispatch_name_if_different=False, + incomplete=False, + different=False, +): + extra = [] + if networkx_path: + extra.append(info.networkx_path) + if dispatch_name and ( + not dispatch_name_if_different + or info.dispatch_name != info.networkx_path.rsplit(".", 1)[-1] + ): + extra.append(info.dispatch_name) + if version_added: + v = info.version_added + if len(v) != 5: + raise ValueError(f"Is there something wrong with version: {v!r}?") + extra.append(v[:2] + "." + v[-2:]) + if plc and info.plc: + extra.append(info.plc) + if incomplete and info.is_incomplete: + extra.append("is-incomplete") + if different and info.is_different: + extra.append("is-different") + return extra + + +def create_tree( + path_to_info=None, + *, + by="networkx_path", + skip=0, + networkx_path=False, + dispatch_name=False, + version_added=False, + plc=False, + dispatch_name_if_different=False, + incomplete=False, + different=False, + prefix="", +): + if path_to_info is None: + path_to_info = get_path_to_info() + if isinstance(by, str): + by = [by] + G = nx.DiGraph() + for info in sorted( + path_to_info.values(), + key=lambda x: (*(getattr(x, b) for b in by), x.networkx_path), + ): + if not all(getattr(info, b) for b in by): + continue + path = prefix + ".".join(getattr(info, b) for b in by) + extra = get_extra( + info, + networkx_path=networkx_path, + dispatch_name=dispatch_name, + version_added=version_added, + plc=plc, + dispatch_name_if_different=dispatch_name_if_different, + incomplete=incomplete, + different=different, + ) + add_branch(G, path, extra=extra, skip=skip) + return G + + +def main( + path_to_info=None, + *, + by="networkx_path", + networkx_path=False, + dispatch_name=False, + version_added=False, + plc=False, + dispatch_name_if_different=True, + incomplete=False, + different=False, + file=sys.stdout, +): + if path_to_info is None: + path_to_info = get_path_to_info(version_added_sep="-") + kwargs = { + "networkx_path": networkx_path, + "dispatch_name": dispatch_name, + "version_added": version_added, + "plc": plc, + "dispatch_name_if_different": dispatch_name_if_different, + "incomplete": incomplete, + "different": different, + } + if by == "networkx_path": + G = create_tree(path_to_info, by="networkx_path", **kwargs) + text = re.sub( + r" [A-Za-z_\./]+\.", " ", ("\n".join(nx.generate_network_text(G))) + ) + elif by == "plc": + G = create_tree( + path_to_info, by=["plc", "networkx_path"], prefix="plc-", **kwargs + ) + text = re.sub( + "plc-", + "plc.", + re.sub( + r" plc-[A-Za-z_\./]*\.", + " ", + "\n".join(nx.generate_network_text(G)), + ), + ) + elif by == "version_added": + G = create_tree( + path_to_info, + by=["version_added", "networkx_path"], + prefix="version_added-", + **kwargs, + ) + text = re.sub( + "version_added-", + "version: ", + re.sub( + r" version_added-[-0-9A-Za-z_\./]*\.", + " ", + "\n".join(nx.generate_network_text(G)), + ), + ).replace("-", ".") + else: + raise ValueError( + "`by` argument should be 
one of {'networkx_path', 'plc', 'version_added'}; "
+            f"got: {by}"
+        )
+    print(text, file=file)
+    return text
+
+
+def get_argumentparser(add_help=True):
+    parser = argparse.ArgumentParser(
+        "Print a tree showing NetworkX functions implemented by nx-cugraph",
+        add_help=add_help,
+    )
+    parser.add_argument(
+        "--by",
+        choices=["networkx_path", "plc", "version_added"],
+        default="networkx_path",
+        help="How to group functions",
+    )
+    parser.add_argument(
+        "--dispatch-name",
+        "--dispatch_name",
+        action="store_true",
+        help="Show the dispatch name in parentheses if different from NetworkX name",
+    )
+    parser.add_argument(
+        "--dispatch-name-always",
+        "--dispatch_name_always",
+        action="store_true",
+        help="Always show the dispatch name in parentheses",
+    )
+    parser.add_argument(
+        "--plc",
+        "--pylibcugraph",
+        action="store_true",
+        help="Show the used pylibcugraph function in parentheses",
+    )
+    parser.add_argument(
+        "--version-added",
+        "--version_added",
+        action="store_true",
+        help="Show the version added in parentheses",
+    )
+    parser.add_argument(
+        "--networkx-path",
+        "--networkx_path",
+        action="store_true",
+        help="Show the full networkx path in parentheses",
+    )
+    parser.add_argument(
+        "--incomplete",
+        action="store_true",
+        help="Show which functions are incomplete",
+    )
+    parser.add_argument(
+        "--different",
+        action="store_true",
+        help="Show which functions are different",
+    )
+    return parser
+
+
+if __name__ == "__main__":
+    parser = get_argumentparser()
+    args = parser.parse_args()
+    main(
+        by=args.by,
+        networkx_path=args.networkx_path,
+        dispatch_name=args.dispatch_name or args.dispatch_name_always,
+        version_added=args.version_added,
+        plc=args.plc,
+        dispatch_name_if_different=not args.dispatch_name_always,
+        incomplete=args.incomplete,
+        different=args.different,
+    )
diff --git a/python/nx-cugraph/nx_cugraph/tests/__init__.py b/python/nx-cugraph/nx_cugraph/tests/__init__.py
index ce94db52fa2..c2002fd3fb9 100644
--- a/python/nx-cugraph/nx_cugraph/tests/__init__.py
+++ b/python/nx-cugraph/nx_cugraph/tests/__init__.py
@@ -1,5 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-#
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
diff --git a/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py b/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py
new file mode 100644
index 00000000000..7047f0eeafd
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/ensure_algos_covered.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Ensure that all functions wrapped by @networkx_algorithm were called.
+
+This file is run by CI and should not normally be run manually.
+""" +import inspect +import json +from pathlib import Path + +from nx_cugraph.interface import BackendInterface +from nx_cugraph.utils import networkx_algorithm + +with Path("coverage.json").open() as f: + coverage = json.load(f) + +filenames_to_executed_lines = { + "nx_cugraph/" + + filename.rsplit("nx_cugraph/", 1)[-1]: set(coverage_info["executed_lines"]) + for filename, coverage_info in coverage["files"].items() +} + + +def unwrap(func): + while hasattr(func, "__wrapped__"): + func = func.__wrapped__ + return func + + +def get_func_filename(func): + return "nx_cugraph" + inspect.getfile(unwrap(func)).rsplit("nx_cugraph", 1)[-1] + + +def get_func_linenos(func): + lines, lineno = inspect.getsourcelines(unwrap(func)) + for i, line in enumerate(lines, lineno): + if ":\n" in line: + return set(range(i + 1, lineno + len(lines))) + raise RuntimeError(f"Could not determine line numbers for function {func}") + + +def has_any_coverage(func): + return bool( + filenames_to_executed_lines[get_func_filename(func)] & get_func_linenos(func) + ) + + +def main(): + no_coverage = set() + for attr, func in vars(BackendInterface).items(): + if not isinstance(func, networkx_algorithm): + continue + if not has_any_coverage(func): + no_coverage.add(attr) + if no_coverage: + msg = "The following algorithms have no coverage: " + ", ".join( + sorted(no_coverage) + ) + # Create a border of "!" + msg = ( + "\n\n" + + "!" * (len(msg) + 6) + + "\n!! " + + msg + + " !!\n" + + "!" * (len(msg) + 6) + + "\n" + ) + raise AssertionError(msg) + print("\nSuccess: coverage determined all algorithms were called!\n") + + +if __name__ == "__main__": + main() diff --git a/python/nx-cugraph/nx_cugraph/tests/test_bfs.py b/python/nx-cugraph/nx_cugraph/tests/test_bfs.py new file mode 100644 index 00000000000..c2b22e98949 --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_bfs.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import networkx as nx +import pytest +from packaging.version import parse + +nxver = parse(nx.__version__) + +if nxver.major == 3 and nxver.minor < 2: + pytest.skip("Need NetworkX >=3.2 to test clustering", allow_module_level=True) + + +def test_generic_bfs_edges(): + # generic_bfs_edges currently isn't exercised by networkx tests + Gnx = nx.karate_club_graph() + Gcg = nx.karate_club_graph(backend="cugraph") + for depth_limit in (0, 1, 2): + for source in Gnx: + # Some ordering is arbitrary, so I think there's a chance + # this test may fail if networkx or nx-cugraph changes. 
+            nx_result = nx.generic_bfs_edges(Gnx, source, depth_limit=depth_limit)
+            cg_result = nx.generic_bfs_edges(Gcg, source, depth_limit=depth_limit)
+            assert sorted(nx_result) == sorted(cg_result), (source, depth_limit)
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_cluster.py b/python/nx-cugraph/nx_cugraph/tests/test_cluster.py
new file mode 100644
index 00000000000..ad4770f1ab8
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/test_cluster.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+import pytest
+from packaging.version import parse
+
+nxver = parse(nx.__version__)
+
+if nxver.major == 3 and nxver.minor < 2:
+    pytest.skip("Need NetworkX >=3.2 to test clustering", allow_module_level=True)
+
+
+def test_selfloops():
+    G = nx.complete_graph(5)
+    H = nx.complete_graph(5)
+    H.add_edge(0, 0)
+    H.add_edge(1, 1)
+    H.add_edge(2, 2)
+    # triangles
+    expected = nx.triangles(G)
+    assert expected == nx.triangles(H)
+    assert expected == nx.triangles(G, backend="cugraph")
+    assert expected == nx.triangles(H, backend="cugraph")
+    # average_clustering
+    expected = nx.average_clustering(G)
+    assert expected == nx.average_clustering(H)
+    assert expected == nx.average_clustering(G, backend="cugraph")
+    assert expected == nx.average_clustering(H, backend="cugraph")
+    # clustering
+    expected = nx.clustering(G)
+    assert expected == nx.clustering(H)
+    assert expected == nx.clustering(G, backend="cugraph")
+    assert expected == nx.clustering(H, backend="cugraph")
+    # transitivity
+    expected = nx.transitivity(G)
+    assert expected == nx.transitivity(H)
+    assert expected == nx.transitivity(G, backend="cugraph")
+    assert expected == nx.transitivity(H, backend="cugraph")
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_connected.py b/python/nx-cugraph/nx_cugraph/tests/test_connected.py
new file mode 100644
index 00000000000..fa9f283abc0
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/test_connected.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
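+
+# The test below checks that an isolated node forms its own component and that
+# nx-cugraph agrees with NetworkX on each connected-components entry point.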
+import networkx as nx
+
+import nx_cugraph as nxcg
+
+
+def test_connected_isolated_nodes():
+    G = nx.complete_graph(4)
+    G.add_node(max(G) + 1)
+    assert nx.is_connected(G) is False
+    assert nxcg.is_connected(G) is False
+    assert nx.number_connected_components(G) == 2
+    assert nxcg.number_connected_components(G) == 2
+    assert sorted(nx.connected_components(G)) == [{0, 1, 2, 3}, {4}]
+    assert sorted(nxcg.connected_components(G)) == [{0, 1, 2, 3}, {4}]
+    assert nx.node_connected_component(G, 0) == {0, 1, 2, 3}
+    assert nxcg.node_connected_component(G, 0) == {0, 1, 2, 3}
+    assert nx.node_connected_component(G, 4) == {4}
+    assert nxcg.node_connected_component(G, 4) == {4}
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_generators.py b/python/nx-cugraph/nx_cugraph/tests/test_generators.py
index 511f8dcd8e2..c751b0fe2b3 100644
--- a/python/nx-cugraph/nx_cugraph/tests/test_generators.py
+++ b/python/nx-cugraph/nx_cugraph/tests/test_generators.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -17,30 +17,9 @@ import nx_cugraph as nxcg
 
-nxver = parse(nx.__version__)
-
+from .testing_utils import assert_graphs_equal
 
-def assert_graphs_equal(Gnx, Gcg):
-    assert isinstance(Gnx, nx.Graph)
-    assert isinstance(Gcg, nxcg.Graph)
-    assert Gnx.number_of_nodes() == Gcg.number_of_nodes()
-    assert Gnx.number_of_edges() == Gcg.number_of_edges()
-    assert Gnx.is_directed() == Gcg.is_directed()
-    assert Gnx.is_multigraph() == Gcg.is_multigraph()
-    G = nxcg.to_networkx(Gcg)
-    rv = nx.utils.graphs_equal(G, Gnx)
-    if not rv:
-        print("GRAPHS ARE NOT EQUAL!")
-        assert sorted(G) == sorted(Gnx)
-        assert sorted(G._adj) == sorted(Gnx._adj)
-        assert sorted(G._node) == sorted(Gnx._node)
-        for k in sorted(G._adj):
-            print(k, sorted(G._adj[k]), sorted(Gnx._adj[k]))
-        print(nx.to_scipy_sparse_array(G).todense())
-        print(nx.to_scipy_sparse_array(Gnx).todense())
-        print(G.graph)
-        print(Gnx.graph)
-    assert rv
+nxver = parse(nx.__version__)
 
 
 if nxver.major == 3 and nxver.minor < 2:
diff --git a/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py b/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py
new file mode 100644
index 00000000000..3120995a2b2
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/test_graph_methods.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
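+
+# The helper below builds small digraphs (with and without edge data and self
+# loops) so that to_undirected/to_directed round-trips are compared against
+# NetworkX on each variant.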
+import networkx as nx
+import pytest
+
+import nx_cugraph as nxcg
+
+from .testing_utils import assert_graphs_equal
+
+
+def _create_Gs():
+    rv = []
+    rv.append(nx.DiGraph())
+    G = nx.DiGraph()
+    G.add_edge(0, 1)
+    G.add_edge(1, 0)
+    rv.append(G)
+    G = G.copy()
+    G.add_edge(0, 2)
+    rv.append(G)
+    G = G.copy()
+    G.add_edge(1, 1)
+    rv.append(G)
+    G = nx.DiGraph()
+    G.add_edge(0, 1, x=1, y=2)
+    G.add_edge(1, 0, x=10, z=3)
+    rv.append(G)
+    G = G.copy()
+    G.add_edge(0, 2, a=42)
+    rv.append(G)
+    G = G.copy()
+    G.add_edge(1, 1, a=4)
+    rv.append(G)
+    return rv
+
+
+@pytest.mark.parametrize("Gnx", _create_Gs())
+@pytest.mark.parametrize("reciprocal", [False, True])
+def test_to_undirected_directed(Gnx, reciprocal):
+    Gcg = nxcg.DiGraph(Gnx)
+    assert_graphs_equal(Gnx, Gcg)
+    Hnx1 = Gnx.to_undirected(reciprocal=reciprocal)
+    Hcg1 = Gcg.to_undirected(reciprocal=reciprocal)
+    assert_graphs_equal(Hnx1, Hcg1)
+    Hnx2 = Hnx1.to_directed()
+    Hcg2 = Hcg1.to_directed()
+    assert_graphs_equal(Hnx2, Hcg2)
+
+
+def test_multidigraph_to_undirected():
+    Gnx = nx.MultiDiGraph()
+    Gnx.add_edge(0, 1)
+    Gnx.add_edge(0, 1)
+    Gnx.add_edge(1, 0)
+    Gcg = nxcg.MultiDiGraph(Gnx)
+    with pytest.raises(NotImplementedError):
+        Gcg.to_undirected()
diff --git a/python/nx-cugraph/nx_cugraph/tests/testing_utils.py b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py
new file mode 100644
index 00000000000..6d4741c9ca6
--- /dev/null
+++ b/python/nx-cugraph/nx_cugraph/tests/testing_utils.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import networkx as nx
+
+import nx_cugraph as nxcg
+
+
+def assert_graphs_equal(Gnx, Gcg):
+    assert isinstance(Gnx, nx.Graph)
+    assert isinstance(Gcg, nxcg.Graph)
+    assert Gnx.number_of_nodes() == Gcg.number_of_nodes()
+    assert Gnx.number_of_edges() == Gcg.number_of_edges()
+    assert Gnx.is_directed() == Gcg.is_directed()
+    assert Gnx.is_multigraph() == Gcg.is_multigraph()
+    G = nxcg.to_networkx(Gcg)
+    rv = nx.utils.graphs_equal(G, Gnx)
+    if not rv:
+        print("GRAPHS ARE NOT EQUAL!")
+        assert sorted(G) == sorted(Gnx)
+        assert sorted(G._adj) == sorted(Gnx._adj)
+        assert sorted(G._node) == sorted(Gnx._node)
+        for k in sorted(G._adj):
+            print(k, sorted(G._adj[k]), sorted(Gnx._adj[k]))
+        print(nx.to_scipy_sparse_array(G).todense())
+        print(nx.to_scipy_sparse_array(Gnx).todense())
+        print(G.graph)
+        print(Gnx.graph)
+    assert rv
diff --git a/python/nx-cugraph/nx_cugraph/utils/decorators.py b/python/nx-cugraph/nx_cugraph/utils/decorators.py
index 0048aee51bb..011ebfd6ef7 100644
--- a/python/nx-cugraph/nx_cugraph/utils/decorators.py
+++ b/python/nx-cugraph/nx_cugraph/utils/decorators.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -15,6 +15,7 @@
 from functools import partial, update_wrapper
 from textwrap import dedent
 
+import networkx as nx
 from networkx.utils.decorators import nodes_or_number, not_implemented_for
 
 from nx_cugraph.interface import BackendInterface
@@ -40,17 +41,40 @@ class networkx_algorithm:
     name: str
     extra_doc: str | None
     extra_params: dict[str, str] | None
+    version_added: str
+    is_incomplete: bool
+    is_different: bool
+    _plc_names: set[str] | None
 
     def __new__(
         cls,
        func=None,
         *,
         name: str | None = None,
+        # Extra parameter info that is added to NetworkX docstring
         extra_params: dict[str, str] | str | None = None,
+        # Applies `nodes_or_number` decorator compatibly across versions
+        # (NetworkX 3.3 changed where it must be applied)
+        nodes_or_number: list[int] | int | None = None,
+        # Metadata (for introspection only)
+        version_added: str,  # Required
+        is_incomplete: bool = False,  # See self.extra_doc for details if True
+        is_different: bool = False,  # See self.extra_doc for details if True
+        _plc: str | set[str] | None = None,  # Hidden from user, may be removed someday
     ):
         if func is None:
-            return partial(networkx_algorithm, name=name, extra_params=extra_params)
+            return partial(
+                networkx_algorithm,
+                name=name,
+                extra_params=extra_params,
+                nodes_or_number=nodes_or_number,
+                version_added=version_added,
+                is_incomplete=is_incomplete,
+                is_different=is_different,
+                _plc=_plc,
+            )
         instance = object.__new__(cls)
+        if nodes_or_number is not None and nx.__version__[:3] > "3.2":
+            func = nx.utils.decorators.nodes_or_number(nodes_or_number)(func)
         # update_wrapper sets __wrapped__, which will be used for the signature
         update_wrapper(instance, func)
         instance.__defaults__ = func.__defaults__
@@ -65,6 +89,17 @@ def __new__(
                 f"extra_params must be dict, str, or None; got {type(extra_params)}"
             )
         instance.extra_params = extra_params
+        if _plc is None or isinstance(_plc, set):
+            instance._plc_names = _plc
+        elif isinstance(_plc, str):
+            instance._plc_names = {_plc}
+        else:
+            raise TypeError(
+                f"_plc argument must be str, set, or None; got {type(_plc)}"
+            )
+        instance.version_added = version_added
+        instance.is_incomplete = is_incomplete
+        instance.is_different = is_different
         # The docstring on our function is added to the NetworkX docstring.
         instance.extra_doc = (
             dedent(func.__doc__.lstrip("\n").rstrip()) if func.__doc__ else None
         )
@@ -76,10 +111,17 @@ def __new__(
         setattr(BackendInterface, instance.name, instance)
         # Set methods so they are in __dict__
         instance._can_run = instance._can_run
+        if nodes_or_number is not None and nx.__version__[:3] <= "3.2":
+            instance = nx.utils.decorators.nodes_or_number(nodes_or_number)(instance)
         return instance
 
     def _can_run(self, func):
         """Set the `can_run` attribute to the decorated function."""
+        if not func.__name__.startswith("_"):
+            raise ValueError(
+                "The name of the function used by `_can_run` must begin with '_'; "
+                f"got: {func.__name__!r}"
+            )
         self.can_run = func
 
     def __call__(self, /, *args, **kwargs):
diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py
index e303375918d..aa06d7fd29b 100644
--- a/python/nx-cugraph/nx_cugraph/utils/misc.py
+++ b/python/nx-cugraph/nx_cugraph/utils/misc.py
@@ -58,16 +58,18 @@ def pairwise(it):
 
 
 def _groupby(
-    groups: cp.ndarray, values: cp.ndarray, groups_are_canonical: bool = False
+    groups: cp.ndarray | list[cp.ndarray],
+    values: cp.ndarray | list[cp.ndarray],
+    groups_are_canonical: bool = False,
 ) -> dict[int, cp.ndarray]:
     """Perform a groupby operation given an array of group IDs and array of values.
 
     Parameters
     ----------
-    groups : cp.ndarray
-        Array that holds the group IDs.
-    values : cp.ndarray
-        Array of values to be grouped according to groups.
+    groups : cp.ndarray or list of cp.ndarray
+        Array or list of arrays that holds the group IDs.
+    values : cp.ndarray or list of cp.ndarray
+        Array or list of arrays of values to be grouped according to groups.
         Must be the same size as groups array.
     groups_are_canonical : bool, default False
         Whether the group IDs are consecutive integers beginning with 0.
@@ -76,18 +78,42 @@ def _groupby(
     -------
    dict with group IDs as keys and cp.ndarray as values.
     """
-    if groups.size == 0:
-        return {}
-    sort_indices = cp.argsort(groups)
-    sorted_groups = groups[sort_indices]
-    sorted_values = values[sort_indices]
-    prepend = 1 if groups_are_canonical else sorted_groups[0] + 1
-    left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0]
-    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [groups.size]))
+    if isinstance(groups, list):
+        if groups_are_canonical:
+            raise ValueError(
+                "`groups_are_canonical=True` is not allowed when `groups` is a list."
+            )
+        if len(groups) == 0 or (size := groups[0].size) == 0:
+            return {}
+        sort_indices = cp.lexsort(cp.vstack(groups[::-1]))
+        sorted_groups = cp.vstack([group[sort_indices] for group in groups])
+        prepend = sorted_groups[:, 0].max() + 1
+        changed = cp.abs(cp.diff(sorted_groups, prepend=prepend)).sum(axis=0)
+        changed[0] = 1
+        left_bounds = cp.nonzero(changed)[0]
+    else:
+        if (size := groups.size) == 0:
+            return {}
+        sort_indices = cp.argsort(groups)
+        sorted_groups = groups[sort_indices]
+        prepend = 1 if groups_are_canonical else sorted_groups[0] + 1
+        left_bounds = cp.nonzero(cp.diff(sorted_groups, prepend=prepend))[0]
+    if isinstance(values, list):
+        sorted_values = [vals[sort_indices] for vals in values]
+    else:
+        sorted_values = values[sort_indices]
+    boundaries = pairwise(itertools.chain(left_bounds.tolist(), [size]))
     if groups_are_canonical:
         it = enumerate(boundaries)
+    elif isinstance(groups, list):
+        it = zip(map(tuple, sorted_groups.T[left_bounds].tolist()), boundaries)
     else:
         it = zip(sorted_groups[left_bounds].tolist(), boundaries)
+    if isinstance(values, list):
+        return {
+            group: [sorted_vals[start:end] for sorted_vals in sorted_values]
+            for group, (start, end) in it
+        }
     return {group: sorted_values[start:end] for group, (start, end) in it}
diff --git a/python/nx-cugraph/pyproject.toml b/python/nx-cugraph/pyproject.toml
index f309f4797a7..63ac115918f 100644
--- a/python/nx-cugraph/pyproject.toml
+++ b/python/nx-cugraph/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "networkx>=3.0",
     "numpy>=1.21",
-    "pylibcugraph==23.12.*",
+    "pylibcugraph==24.2.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.optional-dependencies]
@@ -52,12 +52,20 @@ test = [
 Homepage = "https://github.com/rapidsai/cugraph"
 Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
 
+# "plugin" used in nx version < 3.2
 [project.entry-points."networkx.plugins"]
 cugraph = "nx_cugraph.interface:BackendInterface"
 
 [project.entry-points."networkx.plugin_info"]
 cugraph = "_nx_cugraph:get_info"
 
+# "backend" used in nx version >= 3.2
+[project.entry-points."networkx.backends"]
+cugraph = "nx_cugraph.interface:BackendInterface"
+
+[project.entry-points."networkx.backend_info"]
+cugraph = "_nx_cugraph:get_info"
+
 [tool.setuptools]
 license-files = ["LICENSE"]
diff --git a/python/nx-cugraph/run_nx_tests.sh b/python/nx-cugraph/run_nx_tests.sh
index 07c97cdf947..da7a2014cef 100755
--- a/python/nx-cugraph/run_nx_tests.sh
+++ b/python/nx-cugraph/run_nx_tests.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # NETWORKX_GRAPH_CONVERT=cugraph
 #   Used by networkx versions 3.0 and 3.1
@@ -30,7 +30,13 @@ NETWORKX_TEST_BACKEND=cugraph \
 NETWORKX_FALLBACK_TO_NX=True \
     pytest \
     --pyargs networkx \
-    --cov=nx_cugraph.algorithms \
-    --cov-report term-missing \
-    --no-cov-on-fail \
+    --config-file=$(dirname $0)/pyproject.toml \
+    --cov-config=$(dirname $0)/pyproject.toml \
+    --cov=nx_cugraph \
+    --cov-report= \
     "$@"
+coverage report \
+    --include="*/nx_cugraph/algorithms/*" \
+    --omit=__init__.py \
+    --show-missing \
+    --rcfile=$(dirname $0)/pyproject.toml
diff --git a/python/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/CMakeLists.txt
index 057f30ef3ad..6ef3bf9dd40 100644
--- a/python/pylibcugraph/CMakeLists.txt
+++ b/python/pylibcugraph/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
-set(pylibcugraph_version 23.12.00)
+set(pylibcugraph_version 24.02.00)
 
 include(../../fetch_rapids.cmake)
 
@@ -26,11 +26,7 @@ rapids_cuda_init_architectures(pylibcugraph-python)
 project(
   pylibcugraph-python
   VERSION ${pylibcugraph_version}
-  LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C
-            # language to be enabled here. The test project that is built in scikit-build to verify
-            # various linking options for the python library is hardcoded to build with C, so until
-            # that is fixed we need to keep C.
-            C CXX CUDA
+  LANGUAGES CXX CUDA
 )
 
 ################################################################################
@@ -38,7 +34,6 @@ project(
 option(FIND_CUGRAPH_CPP "Search for existing CUGRAPH C++ installations before defaulting
        to local files" OFF
 )
-option(CUGRAPH_BUILD_WHEELS "Whether we're building a wheel for pypi" OFF)
 option(USE_CUGRAPH_OPS "Enable all functions that call cugraph-ops" ON)
 
 if(NOT USE_CUGRAPH_OPS)
@@ -53,28 +48,20 @@ else()
   set(cugraph_FOUND OFF)
 endif()
 
-include(rapids-cython)
+include(rapids-cython-core)
 
 if (NOT cugraph_FOUND)
   set(BUILD_TESTS OFF)
   set(BUILD_CUGRAPH_MG_TESTS OFF)
   set(BUILD_CUGRAPH_OPS_CPP_TESTS OFF)
-
-  set(_exclude_from_all "")
-  if(CUGRAPH_BUILD_WHEELS)
-    # Statically link dependencies if building wheels
-    set(CUDA_STATIC_RUNTIME ON)
-    set(USE_RAFT_STATIC ON)
-    set(CUGRAPH_COMPILE_RAFT_LIB ON)
-    set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
-    set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
-    set(ALLOW_CLONE_CUGRAPH_OPS ON)
-
-    # Don't install the cuML C++ targets into wheels
-    set(_exclude_from_all EXCLUDE_FROM_ALL)
-  endif()
-
-  add_subdirectory(../../cpp cugraph-cpp ${_exclude_from_all})
+  set(CUDA_STATIC_RUNTIME ON)
+  set(USE_RAFT_STATIC ON)
+  set(CUGRAPH_COMPILE_RAFT_LIB ON)
+  set(CUGRAPH_USE_CUGRAPH_OPS_STATIC ON)
+  set(CUGRAPH_EXCLUDE_CUGRAPH_OPS_FROM_ALL ON)
+  set(ALLOW_CLONE_CUGRAPH_OPS ON)
+
+  add_subdirectory(../../cpp cugraph-cpp EXCLUDE_FROM_ALL)
 
   set(cython_lib_dir pylibcugraph)
   install(TARGETS cugraph DESTINATION ${cython_lib_dir})
diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx
index b3065fa0684..def47390ce5 100644
--- a/python/pylibcugraph/pylibcugraph/graphs.pyx
+++ b/python/pylibcugraph/pylibcugraph/graphs.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -64,7 +64,7 @@ cdef class SGGraph(_GPUGraph):
         Object defining intended properties for the graph.
 
     src_or_offset_array : device array type
-        Device array containing either the vertex identifiers of the source of 
+        Device array containing either the vertex identifiers of the source of
         each directed edge if represented in COO format or the offset if CSR
         format. In the case of a COO, the order of the array corresponds to
         the ordering of the dst_or_index_array, where the ith item in
@@ -77,9 +77,14 @@ cdef class SGGraph(_GPUGraph):
         CSR format. In the case of a COO, The order of the array corresponds
         to the ordering of the src_offset_array, where the ith item in
         src_offset_array and the ith item in dst_index_array define the ith
         edge of the graph.
-        
+
     vertices_array : device array type
-        Device array containing the isolated vertices of the graph.
+        Device array containing all vertices of the graph. This array is
+        optional, but must be used if the graph contains isolated vertices
+        which cannot be represented in the src_or_offset_array and
+        dst_index_array arrays. If specified, this array must contain every
+        vertex identifier, including vertex identifiers that are already
+        included in the src_or_offset_array and dst_index_array arrays.
 
     weight_array : device array type
         Device array containing the weight values of each directed edge. The
@@ -99,25 +104,25 @@ cdef class SGGraph(_GPUGraph):
     do_expensive_check : bool
         If True, performs more extensive tests on the inputs to ensure
         validitity, at the expense of increased run time.
-        
+
     edge_id_array : device array type
         Device array containing the edge ids of each directed edge.  Must
         match the ordering of the src/dst arrays.  Optional (may be null).
         If provided, edge_type_array must also be provided.
-        
+
     edge_type_array : device array type
         Device array containing the edge types of each directed edge.  Must
         match the ordering of the src/dst/edge_id arrays.  Optional (may be
         null).  If provided, edge_id_array must be provided.
-        
+
     input_array_format: str, optional (default='COO')
         Input representation used to construct a graph
             COO: arrays represent src_array and dst_array
             CSR: arrays represent offset_array and index_array
-        
+
     drop_self_loops : bool, optional (default='False')
         If true, drop any self loops that exist in the provided edge list.
-        
+
     drop_multi_edges: bool, optional (default='False')
         If true, drop any multi edges that exist in the provided edge list
@@ -178,7 +183,7 @@ cdef class SGGraph(_GPUGraph):
         cdef cugraph_type_erased_device_array_view_t* srcs_or_offsets_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 src_or_offset_array
-            ) 
+            )
         cdef cugraph_type_erased_device_array_view_t* dsts_or_indices_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 dst_or_index_array
@@ -192,7 +197,7 @@ cdef class SGGraph(_GPUGraph):
             )
         self.edge_id_view_ptr = create_cugraph_type_erased_device_array_view_from_py_obj(
             edge_id_array
-        ) 
+        )
         cdef cugraph_type_erased_device_array_view_t* edge_type_view_ptr = \
             create_cugraph_type_erased_device_array_view_from_py_obj(
                 edge_type_array
@@ -237,7 +242,7 @@ cdef class SGGraph(_GPUGraph):
 
             assert_success(error_code, error_ptr,
                            "cugraph_sg_graph_create_from_csr()")
-            
+
         else:
             raise ValueError("invalid 'input_array_format'. Only "
                              "'COO' and 'CSR' format are supported."
@@ -282,7 +287,7 @@ cdef class MGGraph(_GPUGraph):
         each directed edge.  The order of the array corresponds to the
         ordering of the src_array, where the ith item in src_array and the
         ith item in dst_array define the ith edge of the graph.
-        
+
     vertices_array : device array type
         Device array containing the isolated vertices of the graph.
 
@@ -295,12 +300,12 @@ cdef class MGGraph(_GPUGraph):
     store_transposed : bool
         Set to True if the graph should be transposed. This is required
        for some algorithms, such as pagerank.
-        
+
     num_arrays : size_t
         Number of arrays.
-        
+
         If provided, all list of device arrays should be of the same size.
-        
+
     do_expensive_check : bool
         If True, performs more extensive tests on the inputs to ensure
         validitity, at the expense of increased run time.
@@ -309,15 +314,15 @@ cdef class MGGraph(_GPUGraph):
         Device array containing the edge ids of each directed edge.  Must
         match the ordering of the src/dst arrays.  Optional (may be null).
         If provided, edge_type_array must also be provided.
-        
+
     edge_type_array : device array type
         Device array containing the edge types of each directed edge.  Must
         match the ordering of the src/dst/edge_id arrays.  Optional (may be
         null).  If provided, edge_id_array must be provided.
-        
+
     drop_self_loops : bool, optional (default='False')
         If true, drop any self loops that exist in the provided edge list.
-        
+
     drop_multi_edges: bool, optional (default='False')
         If true, drop any multi edges that exist in the provided edge list
     """
@@ -357,12 +362,12 @@ cdef class MGGraph(_GPUGraph):
             dst_array = [dst_array]
         if not any(dst_array):
             dst_array = dst_array * num_arrays
-        
+
         if not isinstance(weight_array, list):
             weight_array = [weight_array]
         if not any(weight_array):
             weight_array = weight_array * num_arrays
-        
+
         if not isinstance(edge_id_array, list):
             edge_id_array = [edge_id_array]
         if not any(edge_id_array):
@@ -372,7 +377,7 @@ cdef class MGGraph(_GPUGraph):
             edge_type_array = [edge_type_array]
         if not any(edge_type_array):
             edge_type_array = edge_type_array * num_arrays
-        
+
         if not isinstance(vertices_array, list):
             vertices_array = [vertices_array]
         if not any(vertices_array):
@@ -394,7 +399,7 @@ cdef class MGGraph(_GPUGraph):
 
             if edge_id_array is not None and len(edge_id_array[i]) != len(src_array[i]):
                 raise ValueError('Edge id array must be same length as edgelist')
-            
+
             assert_CAI_type(edge_type_array[i], "edge_type_array", True)
             if edge_type_array[i] is not None and len(edge_type_array[i]) != len(src_array[i]):
                 raise ValueError('Edge type array must be same length as edgelist')
@@ -421,7 +426,7 @@ cdef class MGGraph(_GPUGraph):
                     malloc(
                         num_arrays * sizeof(cugraph_type_erased_device_array_view_t*))
                 vertices_view_ptr_ptr[i] = \
-                    create_cugraph_type_erased_device_array_view_from_py_obj(vertices_array[i])
+                        create_cugraph_type_erased_device_array_view_from_py_obj(vertices_array[i])
 
             if weight_array[i] is not None:
                 if i == 0:
@@ -458,9 +463,9 @@ cdef class MGGraph(_GPUGraph):
                 edge_type_view_ptr_ptr,
                 store_transposed,
                 num_arrays,
-                do_expensive_check,
                 drop_self_loops,
                 drop_multi_edges,
+                do_expensive_check,
                 &(self.c_graph_ptr),
                 &error_ptr)
diff --git a/python/pylibcugraph/pyproject.toml b/python/pylibcugraph/pyproject.toml
index 96f5ec84efb..1d27d952af1 100644
--- a/python/pylibcugraph/pyproject.toml
+++ b/python/pylibcugraph/pyproject.toml
@@ -6,13 +6,13 @@ requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
     "ninja",
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
-    "scikit-build>=0.13.1",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
+    "scikit-build-core[pyproject]>=0.7.0",
     "setuptools>=61.0.0",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "setuptools.build_meta"
+build-backend = "scikit_build_core.build"
 
 [tool.pytest.ini_options]
 testpaths = ["pylibcugraph/tests"]
@@ -28,8 +28,8 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "pylibraft==23.12.*",
-    "rmm==23.12.*",
+    "pylibraft==24.2.*",
+    "rmm==24.2.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -40,7 +40,7 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
-    "cudf==23.12.*",
+    "cudf==24.2.*",
     "numpy>=1.21",
     "pandas",
     "pytest",
@@ -54,8 +54,15 @@ test = [
 Homepage = "https://github.com/rapidsai/cugraph"
 Documentation = "https://docs.rapids.ai/api/cugraph/stable/"
 
-[tool.setuptools]
-license-files = ["LICENSE"]
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.minimum-version = "3.26.4"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["pylibcugraph"]
 
-[tool.setuptools.dynamic]
-version = {file = "pylibcugraph/VERSION"}
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "pylibcugraph/VERSION"
+regex = "(?P<value>.*)"
diff --git a/python/pylibcugraph/setup.py b/python/pylibcugraph/setup.py
deleted file mode 100644
index a6c1bda3b5b..00000000000
--- a/python/pylibcugraph/setup.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-from setuptools import find_packages, Command
-from skbuild import setup
-
-
-class CleanCommand(Command):
-    """Custom clean command to tidy up the project root."""
-
-    user_options = [
-        ("all", None, None),
-    ]
-
-    def initialize_options(self):
-        self.all = None
-
-    def finalize_options(self):
-        pass
-
-    def run(self):
-        setupFileDir = os.path.dirname(os.path.abspath(__file__))
-        os.chdir(setupFileDir)
-        os.system("rm -rf build")
-        os.system("rm -rf dist")
-        os.system("rm -rf dask-worker-space")
-        os.system('find . -name "__pycache__" -type d -exec rm -rf {} +')
-        os.system("rm -rf *.egg-info")
-        os.system('find . -name "*.cpp" -type f -delete')
-        os.system('find . -name "*.cpython*.so" -type f -delete')
-        os.system("rm -rf _skbuild")
-
-
-def exclude_libcxx_symlink(cmake_manifest):
-    return list(
-        filter(
-            lambda name: not ("include/rapids/libcxx/include" in name), cmake_manifest
-        )
-    )
-
-
-packages = find_packages(include=["pylibcugraph*"])
-setup(
-    packages=packages,
-    package_data={key: ["VERSION", "*.pxd"] for key in packages},
-    cmake_process_manifest_hook=exclude_libcxx_symlink,
-    cmdclass={"clean": CleanCommand},
-    zip_safe=False,
-)
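
The list-of-arrays branch added to `_groupby` (nx_cugraph/utils/misc.py above) groups by a composite key spread across several arrays. Below is a minimal CPU-side sketch of the same lexsort technique, using NumPy in place of CuPy; `groupby_two_keys` is a hypothetical name for illustration and is not part of this diff.

    import numpy as np

    def groupby_two_keys(key1, key2, values):
        # lexsort treats its *last* key as primary, which is why the real
        # implementation reverses the key list with groups[::-1].
        sort_indices = np.lexsort((key2, key1))
        sorted_keys = np.vstack([key1, key2])[:, sort_indices]
        sorted_values = values[sort_indices]
        # A group boundary is wherever either key changes; index 0 always
        # starts a group, so force it on (mirroring changed[0] = 1 above).
        prepend = sorted_keys.max() + 1
        changed = np.abs(np.diff(sorted_keys, prepend=prepend)).sum(axis=0)
        changed[0] = 1
        starts = np.nonzero(changed)[0].tolist()
        ends = starts[1:] + [int(values.size)]
        return {
            tuple(sorted_keys[:, start].tolist()): sorted_values[start:end]
            for start, end in zip(starts, ends)
        }

    # groupby_two_keys(np.array([0, 1, 0]), np.array([2, 2, 2]), np.array([10, 20, 30]))
    # -> {(0, 2): array([10, 30]), (1, 2): array([20])}

The metadata keywords added to `networkx_algorithm` (nx_cugraph/utils/decorators.py above) are what the new print_table/print_tree scripts read. A sketch of how an algorithm might declare them follows; the function name and argument values are illustrative, not taken from this diff.

    from nx_cugraph.utils import networkx_algorithm

    @networkx_algorithm(version_added="24.02", _plc="bfs", is_incomplete=True)
    def example_algorithm(G, source):
        """Extra docstring lines here are appended to the NetworkX docstring."""
        ...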