diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py index 9600dca..7401fa7 100644 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py +++ b/python/cugraph-dgl/cugraph_dgl/nn/conv/base.py @@ -277,112 +277,3 @@ def to(self, device: Union[torch.device, str, int]) -> "cugraph_dgl.nn.SparseGra ) return sg - - -def conditional_class(import_name): - def decorator(cls): - try: - __import__(import_name) - return cls - except ImportError: - return None - - return decorator - - -@conditional_class("pylibcugraphops") -class BaseConv(torch.nn.Module): - r"""An abstract base class for cugraph-ops nn module.""" - - def __init__(self): - super().__init__() - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - raise NotImplementedError - - def forward(self, *args): - r"""Runs the forward pass of the module.""" - raise NotImplementedError - - def get_cugraph_ops_CSC( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - is_bipartite: bool = False, - max_in_degree: Optional[int] = None, - ) -> "ops_torch.CSC": - """Create CSC structure needed by cugraph-ops.""" - - if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): - raise TypeError( - f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." - ) - - # TODO: max_in_degree should default to None in pylibcugraphops - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - offsets, indices, _ = g.csc() - else: - offsets, indices, _ = g.adj_tensors("csc") - - graph = ops_torch.CSC( - offsets=offsets, - indices=indices, - num_src_nodes=g.num_src_nodes(), - dst_max_in_degree=max_in_degree, - is_bipartite=is_bipartite, - ) - - return graph - - def get_cugraph_ops_HeteroCSC( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - num_edge_types: int, - etypes: Optional[torch.Tensor] = None, - is_bipartite: bool = False, - max_in_degree: Optional[int] = None, - ) -> "ops_torch.HeteroCSC": - """Create HeteroCSC structure needed by cugraph-ops.""" - - if not isinstance(g, (SparseGraph, dgl.DGLHeteroGraph)): - raise TypeError( - f"The graph has to be either a 'cugraph_dgl.nn.SparseGraph' or " - f"'dgl.DGLHeteroGraph', but got '{type(g)}'." - ) - - # TODO: max_in_degree should default to None in pylibcugraphops - if max_in_degree is None: - max_in_degree = -1 - - if isinstance(g, SparseGraph): - offsets, indices, etypes = g.csc() - if etypes is None: - raise ValueError( - "SparseGraph must have 'values' to create HeteroCSC. " - "Pass in edge types as 'values' when creating the SparseGraph." - ) - etypes = etypes.int() - else: - if etypes is None: - raise ValueError( - "'etypes' is required when creating HeteroCSC " - "from dgl.DGLHeteroGraph." - ) - offsets, indices, perm = g.adj_tensors("csc") - etypes = etypes[perm].int() - - graph = ops_torch.HeteroCSC( - offsets=offsets, - indices=indices, - edge_types=etypes, - num_src_nodes=g.num_src_nodes(), - num_edge_types=num_edge_types, - dst_max_in_degree=max_in_degree, - is_bipartite=is_bipartite, - ) - - return graph diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py deleted file mode 100644 index e881327..0000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatconv.py +++ /dev/null @@ -1,314 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
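Note: the deleted get_cugraph_ops_HeteroCSC helper above only accepts edge types from a SparseGraph through the graph's 'values' slot; a SparseGraph built without values raises the ValueError shown. A minimal construction sketch, assuming a CUDA device and the toy edge list used in the docstring examples below (tensors are illustrative):

    import torch
    from cugraph_dgl.nn.conv.base import SparseGraph

    src = torch.tensor([0, 1, 2, 3, 2, 5], device="cuda")
    dst = torch.tensor([1, 2, 3, 4, 0, 3], device="cuda")
    # Edge types ride along as 'values'; int32 avoids the cast inside the helper.
    etypes = torch.tensor([0, 1, 2, 0, 1, 2], dtype=torch.int32, device="cuda")
    sg = SparseGraph(
        size=(6, 6), src_ids=src, dst_ids=dst, values=etypes, formats="csc"
    )

The test_relgraphconv file further down builds exactly this kind of input and then passes sg.values() as the etypes argument.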
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class GATConv(BaseConv): - r"""Graph attention layer from `Graph Attention Network - <https://arxiv.org/abs/1710.10903>`__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or (int, int) - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_feats : int - Output feature size. - num_heads : int - Number of heads in multi-head attention. - feat_drop : float, optional - Dropout rate on feature. Default: ``0``. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - edge_feats : int, optional - Edge feature size. Default: ``None``. - negative_slope : float, optional - LeakyReLU angle of negative slope. Default: ``0.2``. - residual : bool, optional - If True, use residual connection. Default: ``False``. - allow_zero_in_degree : bool, optional - If there are 0-in-degree nodes in the graph, output for those nodes will - be invalid since no message will be passed to those nodes. This is - harmful for some applications, causing silent performance regression. - This module will raise a DGLError if it detects 0-in-degree nodes in - input graph. By setting ``True``, it will suppress the check and let the - users handle it by themselves. Default: ``False``. - bias : bool, optional - If True, learns a bias term. Default: ``True``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import GATConv - ... 
- >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> g = dgl.add_self_loop(g) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = GATConv(10, 2, num_heads=3).to(device) - >>> res = conv(g, feat) - >>> res - tensor([[[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]], - [[ 0.2340, 1.9226], - [ 1.6477, -1.9986], - [ 1.1138, -1.9302]]], device='cuda:0', grad_fn=) - """ - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - num_heads: int, - feat_drop: float = 0.0, - concat: bool = True, - edge_feats: Optional[int] = None, - negative_slope: float = 0.2, - residual: bool = False, - allow_zero_in_degree: bool = False, - bias: bool = True, - ): - super().__init__() - - if isinstance(in_feats, int): - self.in_feats_src = self.in_feats_dst = in_feats - else: - self.in_feats_src, self.in_feats_dst = in_feats - self.in_feats = in_feats - self.out_feats = out_feats - self.num_heads = num_heads - self.feat_drop = nn.Dropout(feat_drop) - self.concat = concat - self.edge_feats = edge_feats - self.negative_slope = negative_slope - self.residual = residual - self.allow_zero_in_degree = allow_zero_in_degree - - if isinstance(in_feats, int): - self.lin = nn.Linear(in_feats, num_heads * out_feats, bias=False) - else: - self.lin_src = nn.Linear( - self.in_feats_src, num_heads * out_feats, bias=False - ) - self.lin_dst = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=False - ) - - if edge_feats is not None: - self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - self.attn_weights = nn.Parameter(torch.empty(3 * num_heads * out_feats)) - else: - self.register_parameter("lin_edge", None) - self.attn_weights = nn.Parameter(torch.empty(2 * num_heads * out_feats)) - - out_dim = num_heads * out_feats if concat else out_feats - if residual: - if self.in_feats_dst != out_dim: - self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) - else: - self.lin_res = nn.Identity() - else: - self.register_buffer("lin_res", None) - - if bias and not isinstance(self.lin_res, nn.Linear): - if concat: - self.bias = nn.Parameter(torch.empty(num_heads, out_feats)) - else: - self.bias = nn.Parameter(torch.empty(out_feats)) - else: - self.register_buffer("bias", None) - - self.reset_parameters() - - def set_allow_zero_in_degree(self, set_value): - r"""Set allow_zero_in_degree flag.""" - self.allow_zero_in_degree = set_value - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - gain = nn.init.calculate_gain("relu") - if hasattr(self, "lin"): - nn.init.xavier_normal_(self.lin.weight, gain=gain) - else: - nn.init.xavier_normal_(self.lin_src.weight, gain=gain) - nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) - - nn.init.xavier_normal_( - self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain - ) - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - if self.lin_res is not None: - self.lin_res.reset_parameters() - - if self.bias is not None: - nn.init.zeros_(self.bias) - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - max_in_degree: Optional[int] = None, - 
deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - high_precision_dgrad: bool = False, - high_precision_wgrad: bool = False, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - graph : DGLGraph or SparseGraph - The graph. - nfeat : torch.Tensor or (torch.Tensor, torch.Tensor) - Node features. If given as a tuple, the two elements correspond to - the source and destination node features, respectively, in a - bipartite graph. - efeat: torch.Tensor, optional - Optional edge features. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - high_precision_dgrad: bool, default=False - Optional flag indicating whether gradients for inputs in half precision - are kept in single precision as long as possible and only cast to - the corresponding input type at the very end. - high_precision_wgrad: bool, default=False - Optional flag indicating whether gradients for weights in half precision - are kept in single precision as long as possible and only cast to - the corresponding input type at the very end. - - Returns - ------- - torch.Tensor - The output feature of shape :math:`(N, H, D_{out})` where - :math:`H` is the number of heads, and :math:`D_{out}` is size of - output feature. - """ - if isinstance(g, dgl.DGLHeteroGraph): - if not self.allow_zero_in_degree: - if (g.in_degrees() == 0).any(): - raise dgl.base.DGLError( - "There are 0-in-degree nodes in the graph, " - "output for those nodes will be invalid. " - "This is harmful for some applications, " - "causing silent performance regression. " - "Adding self-loop on the input graph by " - "calling `g = dgl.add_self_loop(g)` will resolve " - "the issue. Setting ``allow_zero_in_degree`` " - "to be `True` when constructing this module will " - "suppress the check and let the code run." - ) - - bipartite = isinstance(nfeat, (list, tuple)) - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=bipartite, max_in_degree=max_in_degree - ) - if deterministic_dgrad: - _graph.add_reverse_graph() - - if bipartite: - nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) - nfeat_dst_orig = nfeat[1] - else: - nfeat = self.feat_drop(nfeat) - nfeat_dst_orig = nfeat[: g.num_dst_nodes()] - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to " - f"accept edge features." - ) - efeat = self.lin_edge(efeat) - - if bipartite: - if not hasattr(self, "lin_src"): - nfeat_src = self.lin(nfeat[0]) - nfeat_dst = self.lin(nfeat[1]) - else: - nfeat_src = self.lin_src(nfeat[0]) - nfeat_dst = self.lin_dst(nfeat[1]) - else: - if not hasattr(self, "lin"): - raise RuntimeError( - f"{self.__class__.__name__}.in_feats is expected to be an " - f"integer when the graph is not bipartite, " - f"but got {self.in_feats}." 
- ) - nfeat = self.lin(nfeat) - - out = ops_torch.operators.mha_gat_n2n( - (nfeat_src, nfeat_dst) if bipartite else nfeat, - self.attn_weights, - _graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - high_precision_dgrad=high_precision_dgrad, - high_precision_wgrad=high_precision_wgrad, - )[: g.num_dst_nodes()] - - if self.concat: - out = out.view(-1, self.num_heads, self.out_feats) - - if self.residual: - res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) - if not self.concat: - res = res.mean(dim=1) - out = out + res - - if self.bias is not None: - out = out + self.bias - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py deleted file mode 100644 index 4f47005..0000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/gatv2conv.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class GATv2Conv(BaseConv): - r"""GATv2 from `How Attentive are Graph Attention Networks? - <https://arxiv.org/abs/2105.14491>`__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or (int, int) - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_feats : int - Output feature size. - num_heads : int - Number of heads in Multi-Head Attention. - feat_drop : float, optional - Dropout rate on feature. Default: ``0``. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - edge_feats : int, optional - Edge feature size. Default: ``None``. - negative_slope : float, optional - LeakyReLU angle of negative slope. Default: ``0.2``. - residual : bool, optional - If True, use residual connection. Default: ``False``. - allow_zero_in_degree : bool, optional - If there are 0-in-degree nodes in the graph, output for those nodes will - be invalid since no message will be passed to those nodes. This is - harmful for some applications, causing silent performance regression. - This module will raise a DGLError if it detects 0-in-degree nodes in - input graph. By setting ``True``, it will suppress the check and let the - users handle it by themselves. Default: ``False``. - bias : bool, optional - If True, learns a bias term. Default: ``True``. - share_weights : bool, optional - If ``True``, the same matrix will be applied to the source and the - destination node features. Default: ``False``. 
- """ - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - num_heads: int, - feat_drop: float = 0.0, - concat: bool = True, - edge_feats: Optional[int] = None, - negative_slope: float = 0.2, - residual: bool = False, - allow_zero_in_degree: bool = False, - bias: bool = True, - share_weights: bool = False, - ): - super().__init__() - - if isinstance(in_feats, int): - self.in_feats_src = self.in_feats_dst = in_feats - else: - self.in_feats_src, self.in_feats_dst = in_feats - self.in_feats = in_feats - self.out_feats = out_feats - self.num_heads = num_heads - self.feat_drop = nn.Dropout(feat_drop) - self.concat = concat - self.edge_feats = edge_feats - self.negative_slope = negative_slope - self.residual = residual - self.allow_zero_in_degree = allow_zero_in_degree - self.share_weights = share_weights - self.bias = bias - - self.lin_src = nn.Linear(self.in_feats_src, num_heads * out_feats, bias=bias) - if share_weights: - if self.in_feats_src != self.in_feats_dst: - raise ValueError( - f"Input feature size of source and destination " - f"nodes must be identical when share_weights is enabled, " - f"but got {self.in_feats_src} and {self.in_feats_dst}." - ) - self.lin_dst = self.lin_src - else: - self.lin_dst = nn.Linear( - self.in_feats_dst, num_heads * out_feats, bias=bias - ) - - self.attn_weights = nn.Parameter(torch.empty(num_heads * out_feats)) - - if edge_feats is not None: - self.lin_edge = nn.Linear(edge_feats, num_heads * out_feats, bias=False) - else: - self.register_parameter("lin_edge", None) - - out_dim = num_heads * out_feats if concat else out_feats - if residual: - if self.in_feats_dst != out_dim: - self.lin_res = nn.Linear(self.in_feats_dst, out_dim, bias=bias) - else: - self.lin_res = nn.Identity() - else: - self.register_buffer("lin_res", None) - - self.reset_parameters() - - def set_allow_zero_in_degree(self, set_value): - r"""Set allow_zero_in_degree flag.""" - self.allow_zero_in_degree = set_value - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - gain = nn.init.calculate_gain("relu") - nn.init.xavier_normal_(self.lin_src.weight, gain=gain) - nn.init.xavier_normal_(self.lin_dst.weight, gain=gain) - - nn.init.xavier_normal_( - self.attn_weights.view(-1, self.num_heads, self.out_feats), gain=gain - ) - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - if self.lin_res is not None: - self.lin_res.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - max_in_degree: Optional[int] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - graph : DGLGraph or SparseGraph - The graph. - nfeat : torch.Tensor - Input features of shape :math:`(N, D_{in})`. - efeat: torch.Tensor, optional - Optional edge features. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. 
- deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - - Returns - ------- - torch.Tensor - The output feature of shape :math:`(N, H, D_{out})` where - :math:`H` is the number of heads, and :math:`D_{out}` is size of - output feature. - """ - - if isinstance(g, dgl.DGLHeteroGraph): - if not self.allow_zero_in_degree: - if (g.in_degrees() == 0).any(): - raise dgl.base.DGLError( - "There are 0-in-degree nodes in the graph, " - "output for those nodes will be invalid. " - "This is harmful for some applications, " - "causing silent performance regression. " - "Adding self-loop on the input graph by " - "calling `g = dgl.add_self_loop(g)` will resolve " - "the issue. Setting ``allow_zero_in_degree`` " - "to be `True` when constructing this module will " - "suppress the check and let the code run." - ) - - nfeat_bipartite = isinstance(nfeat, (list, tuple)) - graph_bipartite = nfeat_bipartite or self.share_weights is False - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree - ) - if deterministic_dgrad: - _graph.add_reverse_graph() - - if nfeat_bipartite: - nfeat = (self.feat_drop(nfeat[0]), self.feat_drop(nfeat[1])) - nfeat_dst_orig = nfeat[1] - else: - nfeat = self.feat_drop(nfeat) - nfeat_dst_orig = nfeat[: g.num_dst_nodes()] - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to " - f"accept edge features." - ) - efeat = self.lin_edge(efeat) - - if nfeat_bipartite: - nfeat = (self.lin_src(nfeat[0]), self.lin_dst(nfeat[1])) - elif graph_bipartite: - nfeat = (self.lin_src(nfeat), self.lin_dst(nfeat[: g.num_dst_nodes()])) - else: - nfeat = self.lin_src(nfeat) - - out = ops_torch.operators.mha_gat_v2_n2n( - nfeat, - self.attn_weights, - _graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=efeat, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - )[: g.num_dst_nodes()] - - if self.concat: - out = out.view(-1, self.num_heads, self.out_feats) - - if self.residual: - res = self.lin_res(nfeat_dst_orig).view(-1, self.num_heads, self.out_feats) - if not self.concat: - res = res.mean(dim=1) - out = out + res - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py deleted file mode 100644 index 5c4b5de..0000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/relgraphconv.py +++ /dev/null @@ -1,192 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
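Note: the deleted GATv2Conv, unlike GATConv, shipped without an Examples block. A minimal usage sketch in the spirit of the GATConv example above, assuming a CUDA device and the optional dgl/torch/pylibcugraphops dependencies (shapes are illustrative):

    import dgl
    import torch
    from cugraph_dgl.nn import GATv2Conv

    device = "cuda"
    g = dgl.graph(([0, 1, 2, 3, 2, 5], [1, 2, 3, 4, 0, 3])).to(device)
    g = dgl.add_self_loop(g)  # avoids the 0-in-degree DGLError in forward()
    feat = torch.ones(6, 10).to(device)
    conv = GATv2Conv(10, 2, num_heads=3).to(device)
    res = conv(g, feat)  # (6, 3, 2): (nodes, heads, out_feats) since concat=True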
- -import math -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class RelGraphConv(BaseConv): - r"""An accelerated relational graph convolution layer from `Modeling - Relational Data with Graph Convolutional Networks - <https://arxiv.org/abs/1703.06103>`__, with the sparse aggregation - accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int - Input feature size. - out_feats : int - Output feature size. - num_rels : int - Number of relations. - regularizer : str, optional - Which weight regularizer to use ("basis" or ``None``): - - "basis" is for basis-decomposition. - - ``None`` applies no regularization. - Default: ``None``. - num_bases : int, optional - Number of bases. It comes into effect when a regularizer is applied. - Default: ``None``. - bias : bool, optional - True if bias is added. Default: ``True``. - self_loop : bool, optional - True to include self loop message. Default: ``True``. - dropout : float, optional - Dropout rate. Default: ``0.0``. - apply_norm : bool, optional - True to normalize aggregation output by the in-degree of the destination - node per edge type, i.e. :math:`|\mathcal{N}^r_i|`. Default: ``False``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import RelGraphConv - ... - >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = RelGraphConv( - ... 10, 2, 3, regularizer='basis', num_bases=2).to(device) - >>> etypes = torch.tensor([0,1,2,0,1,2]).to(device) - >>> res = conv(g, feat, etypes) - >>> res - tensor([[-1.7774, -2.0184], - [-1.4335, -2.3758], - [-1.7774, -2.0184], - [-0.4698, -3.0876], - [-1.4335, -2.3758], - [-1.4331, -2.3295]], device='cuda:0', grad_fn=) - """ - - def __init__( - self, - in_feats: int, - out_feats: int, - num_rels: int, - regularizer: Optional[str] = None, - num_bases: Optional[int] = None, - bias: bool = True, - self_loop: bool = True, - dropout: float = 0.0, - apply_norm: bool = False, - ): - super().__init__() - self.in_feats = in_feats - self.out_feats = out_feats - self.num_rels = num_rels - self.apply_norm = apply_norm - self.dropout = nn.Dropout(dropout) - - dim_self_loop = 1 if self_loop else 0 - self.self_loop = self_loop - if regularizer is None: - self.W = nn.Parameter( - torch.empty(num_rels + dim_self_loop, in_feats, out_feats) - ) - self.coeff = None - elif regularizer == "basis": - if num_bases is None: - raise ValueError('Missing "num_bases" for basis regularization.') - self.W = nn.Parameter( - torch.empty(num_bases + dim_self_loop, in_feats, out_feats) - ) - self.coeff = nn.Parameter(torch.empty(num_rels, num_bases)) - self.num_bases = num_bases - else: - raise ValueError( - f"Supported regularizer options: 'basis' or None, but got " - f"'{regularizer}'." 
- ) - self.regularizer = regularizer - - if bias: - self.bias = nn.Parameter(torch.empty(out_feats)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - bound = 1 / math.sqrt(self.in_feats) - end = -1 if self.self_loop else None - nn.init.uniform_(self.W[:end], -bound, bound) - if self.regularizer == "basis": - nn.init.xavier_uniform_(self.coeff, gain=nn.init.calculate_gain("relu")) - if self.self_loop: - nn.init.xavier_uniform_(self.W[-1], nn.init.calculate_gain("relu")) - if self.bias is not None: - nn.init.zeros_(self.bias) - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: torch.Tensor, - etypes: torch.Tensor, - max_in_degree: Optional[int] = None, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - g : DGLGraph - The graph. - feat : torch.Tensor - A 2D tensor of node features. Shape: :math:`(|V|, D_{in})`. - etypes : torch.Tensor - A 1D integer tensor of edge types. Shape: :math:`(|E|,)`. - Note that cugraph-ops only accepts edge type tensors in int32, - so any input of other integer types will be cast to int32, - thus introducing some overhead. Pass in int32 tensors directly - for best performance. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - - Returns - ------- - torch.Tensor - New node features. Shape: :math:`(|V|, D_{out})`. - """ - _graph = self.get_cugraph_ops_HeteroCSC( - g, - num_edge_types=self.num_rels, - etypes=etypes, - is_bipartite=False, - max_in_degree=max_in_degree, - ) - - h = ops_torch.operators.agg_hg_basis_n2n_post( - feat, - self.coeff, - _graph, - concat_own=self.self_loop, - norm_by_out_degree=self.apply_norm, - )[: g.num_dst_nodes()] - h = h @ self.W.view(-1, self.out_feats) - if self.bias is not None: - h = h + self.bias - h = self.dropout(h) - - return h diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py deleted file mode 100644 index b619890..0000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/sageconv.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class SAGEConv(BaseConv): - r"""An accelerated GraphSAGE layer from `Inductive Representation Learning - on Large Graphs <https://arxiv.org/abs/1706.02216>`__, with the sparse - aggregation accelerated by cugraph-ops. - - Parameters - ---------- - in_feats : int or tuple - Input feature size.
If a scalar is given, the source and destination - nodes are required to be the same. - out_feats : int - Output feature size. - aggregator_type : str - Aggregator type to use ("mean", "sum", "min", "max", "pool", "gcn"). - feat_drop : float - Dropout rate on features, default: ``0``. - bias : bool - If True, adds a learnable bias to the output. Default: ``True``. - - Examples - -------- - >>> import dgl - >>> import torch - >>> from cugraph_dgl.nn import SAGEConv - ... - >>> device = 'cuda' - >>> g = dgl.graph(([0,1,2,3,2,5], [1,2,3,4,0,3])).to(device) - >>> g = dgl.add_self_loop(g) - >>> feat = torch.ones(6, 10).to(device) - >>> conv = SAGEConv(10, 2, 'mean').to(device) - >>> res = conv(g, feat) - >>> res - tensor([[-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952], - [-1.1690, 0.1952]], device='cuda:0', grad_fn=) - """ - valid_aggr_types = {"mean", "sum", "min", "max", "pool", "gcn"} - - def __init__( - self, - in_feats: Union[int, tuple[int, int]], - out_feats: int, - aggregator_type: str = "mean", - feat_drop: float = 0.0, - bias: bool = True, - ): - super().__init__() - - if aggregator_type not in self.valid_aggr_types: - raise ValueError( - f"Invalid aggregator_type. Must be one of {self.valid_aggr_types}. " - f"But got '{aggregator_type}' instead." - ) - - self.aggregator_type = aggregator_type - self._aggr = aggregator_type - self.in_feats = in_feats - self.out_feats = out_feats - self.in_feats_src, self.in_feats_dst = dgl.utils.expand_as_pair(in_feats) - self.feat_drop = nn.Dropout(feat_drop) - - if self.aggregator_type == "gcn": - self._aggr = "mean" - self.lin = nn.Linear(self.in_feats_src, out_feats, bias=bias) - else: - self.lin = nn.Linear( - self.in_feats_src + self.in_feats_dst, out_feats, bias=bias - ) - - if self.aggregator_type == "pool": - self._aggr = "max" - self.pre_lin = nn.Linear(self.in_feats_src, self.in_feats_src) - else: - self.register_parameter("pre_lin", None) - - self.reset_parameters() - - def reset_parameters(self): - r"""Reinitialize learnable parameters.""" - self.lin.reset_parameters() - if self.pre_lin is not None: - self.pre_lin.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - feat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - max_in_degree: Optional[int] = None, - ) -> torch.Tensor: - r"""Forward computation. - - Parameters - ---------- - g : DGLGraph or SparseGraph - The graph. - feat : torch.Tensor or tuple - Node features. Shape: :math:`(|V|, D_{in})`. - max_in_degree : int - Maximum in-degree of destination nodes. When :attr:`g` is generated - from a neighbor sampler, the value should be set to the corresponding - :attr:`fanout`. This option is used to invoke the MFG-variant of - cugraph-ops kernel. - - Returns - ------- - torch.Tensor - Output node features. Shape: :math:`(|V|, D_{out})`. 
- """ - feat_bipartite = isinstance(feat, (list, tuple)) - graph_bipartite = feat_bipartite or self.aggregator_type == "pool" - - _graph = self.get_cugraph_ops_CSC( - g, is_bipartite=graph_bipartite, max_in_degree=max_in_degree - ) - - if feat_bipartite: - feat = (self.feat_drop(feat[0]), self.feat_drop(feat[1])) - else: - feat = self.feat_drop(feat) - - if self.aggregator_type == "pool": - if feat_bipartite: - feat = (self.pre_lin(feat[0]).relu(), feat[1]) - else: - feat = (self.pre_lin(feat).relu(), feat[: g.num_dst_nodes()]) - # force ctx.needs_input_grad=True in cugraph-ops autograd function - feat[0].requires_grad_() - feat[1].requires_grad_() - - out = ops_torch.operators.agg_concat_n2n(feat, _graph, self._aggr)[ - : g.num_dst_nodes() - ] - - if self.aggregator_type == "gcn": - out = out[:, : self.in_feats_src] - - out = self.lin(out) - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py b/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py deleted file mode 100644 index e77556f..0000000 --- a/python/cugraph-dgl/cugraph_dgl/nn/conv/transformerconv.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union - -from cugraph_dgl.nn.conv.base import BaseConv, SparseGraph -from cugraph.utilities.utils import import_optional - -dgl = import_optional("dgl") -torch = import_optional("torch") -nn = import_optional("torch.nn") -ops_torch = import_optional("pylibcugraphops.pytorch") - - -class TransformerConv(BaseConv): - r"""The graph transformer layer from the `"Masked Label Prediction: - Unified Message Passing Model for Semi-Supervised Classification" - `_ paper. - - Parameters - ---------- - in_node_feats : int or pair of ints - Input feature size. A pair denotes feature sizes of source and - destination nodes. - out_node_feats : int - Output feature size. - num_heads : int - Number of multi-head-attentions. - concat : bool, optional - If False, the multi-head attentions are averaged instead of concatenated. - Default: ``True``. - beta : bool, optional - If True, use a gated residual connection. Default: ``True``. - edge_feats: int, optional - Edge feature size. Default: ``None``. - bias: bool, optional - If True, learns a bias term. Default: ``True``. - root_weight: bool, optional - If False, will skip to learn a root weight matrix. Default: ``True``. 
- """ - - def __init__( - self, - in_node_feats: Union[int, tuple[int, int]], - out_node_feats: int, - num_heads: int, - concat: bool = True, - beta: bool = False, - edge_feats: Optional[int] = None, - bias: bool = True, - root_weight: bool = True, - ): - super().__init__() - - self.in_node_feats = in_node_feats - self.out_node_feats = out_node_feats - self.num_heads = num_heads - self.concat = concat - self.beta = beta - self.edge_feats = edge_feats - self.bias = bias - self.root_weight = root_weight - - if isinstance(in_node_feats, int): - in_node_feats = (in_node_feats, in_node_feats) - - self.lin_key = nn.Linear(in_node_feats[0], num_heads * out_node_feats) - self.lin_query = nn.Linear(in_node_feats[1], num_heads * out_node_feats) - self.lin_value = nn.Linear(in_node_feats[0], num_heads * out_node_feats) - - if edge_feats is not None: - self.lin_edge = nn.Linear( - edge_feats, num_heads * out_node_feats, bias=False - ) - else: - self.lin_edge = self.register_parameter("lin_edge", None) - - if concat: - self.lin_skip = nn.Linear( - in_node_feats[1], num_heads * out_node_feats, bias=bias - ) - if self.beta: - self.lin_beta = nn.Linear(3 * num_heads * out_node_feats, 1, bias=bias) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - else: - self.lin_skip = nn.Linear(in_node_feats[1], out_node_feats, bias=bias) - if self.beta: - self.lin_beta = nn.Linear(3 * out_node_feats, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_key.reset_parameters() - self.lin_query.reset_parameters() - self.lin_value.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - if self.lin_skip is not None: - self.lin_skip.reset_parameters() - if self.lin_beta is not None: - self.lin_beta.reset_parameters() - - def forward( - self, - g: Union[SparseGraph, dgl.DGLHeteroGraph], - nfeat: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], - efeat: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward computation. - - Parameters - ---------- - g: DGLGraph - The graph. - nfeat: torch.Tensor or a pair of torch.Tensor - Node feature tensor. A pair denotes features for source and - destination nodes, respectively. - efeat: torch.Tensor, optional - Edge feature tensor. Default: ``None``. - """ - feat_bipartite = isinstance(nfeat, (list, tuple)) - if not feat_bipartite: - nfeat = (nfeat, nfeat) - - _graph = self.get_cugraph_ops_CSC(g, is_bipartite=True) - - query = self.lin_query(nfeat[1][: g.num_dst_nodes()]) - key = self.lin_key(nfeat[0]) - value = self.lin_value(nfeat[0]) - - if efeat is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_feats must be set to allow " - f"edge features." 
- ) - efeat = self.lin_edge(efeat) - - out = ops_torch.operators.mha_simple_n2n( - key_emb=key, - query_emb=query, - value_emb=value, - graph=_graph, - num_heads=self.num_heads, - concat_heads=self.concat, - edge_emb=efeat, - norm_by_dim=True, - score_bias=None, - )[: g.num_dst_nodes()] - - if self.root_weight: - res = self.lin_skip(nfeat[1][: g.num_dst_nodes()]) - if self.lin_beta is not None: - beta = self.lin_beta(torch.cat([out, res, out - res], dim=-1)) - beta = beta.sigmoid() - out = beta * res + (1 - beta) * out - else: - out = out + res - - return out diff --git a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py index 07086f2..ee1183f 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/conftest.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -22,12 +22,6 @@ ) -def pytest_ignore_collect(collection_path, config): - """Return True to prevent considering this path for collection.""" - if "nn" in collection_path.name: - return True - - @pytest.fixture(scope="module") def dask_client(): # start_dask_client will check for the SCHEDULER_FILE and diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py deleted file mode 100644 index de27efc..0000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatconv.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
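Note: the deleted TransformerConv likewise had no Examples block. A minimal sketch under the same assumptions (CUDA device, optional dependencies installed; sizes are illustrative):

    import dgl
    import torch
    from cugraph_dgl.nn import TransformerConv

    device = "cuda"
    g = dgl.graph(([0, 1, 2, 3, 2, 5], [1, 2, 3, 4, 0, 3])).to(device)
    nfeat = torch.rand(6, 5).to(device)
    efeat = torch.rand(g.num_edges(), 3).to(device)
    conv = TransformerConv(5, 2, num_heads=2, edge_feats=3).to(device)
    out = conv(g, nfeat, efeat)  # (num_dst_nodes, num_heads * out_node_feats)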
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import GATConv as CuGraphGATConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("residual", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_gatconv_equality( - dgl_graph_1, - mode, - idx_type, - max_in_degree, - num_heads, - residual, - to_block, - sparse_format, -): - from dgl.nn.pytorch import GATConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if mode == "bipartite": - in_feats = (10, 3) - nfeat = ( - torch.randn(size[0], in_feats[0]).to(device), - torch.randn(size[1], in_feats[1]).to(device), - ) - elif mode == "share_weights": - in_feats = 5 - nfeat = ( - torch.randn(size[0], in_feats).to(device), - torch.randn(size[1], in_feats).to(device), - ) - else: - in_feats = 7 - nfeat = torch.randn(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - args = (in_feats, out_feats, num_heads) - kwargs = {"bias": False, "allow_zero_in_degree": True, "residual": residual} - - conv1 = GATConv(*args, **kwargs).to(device) - conv2 = CuGraphGATConv(*args, **kwargs).to(device) - - dim = num_heads * out_feats - with torch.no_grad(): - conv2.attn_weights[:dim].copy_(conv1.attn_l.flatten()) - conv2.attn_weights[dim:].copy_(conv1.attn_r.flatten()) - if mode == "bipartite": - conv2.lin_src.weight.copy_(conv1.fc_src.weight) - conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) - else: - conv2.lin.weight.copy_(conv1.fc.weight) - if residual and conv1.has_linear_res: - conv2.lin_res.weight.copy_(conv1.res_fc.weight) - - out1 = conv1(g, nfeat) - if sparse_format is not None: - out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out1 = torch.randn_like(out1) - grad_out2 = grad_out1.detach().clone() - out1.backward(grad_out1) - out2.backward(grad_out2) - - if mode == "bipartite": - assert torch.allclose( - conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - else: - assert torch.allclose(conv1.fc.weight.grad, conv2.lin.weight.grad, atol=ATOL) - - if residual and conv1.has_linear_res: - assert torch.allclose( - conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL - ) - - assert torch.allclose( - torch.cat((conv1.attn_l.grad, conv1.attn_r.grad), dim=0), - conv2.attn_weights.grad.view(2, num_heads, out_feats), - atol=1e-5, # Note: using a loosened tolerance here due to numerical error - ) - - -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) 
-@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_gatconv_edge_feats( - dgl_graph_1, - bias, - bipartite, - concat, - max_in_degree, - num_heads, - to_block, - use_edge_feats, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device) - - if to_block: - g = dgl.to_block(g) - - if bipartite: - in_feats = (10, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).to(device), - torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), - ) - else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) - out_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).to(device) - else: - edge_feats = None - efeat = None - - conv = CuGraphGATConv( - in_feats, - out_feats, - num_heads, - concat=concat, - edge_feats=edge_feats, - bias=bias, - allow_zero_in_degree=True, - ).to(device) - out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py deleted file mode 100644 index 2d26b7f..0000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_gatv2conv.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import GATv2Conv as CuGraphGATv2Conv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-5 - - -@pytest.mark.parametrize("mode", ["bipartite", "share_weights", "regular"]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("residual", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_gatv2conv_equality( - dgl_graph_1, - mode, - idx_type, - max_in_degree, - num_heads, - residual, - to_block, - sparse_format, -): - from dgl.nn.pytorch import GATv2Conv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if mode == "bipartite": - in_feats = (10, 3) - nfeat = ( - torch.randn(size[0], in_feats[0]).to(device), - torch.randn(size[1], in_feats[1]).to(device), - ) - elif mode == "share_weights": - in_feats = 5 - nfeat = ( - torch.randn(size[0], in_feats).to(device), - torch.randn(size[1], in_feats).to(device), - ) - else: - in_feats = 7 - nfeat = torch.randn(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - args = (in_feats, out_feats, num_heads) - kwargs = { - "bias": False, - "allow_zero_in_degree": True, - "residual": residual, - "share_weights": mode == "share_weights", - } - - conv1 = GATv2Conv(*args, **kwargs).to(device) - conv2 = CuGraphGATv2Conv(*args, **kwargs).to(device) - - with torch.no_grad(): - conv2.attn_weights.copy_(conv1.attn.flatten()) - conv2.lin_src.weight.copy_(conv1.fc_src.weight) - conv2.lin_dst.weight.copy_(conv1.fc_dst.weight) - if residual: - conv2.lin_res.weight.copy_(conv1.res_fc.weight) - - out1 = conv1(g, nfeat) - if sparse_format is not None: - out2 = conv2(sg, nfeat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, nfeat, max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out1 = torch.randn_like(out1) - grad_out2 = grad_out1.detach().clone() - out1.backward(grad_out1) - out2.backward(grad_out2) - - assert torch.allclose( - conv1.fc_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.fc_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - - if residual: - assert torch.allclose( - conv1.res_fc.weight.grad, conv2.lin_res.weight.grad, atol=ATOL - ) - - assert torch.allclose( - conv1.attn.grad, - conv2.attn_weights.grad.view(1, num_heads, out_feats), - atol=ATOL, - ) - - -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_heads", [1, 2, 7]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -def test_gatv2conv_edge_feats( - dgl_graph_1, - bias, - bipartite, - concat, - max_in_degree, - num_heads, - 
to_block, - use_edge_feats, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device) - - if to_block: - g = dgl.to_block(g) - - if bipartite: - in_feats = (10, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_feats[0]).to(device), - torch.rand(g.num_dst_nodes(), in_feats[1]).to(device), - ) - else: - in_feats = 10 - nfeat = torch.rand(g.num_src_nodes(), in_feats).to(device) - out_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats).to(device) - else: - edge_feats = None - efeat = None - - conv = CuGraphGATv2Conv( - in_feats, - out_feats, - num_heads, - concat=concat, - edge_feats=edge_feats, - bias=bias, - allow_zero_in_degree=True, - ).to(device) - out = conv(g, nfeat, efeat=efeat, max_in_degree=max_in_degree) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py deleted file mode 100644 index b5d3686..0000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_relgraphconv.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import RelGraphConv as CuGraphRelGraphConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("num_bases", [1, 2, 5]) -@pytest.mark.parametrize("regularizer", [None, "basis"]) -@pytest.mark.parametrize("self_loop", [False, True]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_relgraphconv_equality( - dgl_graph_1, - idx_type, - max_in_degree, - num_bases, - regularizer, - self_loop, - to_block, - sparse_format, -): - from dgl.nn.pytorch import RelGraphConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - in_feat, out_feat, num_rels = 10, 2, 3 - args = (in_feat, out_feat, num_rels) - kwargs = { - "num_bases": num_bases, - "regularizer": regularizer, - "bias": False, - "self_loop": self_loop, - } - - g.edata[dgl.ETYPE] = torch.randint(num_rels, (g.num_edges(),)).to(device) - size = (g.num_src_nodes(), g.num_dst_nodes()) - feat = torch.rand(g.num_src_nodes(), in_feat).to(device) - - if sparse_format == "coo": - sg = SparseGraph( - size=size, - src_ids=g.edges()[0], - dst_ids=g.edges()[1], - values=g.edata[dgl.ETYPE], - formats="csc", - ) - elif sparse_format == "csc": - offsets, indices, perm = g.adj_tensors("csc") - etypes = g.edata[dgl.ETYPE][perm] - sg = SparseGraph( - size=size, src_ids=indices, cdst_ids=offsets, values=etypes, formats="csc" - ) - - conv1 = 
RelGraphConv(*args, **kwargs).to(device) - conv2 = CuGraphRelGraphConv(*args, **kwargs, apply_norm=False).to(device) - - with torch.no_grad(): - if self_loop: - conv2.W[:-1].copy_(conv1.linear_r.W) - conv2.W[-1].copy_(conv1.loop_weight) - else: - conv2.W.copy_(conv1.linear_r.W) - - if regularizer is not None: - conv2.coeff.copy_(conv1.linear_r.coeff) - - out1 = conv1(g, feat, g.edata[dgl.ETYPE]) - - if sparse_format is not None: - out2 = conv2(sg, feat, sg.values(), max_in_degree=max_in_degree) - else: - out2 = conv2(g, feat, g.edata[dgl.ETYPE], max_in_degree=max_in_degree) - - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.randn_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - - end = -1 if self_loop else None - assert torch.allclose(conv1.linear_r.W.grad, conv2.W.grad[:end], atol=ATOL) - - if self_loop: - assert torch.allclose(conv1.loop_weight.grad, conv2.W.grad[-1], atol=ATOL) - - if regularizer is not None: - assert torch.allclose(conv1.linear_r.coeff.grad, conv2.coeff.grad, atol=ATOL) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py deleted file mode 100644 index 3f1c2b1..0000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_sageconv.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
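Note: the weight copies and gradient checks in test_relgraphconv above depend on the deleted RelGraphConv's W layout: one in_feats-by-out_feats block per relation (or per basis), with the self-loop weight in the trailing block when self_loop=True, hence the end = -1 if self_loop else None slicing. Schematically (shapes follow the deleted __init__; names are illustrative):

    import torch

    num_rels, in_feats, out_feats = 3, 10, 2
    W = torch.empty(num_rels + 1, in_feats, out_feats)  # trailing self-loop slot
    per_relation = W[:-1]  # matched against conv1.linear_r.W in the test
    self_loop_w = W[-1]    # matched against conv1.loop_weight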
- -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import SAGEConv as CuGraphSAGEConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("aggr", ["mean", "pool"]) -@pytest.mark.parametrize("bias", [False, True]) -@pytest.mark.parametrize("bipartite", [False, True]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("max_in_degree", [None, 8]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_sageconv_equality( - dgl_graph_1, aggr, bias, bipartite, idx_type, max_in_degree, to_block, sparse_format -): - from dgl.nn.pytorch import SAGEConv - - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - - if bipartite: - in_feats = (5, 3) - feat = ( - torch.rand(size[0], in_feats[0], requires_grad=True).to(device), - torch.rand(size[1], in_feats[1], requires_grad=True).to(device), - ) - else: - in_feats = 5 - feat = torch.rand(size[0], in_feats).to(device) - out_feats = 2 - - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - kwargs = {"aggregator_type": aggr, "bias": bias} - conv1 = SAGEConv(in_feats, out_feats, **kwargs).to(device) - conv2 = CuGraphSAGEConv(in_feats, out_feats, **kwargs).to(device) - - in_feats_src = conv2.in_feats_src - with torch.no_grad(): - conv2.lin.weight[:, :in_feats_src].copy_(conv1.fc_neigh.weight) - conv2.lin.weight[:, in_feats_src:].copy_(conv1.fc_self.weight) - if bias: - conv2.lin.bias.copy_(conv1.fc_self.bias) - if aggr == "pool": - conv2.pre_lin.weight.copy_(conv1.fc_pool.weight) - conv2.pre_lin.bias.copy_(conv1.fc_pool.bias) - - out1 = conv1(g, feat) - if sparse_format is not None: - out2 = conv2(sg, feat, max_in_degree=max_in_degree) - else: - out2 = conv2(g, feat, max_in_degree=max_in_degree) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_out = torch.randn_like(out1) - out1.backward(grad_out) - out2.backward(grad_out) - assert torch.allclose( - conv1.fc_neigh.weight.grad, - conv2.lin.weight.grad[:, :in_feats_src], - atol=ATOL, - ) - assert torch.allclose( - conv1.fc_self.weight.grad, - conv2.lin.weight.grad[:, in_feats_src:], - atol=ATOL, - ) - if bias: - assert torch.allclose(conv1.fc_self.bias.grad, conv2.lin.bias.grad, atol=ATOL) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py b/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py deleted file mode 100644 index 28d13de..0000000 --- a/python/cugraph-dgl/cugraph_dgl/tests/nn/test_transformerconv.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
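Note: the deleted SAGEConv test above folds DGL's two weight matrices into a single fused linear layer; the column split mirrors the deleted forward(), where cugraph-ops returns the aggregated neighbor features concatenated with the destination features. Schematically (hypothetical shapes):

    import torch

    in_feats_src, in_feats_dst, out_feats = 5, 5, 2
    lin_weight = torch.empty(out_feats, in_feats_src + in_feats_dst)
    neigh_cols = lin_weight[:, :in_feats_src]  # corresponds to DGL's fc_neigh
    self_cols = lin_weight[:, in_feats_src:]   # corresponds to DGL's fc_self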
-# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from cugraph_dgl.nn.conv.base import SparseGraph -from cugraph_dgl.nn import TransformerConv - -dgl = pytest.importorskip("dgl", reason="DGL not available") -torch = pytest.importorskip("torch", reason="PyTorch not available") - -ATOL = 1e-6 - - -@pytest.mark.parametrize("beta", [False, True]) -@pytest.mark.parametrize("bipartite_node_feats", [False, True]) -@pytest.mark.parametrize("concat", [False, True]) -@pytest.mark.parametrize("idx_type", [torch.int32, torch.int64]) -@pytest.mark.parametrize("num_heads", [1, 3, 4]) -@pytest.mark.parametrize("to_block", [False, True]) -@pytest.mark.parametrize("use_edge_feats", [False, True]) -@pytest.mark.parametrize("sparse_format", ["coo", "csc", None]) -def test_transformerconv( - dgl_graph_1, - beta, - bipartite_node_feats, - concat, - idx_type, - num_heads, - to_block, - use_edge_feats, - sparse_format, -): - torch.manual_seed(12345) - device = torch.device("cuda") - g = dgl_graph_1.to(device).astype(idx_type) - - if to_block: - g = dgl.to_block(g) - - size = (g.num_src_nodes(), g.num_dst_nodes()) - if sparse_format == "coo": - sg = SparseGraph( - size=size, src_ids=g.edges()[0], dst_ids=g.edges()[1], formats="csc" - ) - elif sparse_format == "csc": - offsets, indices, _ = g.adj_tensors("csc") - sg = SparseGraph(size=size, src_ids=indices, cdst_ids=offsets, formats="csc") - - if bipartite_node_feats: - in_node_feats = (5, 3) - nfeat = ( - torch.rand(g.num_src_nodes(), in_node_feats[0], device=device), - torch.rand(g.num_dst_nodes(), in_node_feats[1], device=device), - ) - else: - in_node_feats = 3 - nfeat = torch.rand(g.num_src_nodes(), in_node_feats, device=device) - out_node_feats = 2 - - if use_edge_feats: - edge_feats = 3 - efeat = torch.rand(g.num_edges(), edge_feats, device=device) - else: - edge_feats = None - efeat = None - - conv = TransformerConv( - in_node_feats, - out_node_feats, - num_heads=num_heads, - concat=concat, - beta=beta, - edge_feats=edge_feats, - ).to(device) - - if sparse_format is not None: - out = conv(sg, nfeat, efeat) - else: - out = conv(g, nfeat, efeat) - - grad_out = torch.randn_like(out) - out.backward(grad_out) diff --git a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py index df60304..4ac4346 100644 --- a/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py +++ b/python/cugraph-dgl/cugraph_dgl/tests/test_utils.py @@ -180,7 +180,6 @@ def test_get_source_destination_range(): assert output_d == expected_output -@pytest.mark.skip(reason="Skipping due to missing cugraph-ops backend.") def test__create_homogeneous_cugraph_dgl_nn_sparse_graph(): tensor_d = { "sources_range": 1, @@ -198,7 +197,6 @@ def test__create_homogeneous_cugraph_dgl_nn_sparse_graph(): assert isinstance(sparse_graph, cugraph_dgl.nn.SparseGraph) -@pytest.mark.skip(reason="Skipping due to missing cugraph-ops backend.") def test_create_homogeneous_sampled_graphs_from_dataframe_csc(): df = get_dummy_sampled_df_csc() batches = create_homogeneous_sampled_graphs_from_dataframe_csc(df) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/__init__.py deleted file mode 100644 index 65136a8..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .conv import * diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py deleted file mode 100644 index a0cda91..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -HAVE_CUGRAPH_OPS = False -try: - import pylibcugraphops - - HAVE_CUGRAPH_OPS = True -except ImportError: - pass -except Exception as e: - warnings.warn(f"Unexpected error while importing pylibcugraphops: {e}") - -if HAVE_CUGRAPH_OPS: - from .gat_conv import GATConv - from .gatv2_conv import GATv2Conv - from .hetero_gat_conv import HeteroGATConv - from .rgcn_conv import RGCNConv - from .sage_conv import SAGEConv - from .transformer_conv import TransformerConv - - __all__ = [ - "GATConv", - "GATv2Conv", - "HeteroGATConv", - "RGCNConv", - "SAGEConv", - "TransformerConv", - ] diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py deleted file mode 100644 index 713448a..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/base.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
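[Editor's note — illustrative aside, not part of the patch: the deleted conv/__init__.py above gates every layer import on pylibcugraphops being importable. A minimal sketch of the same availability probe, using importlib.util.find_spec instead of the try/except guard in the original:]

import importlib.util

# Equivalent availability check to the HAVE_CUGRAPH_OPS guard above.
HAVE_CUGRAPH_OPS = importlib.util.find_spec("pylibcugraphops") is not None

if HAVE_CUGRAPH_OPS:
    pass  # safe to import the cugraph-ops-backed conv layers here
else:
    pass  # fall back to upstream layers, or raise a clear error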
- -import warnings -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -import pylibcugraphops.pytorch - - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - -# A tuple of (row, colptr, num_src_nodes) -CSC = Tuple[torch.Tensor, torch.Tensor, int] - - -class BaseConv(torch.nn.Module): # pragma: no cover - r"""An abstract base class for implementing cugraph-ops message passing layers.""" - - def reset_parameters(self): - r"""Resets all learnable parameters of the module.""" - pass - - @staticmethod - def to_csc( - edge_index: torch.Tensor, - size: Optional[Tuple[int, int]] = None, - edge_attr: Optional[torch.Tensor] = None, - ) -> Union[CSC, Tuple[CSC, torch.Tensor],]: - r"""Returns a CSC representation of an :obj:`edge_index` tensor to be - used as input to cugraph-ops conv layers. - - Args: - edge_index (torch.Tensor): The edge indices. - size ((int, int), optional). The shape of :obj:`edge_index` in each - dimension. (default: :obj:`None`) - edge_attr (torch.Tensor, optional): The edge features. - (default: :obj:`None`) - """ - if size is None: - warnings.warn( - f"Inferring the graph size from 'edge_index' causes " - f"a decline in performance and does not work for " - f"bipartite graphs. To suppress this warning, pass " - f"the 'size' explicitly in '{__name__}.to_csc()'." - ) - num_src_nodes = num_dst_nodes = int(edge_index.max()) + 1 - else: - num_src_nodes, num_dst_nodes = size - - row, col = edge_index - col, perm = torch_geometric.utils.index_sort(col, max_value=num_dst_nodes) - row = row[perm] - - colptr = torch_geometric.utils.sparse.index2ptr(col, num_dst_nodes) - - if edge_attr is not None: - return (row, colptr, num_src_nodes), edge_attr[perm] - - return row, colptr, num_src_nodes - - def get_cugraph( - self, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - bipartite: bool = False, - max_num_neighbors: Optional[int] = None, - ) -> Tuple[pylibcugraphops.pytorch.CSC, Optional[torch.Tensor]]: - r"""Constructs a :obj:`cugraph-ops` graph object from CSC representation. - Supports both bipartite and non-bipartite graphs. - - Args: - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - bipartite (bool): If set to :obj:`True`, will create the bipartite - structure in cugraph-ops. (default: :obj:`False`) - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. - (default: :obj:`None`) - """ - perm = None - if isinstance(edge_index, torch_geometric.EdgeIndex): - edge_index, perm = edge_index.sort_by("col") - num_src_nodes = edge_index.get_sparse_size(0) - (colptr, row), _ = edge_index.get_csc() - else: - row, colptr, num_src_nodes = edge_index - - if not row.is_cuda: - raise RuntimeError( - f"'{self.__class__.__name__}' requires GPU-based processing " - f"but got CPU tensor." 
- ) - - if max_num_neighbors is None: - max_num_neighbors = -1 - - return ( - pylibcugraphops.pytorch.CSC( - offsets=colptr, - indices=row, - num_src_nodes=num_src_nodes, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, - ), - perm, - ) - - def get_typed_cugraph( - self, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_type: torch.Tensor, - num_edge_types: Optional[int] = None, - bipartite: bool = False, - max_num_neighbors: Optional[int] = None, - ) -> Tuple[pylibcugraphops.pytorch.HeteroCSC, Optional[torch.Tensor]]: - r"""Constructs a typed :obj:`cugraph` graph object from a CSC - representation where each edge corresponds to a given edge type. - Supports both bipartite and non-bipartite graphs. - - Args: - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - edge_type (torch.Tensor): The edge type. - num_edge_types (int, optional): The maximum number of edge types. - When not given, will be computed on-the-fly, leading to - slightly worse performance. (default: :obj:`None`) - bipartite (bool): If set to :obj:`True`, will create the bipartite - structure in cugraph-ops. (default: :obj:`False`) - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. - (default: :obj:`None`) - """ - if num_edge_types is None: - num_edge_types = int(edge_type.max()) + 1 - - if max_num_neighbors is None: - max_num_neighbors = -1 - - perm = None - if isinstance(edge_index, torch_geometric.EdgeIndex): - edge_index, perm = edge_index.sort_by("col") - edge_type = edge_type[perm] - num_src_nodes = edge_index.get_sparse_size(0) - (colptr, row), _ = edge_index.get_csc() - else: - row, colptr, num_src_nodes = edge_index - edge_type = edge_type.int() - - return ( - pylibcugraphops.pytorch.HeteroCSC( - offsets=colptr, - indices=row, - edge_types=edge_type, - num_src_nodes=num_src_nodes, - num_edge_types=num_edge_types, - dst_max_in_degree=max_num_neighbors, - is_bipartite=bipartite, - ), - perm, - ) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor): The node features. - edge_index (EdgeIndex, (torch.Tensor, torch.Tensor, int)): The edge - indices, or a tuple of :obj:`(row, colptr, num_src_nodes)` for - CSC representation. - """ - raise NotImplementedError diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py deleted file mode 100644 index 981b1c5..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gat_conv.py +++ /dev/null @@ -1,259 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
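[Editor's note — illustrative aside, not part of the patch: BaseConv.to_csc and get_cugraph above revolve around the CSC triple (row, colptr, num_src_nodes). The sketch below builds that triple with plain torch ops rather than the torch_geometric index_sort/index2ptr helpers the original uses; the 4-edge graph is an assumed toy input.]

import torch

edge_index = torch.tensor([[0, 1, 1, 2],   # source nodes
                           [1, 0, 2, 1]])  # destination nodes
num_src_nodes = num_dst_nodes = 3

row, col = edge_index
col, perm = torch.sort(col)                # order edges by destination
row = row[perm]

# colptr[d] .. colptr[d+1] spans the rows whose destination is d.
colptr = torch.zeros(num_dst_nodes + 1, dtype=torch.long)
colptr[1:] = torch.cumsum(torch.bincount(col, minlength=num_dst_nodes), 0)

csc = (row, colptr, num_src_nodes)         # the tuple get_cugraph consumes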
- -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class GATConv(BaseConv): - r"""The graph attentional operator from the `"Graph Attention Networks" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k] - \right)\right)}. - - If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`, - the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_j - \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,j}]\right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathrm{LeakyReLU}\left(\mathbf{a}^{\top} - [\mathbf{\Theta}\mathbf{x}_i \, \Vert \, \mathbf{\Theta}\mathbf{x}_k - \, \Vert \, \mathbf{\Theta}_{e} \mathbf{e}_{i,k}]\right)\right)}. - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - negative_slope (float, optional): LeakyReLU angle of the negative - slope. (default: :obj:`0.2`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). (default: :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. 
(default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - edge_dim: Optional[int] = None, - bias: bool = True, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.concat = concat - self.negative_slope = negative_slope - self.edge_dim = edge_dim - - Linear = torch_geometric.nn.Linear - - if isinstance(in_channels, int): - self.lin = Linear( - in_channels, - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - else: - self.lin_src = Linear( - in_channels[0], - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - self.lin_dst = Linear( - in_channels[1], - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - - if edge_dim is not None: - self.lin_edge = Linear( - edge_dim, - heads * out_channels, - bias=False, - weight_initializer="glorot", - ) - self.att = nn.Parameter(torch.Tensor(3 * heads * out_channels)) - else: - self.register_parameter("lin_edge", None) - self.att = nn.Parameter(torch.Tensor(2 * heads * out_channels)) - - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(heads * out_channels)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - if isinstance(self.in_channels, int): - self.lin.reset_parameters() - else: - self.lin_src.reset_parameters() - self.lin_dst.reset_parameters() - - torch_geometric.nn.inits.glorot( - self.att.view(-1, self.heads, self.out_channels) - ) - - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - max_num_neighbors: Optional[int] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - high_precision_dgrad: bool = False, - high_precision_wgrad: bool = False, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - max_num_neighbors (int, optional): The maximum number of neighbors - of a destination node. When enabled, it allows models to use - the message-flow-graph primitives in cugraph-ops. - (default: :obj:`None`) - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - high_precision_dgrad: bool, default=False - Optional flag indicating whether gradients for inputs in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. - high_precision_wgrad: bool, default=False - Optional flag indicating whether gradients for weights in half precision - are kept in single precision as long as possible and only casted to - the corresponding input type at the very end. 
- """ - bipartite = not isinstance(x, torch.Tensor) - graph, perm = self.get_cugraph( - edge_index=edge_index, - bipartite=bipartite, - max_num_neighbors=max_num_neighbors, - ) - - if deterministic_dgrad: - graph.add_reverse_graph() - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." - ) - if edge_attr.dim() == 1: - edge_attr = edge_attr.view(-1, 1) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - if bipartite: - if not hasattr(self, "lin_src"): - raise RuntimeError( - f"{self.__class__.__name__}.in_channels must be a pair of " - f"integers to allow bipartite node features, but got " - f"{self.in_channels}." - ) - x_src = self.lin_src(x[0]) - x_dst = self.lin_dst(x[1]) - else: - if not hasattr(self, "lin"): - raise RuntimeError( - f"{self.__class__.__name__}.in_channels is expected to be an " - f"integer, but got {self.in_channels}." - ) - x = self.lin(x) - - out = mha_gat_n2n( - (x_src, x_dst) if bipartite else x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - high_precision_dgrad=high_precision_dgrad, - high_precision_wgrad=high_precision_wgrad, - ) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py deleted file mode 100644 index ebb30de..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/gatv2_conv.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_v2_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class GATv2Conv(BaseConv): - r"""The GATv2 operator from the `"How Attentive are Graph Attention - Networks?" `_ paper, which fixes the - static attention problem of the standard - :class:`~torch_geometric.conv.GATConv` layer. - Since the linear layers in the standard GAT are applied right after each - other, the ranking of attended nodes is unconditioned on the query node. - In contrast, in :class:`GATv2`, every node can attend to any other node. - - .. math:: - \mathbf{x}^{\prime}_i = \alpha_{i,i}\mathbf{\Theta}\mathbf{x}_{i} + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j}\mathbf{\Theta}\mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. 
math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_j] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_k] - \right)\right)}. - - If the graph has multi-dimensional edge features :math:`\mathbf{e}_{i,j}`, - the attention coefficients :math:`\alpha_{i,j}` are computed as - - .. math:: - \alpha_{i,j} = - \frac{ - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_j \, \Vert \, \mathbf{e}_{i,j}] - \right)\right)} - {\sum_{k \in \mathcal{N}(i) \cup \{ i \}} - \exp\left(\mathbf{a}^{\top}\mathrm{LeakyReLU}\left(\mathbf{\Theta} - [\mathbf{x}_i \, \Vert \, \mathbf{x}_k \, \Vert \, \mathbf{e}_{i,k}] - \right)\right)}. - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - negative_slope (float, optional): LeakyReLU angle of the negative - slope. (default: :obj:`0.2`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). (default: :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - share_weights (bool, optional): If set to :obj:`True`, the same matrix - will be applied to the source and the target node of every edge. 
- (default: :obj:`False`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - edge_dim: Optional[int] = None, - bias: bool = True, - share_weights: bool = False, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.concat = concat - self.negative_slope = negative_slope - self.edge_dim = edge_dim - self.share_weights = share_weights - - Linear = torch_geometric.nn.Linear - - if isinstance(in_channels, int): - self.lin_src = Linear( - in_channels, - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - - if share_weights: - self.lin_dst = self.lin_src - else: - self.lin_dst = Linear( - in_channels, - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - else: - self.lin_src = Linear( - in_channels[0], - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - self.lin_dst = Linear( - in_channels[1], - heads * out_channels, - bias=bias, - weight_initializer="glorot", - ) - - self.att = nn.Parameter(torch.Tensor(heads * out_channels)) - - if edge_dim is not None: - self.lin_edge = Linear( - edge_dim, heads * out_channels, bias=False, weight_initializer="glorot" - ) - else: - self.register_parameter("lin_edge", None) - - if bias and concat: - self.bias = nn.Parameter(torch.Tensor(heads * out_channels)) - elif bias and not concat: - self.bias = nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_src.reset_parameters() - self.lin_dst.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - - torch_geometric.nn.inits.glorot( - self.att.view(-1, self.heads, self.out_channels) - ) - - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - deterministic_dgrad: bool = False, - deterministic_wgrad: bool = False, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - deterministic_dgrad : bool, default=False - Optional flag indicating whether the feature gradients - are computed deterministically using a dedicated workspace buffer. - deterministic_wgrad: bool, default=False - Optional flag indicating whether the weight gradients - are computed deterministically using a dedicated workspace buffer. - """ - bipartite = not isinstance(x, torch.Tensor) or not self.share_weights - graph, perm = self.get_cugraph(edge_index, bipartite=bipartite) - if deterministic_dgrad: - graph.add_reverse_graph() - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." 
- ) - if edge_attr.dim() == 1: - edge_attr = edge_attr.view(-1, 1) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - if bipartite: - if isinstance(x, torch.Tensor): - x = (x, x) - x_src = self.lin_src(x[0]) - x_dst = self.lin_dst(x[1]) - else: - x = self.lin_src(x) - - out = mha_gat_v2_n2n( - (x_src, x_dst) if bipartite else x, - self.att, - graph, - num_heads=self.heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat, - edge_feat=edge_attr, - deterministic_dgrad=deterministic_dgrad, - deterministic_wgrad=deterministic_wgrad, - ) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py deleted file mode 100644 index a73dd8e..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/hetero_gat_conv.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Union -from collections import defaultdict - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_gat_n2n - -from .base import BaseConv -from cugraph_pyg.utils.imports import package_available - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class HeteroGATConv(BaseConv): - r"""The graph attentional operator on heterogeneous graphs, where a separate - `GATConv` is applied on the homogeneous graph for each edge type. Compared - with directly wrapping `GATConv`s with `HeteroConv`, `HeteroGATConv` fuses - all the linear transformations associated with each node type into one - GEMM call, to improve the performance on GPUs. - - Parameters - ---------- - in_channels : int or Dict[str, int] - Size of each input sample of every node type. - - out_channels : int - Size of each output sample. - - node_types : List[str] - List of node types. - - edge_types : List[Tuple[str, str, str]] - List of edge types. - - heads : int, optional (default=1) - Number of multi-head-attentions. - - concat : bool, optional (default=True) - If set to :obj:`False`, the multi-head attentions are averaged instead - of concatenated. - - negative_slope : float, optional (default=0.2) - LeakyReLU angle of the negative slope. - - bias : bool, optional (default=True) - If set to :obj:`False`, the layer will not learn an additive bias. - - aggr : str, optional (default="sum") - The aggregation scheme to use for grouping node embeddings generated by - different relations. Choose from "sum", "mean", "min", "max". 
- """ - - def __init__( - self, - in_channels: Union[int, dict[str, int]], - out_channels: int, - node_types: list[str], - edge_types: list[tuple[str, str, str]], - heads: int = 1, - concat: bool = True, - negative_slope: float = 0.2, - bias: bool = True, - aggr: str = "sum", - ): - if not package_available("torch_geometric>=2.4.0"): - raise RuntimeError( - f"{self.__class__.__name__} requires torch_geometric>=2.4.0." - ) - - super().__init__() - - if isinstance(in_channels, int): - in_channels = dict.fromkeys(node_types, in_channels) - self.in_channels = in_channels - self.out_channels = out_channels - - self.node_types = node_types - self.edge_types = edge_types - self.num_heads = heads - self.concat_heads = concat - - self.negative_slope = negative_slope - self.aggr = aggr - - self.relations_per_ntype = defaultdict(lambda: ([], [])) - - lin_weights = dict.fromkeys(self.node_types) - attn_weights = dict.fromkeys(self.edge_types) - biases = dict.fromkeys(self.edge_types) - - ParameterDict = torch_geometric.nn.parameter_dict.ParameterDict - - for edge_type in self.edge_types: - src_type, _, dst_type = edge_type - self.relations_per_ntype[src_type][0].append(edge_type) - if src_type != dst_type: - self.relations_per_ntype[dst_type][1].append(edge_type) - - attn_weights[edge_type] = torch.empty( - 2 * self.num_heads * self.out_channels - ) - - if bias and concat: - biases[edge_type] = torch.empty(self.num_heads * out_channels) - elif bias: - biases[edge_type] = torch.empty(out_channels) - else: - biases[edge_type] = None - - for ntype in self.node_types: - n_src_rel = len(self.relations_per_ntype[ntype][0]) - n_dst_rel = len(self.relations_per_ntype[ntype][1]) - n_rel = n_src_rel + n_dst_rel - - lin_weights[ntype] = torch.empty( - (n_rel * self.num_heads * self.out_channels, self.in_channels[ntype]) - ) - - self.lin_weights = ParameterDict(lin_weights) - self.attn_weights = ParameterDict(attn_weights) - - if bias: - self.bias = ParameterDict(biases) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def split_tensors( - self, x_fused_dict: dict[str, torch.Tensor], dim: int - ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: - """Split fused tensors into chunks based on edge types. - - Parameters - ---------- - x_fused_dict : dict[str, torch.Tensor] - A dictionary to hold node feature for each node type. The key is - node type; the value is a fused tensor that account for all - relations for that node type. - - dim : int - Dimension along which to split the fused tensor. - - Returns - ------- - x_src_dict : dict[str, torch.Tensor] - A dictionary to hold source node feature for each relation graph. - - x_dst_dict : dict[str, torch.Tensor] - A dictionary to hold destination node feature for each relation graph. 
- """ - x_src_dict = dict.fromkeys(self.edge_types) - x_dst_dict = dict.fromkeys(self.edge_types) - - for ntype, t in x_fused_dict.items(): - n_src_rel = len(self.relations_per_ntype[ntype][0]) - n_dst_rel = len(self.relations_per_ntype[ntype][1]) - n_rel = n_src_rel + n_dst_rel - t_list = torch.chunk(t, chunks=n_rel, dim=dim) - - for i, src_rel in enumerate(self.relations_per_ntype[ntype][0]): - x_src_dict[src_rel] = t_list[i] - - for i, dst_rel in enumerate(self.relations_per_ntype[ntype][1]): - x_dst_dict[dst_rel] = t_list[i + n_src_rel] - - return x_src_dict, x_dst_dict - - def reset_parameters(self, seed: Optional[int] = None): - if seed is not None: - torch.manual_seed(seed) - - w_src, w_dst = self.split_tensors(self.lin_weights, dim=0) - - for edge_type in self.edge_types: - src_type, _, dst_type = edge_type - - # lin_src - torch_geometric.nn.inits.glorot(w_src[edge_type]) - - # lin_dst - if src_type != dst_type: - torch_geometric.nn.inits.glorot(w_dst[edge_type]) - - # attn_weights - torch_geometric.nn.inits.glorot( - self.attn_weights[edge_type].view(-1, self.num_heads, self.out_channels) - ) - - # bias - if self.bias is not None: - torch_geometric.nn.inits.zeros(self.bias[edge_type]) - - def forward( - self, - x_dict: dict[str, torch.Tensor], - edge_index_dict: dict[tuple[str, str, str], torch.Tensor], - ) -> dict[str, torch.Tensor]: - feat_dict = dict.fromkeys(x_dict.keys()) - - for ntype, x in x_dict.items(): - feat_dict[ntype] = x @ self.lin_weights[ntype].T - - x_src_dict, x_dst_dict = self.split_tensors(feat_dict, dim=1) - - out_dict = defaultdict(list) - - for edge_type, edge_index in edge_index_dict.items(): - src_type, _, dst_type = edge_type - - csc = BaseConv.to_csc( - edge_index, (x_dict[src_type].size(0), x_dict[dst_type].size(0)) - ) - - if src_type == dst_type: - graph, _ = self.get_cugraph( - csc, - bipartite=False, - ) - out = mha_gat_n2n( - x_src_dict[edge_type], - self.attn_weights[edge_type], - graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat_heads, - ) - - else: - graph, _ = self.get_cugraph( - csc, - bipartite=True, - ) - out = mha_gat_n2n( - (x_src_dict[edge_type], x_dst_dict[edge_type]), - self.attn_weights[edge_type], - graph, - num_heads=self.num_heads, - activation="LeakyReLU", - negative_slope=self.negative_slope, - concat_heads=self.concat_heads, - ) - - if self.bias is not None: - out = out + self.bias[edge_type] - - out_dict[dst_type].append(out) - - for key, value in out_dict.items(): - out_dict[key] = torch_geometric.nn.conv.hetero_conv.group(value, self.aggr) - - return out_dict diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py deleted file mode 100644 index 13fa08d..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/rgcn_conv.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Optional, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import agg_hg_basis_n2n_post - -from .base import BaseConv, CSC - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class RGCNConv(BaseConv): # pragma: no cover - r"""The relational graph convolutional operator from the `"Modeling - Relational Data with Graph Convolutional Networks" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{\Theta}_{\textrm{root}} \cdot - \mathbf{x}_i + \sum_{r \in \mathcal{R}} \sum_{j \in \mathcal{N}_r(i)} - \frac{1}{|\mathcal{N}_r(i)|} \mathbf{\Theta}_r \cdot \mathbf{x}_j, - - where :math:`\mathcal{R}` denotes the set of relations, *i.e.* edge types. - Edge type needs to be a one-dimensional :obj:`torch.long` tensor which - stores a relation identifier - :math:`\in \{ 0, \ldots, |\mathcal{R}| - 1\}` for each edge. - - Args: - in_channels (int): Size of each input sample. - out_channels (int): Size of each output sample. - num_relations (int): Number of relations. - num_bases (int, optional): If set, this layer will use the - basis-decomposition regularization scheme where :obj:`num_bases` - denotes the number of bases to use. (default: :obj:`None`) - aggr (str, optional): The aggregation scheme to use - (:obj:`"add"`, :obj:`"mean"`, :obj:`"sum"`). - (default: :obj:`"mean"`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add transformed root node features to the output. - (default: :obj:`True`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_relations: int, - num_bases: Optional[int] = None, - aggr: str = "mean", - root_weight: bool = True, - bias: bool = True, - ): - super().__init__() - - if aggr not in ["mean", "sum", "add"]: - raise ValueError( - f"Aggregation function must be chosen from 'mean', 'sum' or " - f"'add', but got '{aggr}'." 
- ) - - self.in_channels = in_channels - self.out_channels = out_channels - self.num_relations = num_relations - self.num_bases = num_bases - self.aggr = aggr - self.root_weight = root_weight - - dim_root_weight = 1 if root_weight else 0 - - if num_bases is not None: - self.weight = torch.nn.Parameter( - torch.empty(num_bases + dim_root_weight, in_channels, out_channels) - ) - self.comp = torch.nn.Parameter(torch.empty(num_relations, num_bases)) - else: - self.weight = torch.nn.Parameter( - torch.empty(num_relations + dim_root_weight, in_channels, out_channels) - ) - self.register_parameter("comp", None) - - if bias: - self.bias = torch.nn.Parameter(torch.empty(out_channels)) - else: - self.register_parameter("bias", None) - - self.reset_parameters() - - def reset_parameters(self): - end = -1 if self.root_weight else None - torch_geometric.nn.inits.glorot(self.weight[:end]) - torch_geometric.nn.inits.glorot(self.comp) - if self.root_weight: - torch_geometric.nn.inits.glorot(self.weight[-1]) - torch_geometric.nn.inits.zeros(self.bias) - - def forward( - self, - x: torch.Tensor, - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_type: torch.Tensor, - max_num_neighbors: Optional[int] = None, - ) -> torch.Tensor: - - graph, _ = self.get_typed_cugraph( - edge_index, - edge_type, - self.num_relations, - max_num_neighbors=max_num_neighbors, - ) - - out = agg_hg_basis_n2n_post( - x, - self.comp, - graph, - concat_own=self.root_weight, - norm_by_out_degree=bool(self.aggr == "mean"), - ) - - out = out @ self.weight.view(-1, self.out_channels) - - if self.bias is not None: - out = out + self.bias - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, num_relations={self.num_relations})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py deleted file mode 100644 index 65dc99d..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/sage_conv.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import agg_concat_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -torch_geometric = import_optional("torch_geometric") - - -class SAGEConv(BaseConv): - r"""The GraphSAGE operator from the `"Inductive Representation Learning on - Large Graphs" `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + \mathbf{W}_2 \cdot - \mathrm{mean}_{j \in \mathcal{N(i)}} \mathbf{x}_j - - If :obj:`project = True`, then :math:`\mathbf{x}_j` will first get - projected via - - .. math:: - \mathbf{x}_j \leftarrow \sigma ( \mathbf{W}_3 \mathbf{x}_j + - \mathbf{b}) - - as described in Eq. (3) of the paper. - - Args: - in_channels (int or tuple): Size of each input sample. 
A tuple - corresponds to the sizes of source and target dimensionalities. - out_channels (int): Size of each output sample. - aggr (str or Aggregation, optional): The aggregation scheme to use. - Choose from :obj:`"mean"`, :obj:`"sum"`, :obj:`"min"` or - :obj:`"max"`. (default: :obj:`"mean"`) - normalize (bool, optional): If set to :obj:`True`, output features - will be :math:`\ell_2`-normalized, *i.e.*, - :math:`\frac{\mathbf{h}_i^{k+1}} - {\| \mathbf{h}_i^{k+1} \|_2}`. - (default: :obj:`False`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add transformed root node features to the output. - (default: :obj:`True`) - project (bool, optional): If set to :obj:`True`, the layer will apply a - linear transformation followed by an activation function before - aggregation (as described in Eq. (3) of the paper). - (default: :obj:`False`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - aggr: str = "mean", - normalize: bool = False, - root_weight: bool = True, - project: bool = False, - bias: bool = True, - ): - super().__init__() - - if aggr not in ["mean", "sum", "min", "max"]: - raise ValueError( - f"Aggregation function must be chosen from 'mean'," - f" 'sum', 'min' or 'max', but got '{aggr}'." - ) - - self.in_channels = in_channels - self.out_channels = out_channels - self.aggr = aggr - self.normalize = normalize - self.root_weight = root_weight - self.project = project - - if isinstance(in_channels, int): - self.in_channels_src = self.in_channels_dst = in_channels - else: - self.in_channels_src, self.in_channels_dst = in_channels - - if self.project: - self.pre_lin = torch_geometric.nn.Linear( - self.in_channels_src, self.in_channels_src, bias=True - ) - - if self.root_weight: - self.lin = torch_geometric.nn.Linear( - self.in_channels_src + self.in_channels_dst, out_channels, bias=bias - ) - else: - self.lin = torch_geometric.nn.Linear( - self.in_channels_src, out_channels, bias=bias - ) - - self.reset_parameters() - - def reset_parameters(self): - if self.project: - self.pre_lin.reset_parameters() - self.lin.reset_parameters() - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - max_num_neighbors: Optional[int] = None, - ) -> torch.Tensor: - bipartite = isinstance(x, Tuple) - graph, _ = self.get_cugraph( - edge_index=edge_index, - bipartite=bipartite, - max_num_neighbors=max_num_neighbors, - ) - - if self.project: - if bipartite: - x = (self.pre_lin(x[0]).relu(), x[1]) - else: - x = self.pre_lin(x).relu() - - out = agg_concat_n2n(x, graph, self.aggr) - - if self.root_weight: - out = self.lin(out) - else: - out = self.lin(out[:, : self.in_channels_src]) - - if self.normalize: - out = torch.nn.functional.normalize(out, p=2.0, dim=-1) - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, aggr={self.aggr})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py deleted file mode 100644 index e184ee0..0000000 --- a/python/cugraph-pyg/cugraph_pyg/nn/conv/transformer_conv.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Optional, Tuple, Union - -from cugraph.utilities.utils import import_optional -from pylibcugraphops.pytorch.operators import mha_simple_n2n - -from .base import BaseConv, CSC - -torch = import_optional("torch") -nn = import_optional("torch.nn") -torch_geometric = import_optional("torch_geometric") - - -class TransformerConv(BaseConv): - r"""The graph transformer operator from the `"Masked Label Prediction: - Unified Message Passing Model for Semi-Supervised Classification" - `_ paper. - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \mathbf{W}_2 \mathbf{x}_{j}, - - where the attention coefficients :math:`\alpha_{i,j}` are computed via - multi-head dot product attention: - - .. math:: - \alpha_{i,j} = \textrm{softmax} \left( - \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} (\mathbf{W}_4\mathbf{x}_j)} - {\sqrt{d}} \right) - - Args: - in_channels (int or tuple): Size of each input sample, or :obj:`-1` to - derive the size from the first input(s) to the forward method. - A tuple corresponds to the sizes of source and target - dimensionalities. - out_channels (int): Size of each output sample. - heads (int, optional): Number of multi-head-attentions. - (default: :obj:`1`) - concat (bool, optional): If set to :obj:`False`, the multi-head - attentions are averaged instead of concatenated. - (default: :obj:`True`) - beta (bool, optional): If set, will combine aggregation and - skip information via - - .. math:: - \mathbf{x}^{\prime}_i = \beta_i \mathbf{W}_1 \mathbf{x}_i + - (1 - \beta_i) \underbrace{\left(\sum_{j \in \mathcal{N}(i)} - \alpha_{i,j} \mathbf{W}_2 \vec{x}_j \right)}_{=\mathbf{m}_i} - - with :math:`\beta_i = \textrm{sigmoid}(\mathbf{w}_5^{\top} - [ \mathbf{W}_1 \mathbf{x}_i, \mathbf{m}_i, \mathbf{W}_1 - \mathbf{x}_i - \mathbf{m}_i ])` (default: :obj:`False`) - edge_dim (int, optional): Edge feature dimensionality (in case - there are any). Edge features are added to the keys after - linear transformation, that is, prior to computing the - attention dot product. They are also added to final values - after the same linear transformation. The model is: - - .. math:: - \mathbf{x}^{\prime}_i = \mathbf{W}_1 \mathbf{x}_i + - \sum_{j \in \mathcal{N}(i)} \alpha_{i,j} \left( - \mathbf{W}_2 \mathbf{x}_{j} + \mathbf{W}_6 \mathbf{e}_{ij} - \right), - - where the attention coefficients :math:`\alpha_{i,j}` are now - computed via: - - .. math:: - \alpha_{i,j} = \textrm{softmax} \left( - \frac{(\mathbf{W}_3\mathbf{x}_i)^{\top} - (\mathbf{W}_4\mathbf{x}_j + \mathbf{W}_6 \mathbf{e}_{ij})} - {\sqrt{d}} \right) - - (default :obj:`None`) - bias (bool, optional): If set to :obj:`False`, the layer will not learn - an additive bias. (default: :obj:`True`) - root_weight (bool, optional): If set to :obj:`False`, the layer will - not add the transformed root node features to the output and the - option :attr:`beta` is set to :obj:`False`. 
(default: :obj:`True`) - """ - - def __init__( - self, - in_channels: Union[int, Tuple[int, int]], - out_channels: int, - heads: int = 1, - concat: bool = True, - beta: bool = False, - edge_dim: Optional[int] = None, - bias: bool = True, - root_weight: bool = True, - ): - super().__init__() - - self.in_channels = in_channels - self.out_channels = out_channels - self.heads = heads - self.beta = beta and root_weight - self.root_weight = root_weight - self.concat = concat - self.edge_dim = edge_dim - - if isinstance(in_channels, int): - in_channels = (in_channels, in_channels) - - Linear = torch_geometric.nn.Linear - - self.lin_key = Linear(in_channels[0], heads * out_channels) - self.lin_query = Linear(in_channels[1], heads * out_channels) - self.lin_value = Linear(in_channels[0], heads * out_channels) - if edge_dim is not None: - self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False) - else: - self.lin_edge = self.register_parameter("lin_edge", None) - - if concat: - self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias) - if self.beta: - self.lin_beta = Linear(3 * heads * out_channels, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - else: - self.lin_skip = Linear(in_channels[1], out_channels, bias=bias) - if self.beta: - self.lin_beta = Linear(3 * out_channels, 1, bias=False) - else: - self.lin_beta = self.register_parameter("lin_beta", None) - - self.reset_parameters() - - def reset_parameters(self): - self.lin_key.reset_parameters() - self.lin_query.reset_parameters() - self.lin_value.reset_parameters() - if self.lin_edge is not None: - self.lin_edge.reset_parameters() - self.lin_skip.reset_parameters() - if self.lin_beta is not None: - self.lin_beta.reset_parameters() - - def forward( - self, - x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], - edge_index: Union[torch_geometric.EdgeIndex, CSC], - edge_attr: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""Runs the forward pass of the module. - - Args: - x (torch.Tensor or tuple): The node features. Can be a tuple of - tensors denoting source and destination node features. - edge_index (EdgeIndex or CSC): The edge indices. - edge_attr: (torch.Tensor, optional) The edge features. - """ - bipartite = True - graph, perm = self.get_cugraph(edge_index=edge_index, bipartite=bipartite) - - if isinstance(x, torch.Tensor): - x = (x, x) - - query = self.lin_query(x[1]) - key = self.lin_key(x[0]) - value = self.lin_value(x[0]) - - if edge_attr is not None: - if self.lin_edge is None: - raise RuntimeError( - f"{self.__class__.__name__}.edge_dim must be set to accept " - f"edge features." 
- ) - if perm is not None: - edge_attr = edge_attr[perm] - edge_attr = self.lin_edge(edge_attr) - - out = mha_simple_n2n( - key, - query, - value, - graph, - self.heads, - self.concat, - edge_emb=edge_attr, - norm_by_dim=True, - score_bias=None, - ) - - if self.root_weight: - x_r = self.lin_skip(x[1]) - if self.lin_beta is not None: - beta = self.lin_beta(torch.cat([out, x_r, out - x_r], dim=-1)) - beta = beta.sigmoid() - out = beta * x_r + (1 - beta) * out - else: - out = out + x_r - - return out - - def __repr__(self) -> str: - return ( - f"{self.__class__.__name__}({self.in_channels}, " - f"{self.out_channels}, heads={self.heads})" - ) diff --git a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py index 7b2f306..b42fb48 100644 --- a/python/cugraph-pyg/cugraph_pyg/tests/conftest.py +++ b/python/cugraph-pyg/cugraph_pyg/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -43,12 +43,6 @@ gpubenchmark = pytest_benchmark.plugin.benchmark -def pytest_ignore_collect(collection_path, config): - """Return True to prevent considering this path for collection.""" - if "nn" in collection_path.name: - return True - - @pytest.fixture(scope="module") def dask_client(): dask_scheduler_file = os.environ.get("SCHEDULER_FILE") diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py deleted file mode 100644 index 92d216f..0000000 --- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gat_conv.py +++ /dev/null @@ -1,141 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
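[Editor's note — illustrative aside, not part of the patch: the beta-gated skip connection from TransformerConv.forward above, in isolation. Here out stands for the aggregated attention output, x_r for the transformed root features, and lin_beta mirrors the 3*d -> 1 gate of the deleted layer; the sizes are assumptions.]

import torch
import torch.nn as nn

d = 8
out = torch.randn(5, d)                    # aggregated messages
x_r = torch.randn(5, d)                    # skip (root) features
lin_beta = nn.Linear(3 * d, 1, bias=False)

beta = torch.sigmoid(lin_beta(torch.cat([out, x_r, out - x_r], dim=-1)))
gated = beta * x_r + (1 - beta) * out      # per-node blend of skip and messages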
- -import pytest - -from cugraph_pyg.nn import GATConv as CuGraphGATConv -from cugraph_pyg.utils.imports import package_available - -ATOL = 1e-6 - - -@pytest.mark.skipif( - package_available("torch_geometric<2.5"), reason="Test requires pyg>=2.5" -) -@pytest.mark.parametrize("use_edge_index", [True, False]) -@pytest.mark.parametrize("bias", [True, False]) -@pytest.mark.parametrize("bipartite", [True, False]) -@pytest.mark.parametrize("concat", [True, False]) -@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16]) -@pytest.mark.parametrize("max_num_neighbors", [8, None]) -@pytest.mark.parametrize("use_edge_attr", [True, False]) -@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"]) -@pytest.mark.sg -def test_gat_conv_equality( - use_edge_index, - bias, - bipartite, - concat, - heads, - max_num_neighbors, - use_edge_attr, - graph, - request, -): - import torch - from torch_geometric import EdgeIndex - from torch_geometric.nn import GATConv - - torch.manual_seed(12345) - edge_index, size = request.getfixturevalue(graph) - edge_index = edge_index.cuda() - - if bipartite: - in_channels = (5, 3) - x = ( - torch.rand(size[0], in_channels[0]).cuda(), - torch.rand(size[1], in_channels[1]).cuda(), - ) - else: - in_channels = 5 - x = torch.rand(size[0], in_channels).cuda() - out_channels = 2 - - if use_edge_attr: - edge_dim = 3 - edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda() - else: - edge_dim = edge_attr = None - - if use_edge_index: - csc = EdgeIndex(edge_index, sparse_size=size) - else: - if use_edge_attr: - csc, edge_attr_perm = CuGraphGATConv.to_csc( - edge_index, size, edge_attr=edge_attr - ) - else: - csc = CuGraphGATConv.to_csc(edge_index, size) - edge_attr_perm = None - - kwargs = dict(bias=bias, concat=concat, edge_dim=edge_dim) - - conv1 = GATConv( - in_channels, out_channels, heads, add_self_loops=False, **kwargs - ).cuda() - conv2 = CuGraphGATConv(in_channels, out_channels, heads, **kwargs).cuda() - - out_dim = heads * out_channels - with torch.no_grad(): - if bipartite: - conv2.lin_src.weight.copy_(conv1.lin_src.weight) - conv2.lin_dst.weight.copy_(conv1.lin_dst.weight) - else: - conv2.lin.weight.copy_(conv1.lin.weight) - - conv2.att[:out_dim].copy_(conv1.att_src.flatten()) - conv2.att[out_dim : 2 * out_dim].copy_(conv1.att_dst.flatten()) - if use_edge_attr: - conv2.att[2 * out_dim :].copy_(conv1.att_edge.flatten()) - conv2.lin_edge.weight.copy_(conv1.lin_edge.weight) - - out1 = conv1(x, edge_index, edge_attr=edge_attr) - if use_edge_index: - out2 = conv2(x, csc, edge_attr=edge_attr, max_num_neighbors=max_num_neighbors) - else: - out2 = conv2( - x, csc, edge_attr=edge_attr_perm, max_num_neighbors=max_num_neighbors - ) - assert torch.allclose(out1, out2, atol=ATOL) - - grad_output = torch.rand_like(out1) - out1.backward(grad_output) - out2.backward(grad_output) - - if bipartite: - assert torch.allclose( - conv1.lin_src.weight.grad, conv2.lin_src.weight.grad, atol=ATOL - ) - assert torch.allclose( - conv1.lin_dst.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL - ) - else: - assert torch.allclose(conv1.lin.weight.grad, conv2.lin.weight.grad, atol=ATOL) - - assert torch.allclose( - conv1.att_src.grad.flatten(), conv2.att.grad[:out_dim], atol=ATOL - ) - assert torch.allclose( - conv1.att_dst.grad.flatten(), conv2.att.grad[out_dim : 2 * out_dim], atol=ATOL - ) - - if use_edge_attr: - assert torch.allclose( - conv1.att_edge.grad.flatten(), conv2.att.grad[2 * out_dim :], atol=ATOL - ) - assert torch.allclose( - conv1.lin_edge.weight.grad, 
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
deleted file mode 100644
index 2e22192..0000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_gatv2_conv.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import GATv2Conv as CuGraphGATv2Conv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("concat", [True, False])
-@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
-@pytest.mark.parametrize("use_edge_attr", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_gatv2_conv_equality(
-    use_edge_index, bipartite, concat, heads, use_edge_attr, graph, request
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import GATv2Conv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if bipartite:
-        in_channels = (5, 3)
-        x = (
-            torch.rand(size[0], in_channels[0]).cuda(),
-            torch.rand(size[1], in_channels[1]).cuda(),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels).cuda()
-    out_channels = 2
-
-    if use_edge_attr:
-        edge_dim = 3
-        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
-    else:
-        edge_dim = edge_attr = None
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        if use_edge_attr:
-            csc, edge_attr_perm = CuGraphGATv2Conv.to_csc(
-                edge_index, size, edge_attr=edge_attr
-            )
-        else:
-            csc = CuGraphGATv2Conv.to_csc(edge_index, size)
-            edge_attr_perm = None
-
-    kwargs = dict(bias=False, concat=concat, edge_dim=edge_dim)
-
-    conv1 = GATv2Conv(
-        in_channels, out_channels, heads, add_self_loops=False, **kwargs
-    ).cuda()
-    conv2 = CuGraphGATv2Conv(in_channels, out_channels, heads, **kwargs).cuda()
-
-    with torch.no_grad():
-        conv2.lin_src.weight.copy_(conv1.lin_l.weight)
-        conv2.lin_dst.weight.copy_(conv1.lin_r.weight)
-        conv2.att.copy_(conv1.att.flatten())
-        if use_edge_attr:
-            conv2.lin_edge.weight.copy_(conv1.lin_edge.weight)
-
-    out1 = conv1(x, edge_index, edge_attr=edge_attr)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_attr=edge_attr)
-    else:
-        out2 = conv2(x, csc, edge_attr=edge_attr_perm)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_output = torch.rand_like(out1)
-    out1.backward(grad_output)
-    out2.backward(grad_output)
-
-    assert torch.allclose(conv1.lin_l.weight.grad, conv2.lin_src.weight.grad, atol=ATOL)
-    assert torch.allclose(conv1.lin_r.weight.grad, conv2.lin_dst.weight.grad, atol=ATOL)
-
-    assert torch.allclose(conv1.att.grad.flatten(), conv2.att.grad, atol=ATOL)
-
-    if use_edge_attr:
-        assert torch.allclose(
-            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL
-        )
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
deleted file mode 100644
index f182869..0000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_hetero_gat_conv.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import HeteroGATConv as CuGraphHeteroGATConv
-from cugraph_pyg.utils.imports import package_available
-
-ATOL = 1e-6
-
-
-@pytest.mark.cugraph_ops
-@pytest.mark.skipif(
-    package_available("torch_geometric<2.4"), reason="Test requires pyg>=2.4"
-)
-@pytest.mark.parametrize("heads", [1, 3, 10])
-@pytest.mark.parametrize("aggr", ["sum", "mean"])
-@pytest.mark.sg
-def test_hetero_gat_conv_equality(sample_pyg_hetero_data, aggr, heads):
-    import torch
-    from torch_geometric.data import HeteroData
-    from torch_geometric.nn import HeteroConv, GATConv
-
-    device = torch.device("cuda")
-    data = HeteroData(sample_pyg_hetero_data).to(device)
-
-    in_channels_dict = {k: v.size(1) for k, v in data.x_dict.items()}
-    out_channels = 2
-
-    convs_dict = {}
-    kwargs1 = dict(heads=heads, add_self_loops=False, bias=False)
-    for edge_type in data.edge_types:
-        src_t, _, dst_t = edge_type
-        in_channels_src, in_channels_dst = data.x_dict[src_t].size(-1), data.x_dict[
-            dst_t
-        ].size(-1)
-        if src_t == dst_t:
-            convs_dict[edge_type] = GATConv(in_channels_src, out_channels, **kwargs1)
-        else:
-            convs_dict[edge_type] = GATConv(
-                (in_channels_src, in_channels_dst), out_channels, **kwargs1
-            )
-
-    conv1 = HeteroConv(convs_dict, aggr=aggr).to(device)
-    kwargs2 = dict(
-        heads=heads,
-        aggr=aggr,
-        node_types=data.node_types,
-        edge_types=data.edge_types,
-        bias=False,
-    )
-    conv2 = CuGraphHeteroGATConv(in_channels_dict, out_channels, **kwargs2).to(device)
-
-    # copy over linear and attention weights
-    w_src, w_dst = conv2.split_tensors(conv2.lin_weights, dim=0)
-    with torch.no_grad():
-        for edge_type in conv2.edge_types:
-            src_t, _, dst_t = edge_type
-            if src_t == dst_t:
-                w_src[edge_type].copy_(conv1.convs[edge_type].lin.weight)
-            else:
-                w_src[edge_type].copy_(conv1.convs[edge_type].lin_src.weight)
-                if w_dst[edge_type] is not None:
-                    w_dst[edge_type].copy_(conv1.convs[edge_type].lin_dst.weight)
-
-            conv2.attn_weights[edge_type][: heads * out_channels].copy_(
-                conv1.convs[edge_type].att_src.flatten()
-            )
-            conv2.attn_weights[edge_type][heads * out_channels :].copy_(
-                conv1.convs[edge_type].att_dst.flatten()
-            )
-
-    out1 = conv1(data.x_dict, data.edge_index_dict)
-    out2 = conv2(data.x_dict, data.edge_index_dict)
-
-    for node_type in data.node_types:
-        assert torch.allclose(out1[node_type], out2[node_type], atol=ATOL)
-
-    loss1 = 0
-    loss2 = 0
-    for node_type in data.node_types:
-        loss1 += out1[node_type].mean()
-        loss2 += out2[node_type].mean()
-
-    loss1.backward()
-    loss2.backward()
-
-    # check gradient w.r.t attention weights
-    out_dim = heads * out_channels
-    for edge_type in conv2.edge_types:
-        assert torch.allclose(
-            conv1.convs[edge_type].att_src.grad.flatten(),
-            conv2.attn_weights[edge_type].grad[:out_dim],
-            atol=ATOL,
-        )
-        assert torch.allclose(
-            conv1.convs[edge_type].att_dst.grad.flatten(),
-            conv2.attn_weights[edge_type].grad[out_dim:],
-            atol=ATOL,
-        )
-
-    # check gradient w.r.t linear weights
-    grad_lin_weights_ref = dict.fromkeys(out1.keys())
-    for node_t, (rels_as_src, rels_as_dst) in conv2.relations_per_ntype.items():
-        grad_list = []
-        for rel_t in rels_as_src:
-            src_type, _, dst_type = rel_t
-            if src_type == dst_type:
-                grad_list.append(conv1.convs[rel_t].lin.weight.grad.clone())
-            else:
-                grad_list.append(conv1.convs[rel_t].lin_src.weight.grad.clone())
-        for rel_t in rels_as_dst:
-            grad_list.append(conv1.convs[rel_t].lin_dst.weight.grad.clone())
-        assert len(grad_list) > 0
-        grad_lin_weights_ref[node_t] = torch.vstack(grad_list)
-
-    for node_type in conv2.lin_weights:
-        assert torch.allclose(
-            grad_lin_weights_ref[node_type],
-            conv2.lin_weights[node_type].grad,
-            atol=ATOL,
-        )
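The hetero test above rebuilds its reference gradients by row-stacking the per-relation weight gradients for each node type. A sketch of that stacking with hypothetical shapes; the (out_dim, in_channels) layout follows the upstream GATConv convention, and only the torch.vstack comparison comes from the test:

    import torch

    heads, out_channels, in_channels = 2, 2, 4
    out_dim = heads * out_channels

    # One upstream GATConv weight per relation the node type participates in
    # (hypothetical: one relation as source, one as destination).
    w_as_src = torch.rand(out_dim, in_channels)
    w_as_dst = torch.rand(out_dim, in_channels)

    # The fused layer keeps a single row-stacked weight per node type, which
    # is what the deleted test compares against via torch.vstack(grad_list).
    w_fused = torch.vstack([w_as_src, w_as_dst])
    assert w_fused.shape == (2 * out_dim, in_channels)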
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
deleted file mode 100644
index 8b06cb2..0000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_rgcn_conv.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import RGCNConv as CuGraphRGCNConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("aggr", ["add", "sum", "mean"])
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("max_num_neighbors", [8, None])
-@pytest.mark.parametrize("num_bases", [1, 2, None])
-@pytest.mark.parametrize("root_weight", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_rgcn_conv_equality(
-    use_edge_index,
-    aggr,
-    bias,
-    max_num_neighbors,
-    num_bases,
-    root_weight,
-    graph,
-    request,
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import FastRGCNConv as RGCNConv
-
-    torch.manual_seed(12345)
-    in_channels, out_channels, num_relations = (4, 2, 3)
-    kwargs = dict(aggr=aggr, bias=bias, num_bases=num_bases, root_weight=root_weight)
-
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-    edge_type = torch.randint(num_relations, (edge_index.size(1),)).cuda()
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        csc, edge_type_perm = CuGraphRGCNConv.to_csc(edge_index, size, edge_type)
-
-    x = torch.rand(size[0], in_channels, device="cuda")
-
-    conv1 = RGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda()
-    conv2 = CuGraphRGCNConv(in_channels, out_channels, num_relations, **kwargs).cuda()
-
-    with torch.no_grad():
-        if root_weight:
-            conv2.weight[:-1].copy_(conv1.weight)
-            conv2.weight[-1].copy_(conv1.root)
-        else:
-            conv2.weight.copy_(conv1.weight)
-        if num_bases is not None:
-            conv2.comp.copy_(conv1.comp)
-
-    out1 = conv1(x, edge_index, edge_type)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_type)
-    else:
-        out2 = conv2(x, csc, edge_type_perm, max_num_neighbors=max_num_neighbors)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.rand_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-
-    if root_weight:
-        assert torch.allclose(conv1.weight.grad, conv2.weight.grad[:-1], atol=ATOL)
-        assert torch.allclose(conv1.root.grad, conv2.weight.grad[-1], atol=ATOL)
-    else:
-        assert torch.allclose(conv1.weight.grad, conv2.weight.grad, atol=ATOL)
-
-    if num_bases is not None:
-        assert torch.allclose(conv1.comp.grad, conv2.comp.grad, atol=ATOL)
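The copies into weight[:-1] and weight[-1] in the RGCN test above imply the fused layout: the cugraph-ops layer appended the root transform as one extra trailing relation. A small pure-PyTorch sketch under that inferred layout (shapes follow the upstream FastRGCNConv convention):

    import torch

    num_relations, in_channels, out_channels = 3, 4, 2

    # Upstream FastRGCNConv: per-relation weights plus a separate root weight.
    weight = torch.rand(num_relations, in_channels, out_channels)
    root = torch.rand(in_channels, out_channels)

    # Inferred cugraph-ops layout: root appended as one extra trailing
    # relation, hence conv2.weight[:-1] and conv2.weight[-1] in the test.
    fused = torch.cat([weight, root.unsqueeze(0)], dim=0)

    assert torch.equal(fused[:-1], weight)
    assert torch.equal(fused[-1], root)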
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
deleted file mode 100644
index 878ceff..0000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_sage_conv.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import SAGEConv as CuGraphSAGEConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("aggr", ["sum", "mean", "min", "max"])
-@pytest.mark.parametrize("bias", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("max_num_neighbors", [8, None])
-@pytest.mark.parametrize("normalize", [True, False])
-@pytest.mark.parametrize("root_weight", [True, False])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_sage_conv_equality(
-    use_edge_index,
-    aggr,
-    bias,
-    bipartite,
-    max_num_neighbors,
-    normalize,
-    root_weight,
-    graph,
-    request,
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import SAGEConv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        csc = CuGraphSAGEConv.to_csc(edge_index, size)
-
-    if bipartite:
-        in_channels = (7, 3)
-        x = (
-            torch.rand(size[0], in_channels[0]).cuda(),
-            torch.rand(size[1], in_channels[1]).cuda(),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels).cuda()
-    out_channels = 4
-
-    kwargs = dict(aggr=aggr, bias=bias, normalize=normalize, root_weight=root_weight)
-
-    conv1 = SAGEConv(in_channels, out_channels, **kwargs).cuda()
-    conv2 = CuGraphSAGEConv(in_channels, out_channels, **kwargs).cuda()
-
-    in_channels_src = conv2.in_channels_src
-    with torch.no_grad():
-        conv2.lin.weight[:, :in_channels_src].copy_(conv1.lin_l.weight)
-        if root_weight:
-            conv2.lin.weight[:, in_channels_src:].copy_(conv1.lin_r.weight)
-        if bias:
-            conv2.lin.bias.copy_(conv1.lin_l.bias)
-
-    out1 = conv1(x, edge_index)
-    out2 = conv2(x, csc, max_num_neighbors=max_num_neighbors)
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_out = torch.rand_like(out1)
-    out1.backward(grad_out)
-    out2.backward(grad_out)
-
-    assert torch.allclose(
-        conv1.lin_l.weight.grad,
-        conv2.lin.weight.grad[:, :in_channels_src],
-        atol=ATOL,
-    )
-
-    if root_weight:
-        assert torch.allclose(
-            conv1.lin_r.weight.grad,
-            conv2.lin.weight.grad[:, in_channels_src:],
-            atol=ATOL,
-        )
-
-    if bias:
-        assert torch.allclose(
-            conv1.lin_l.bias.grad,
-            conv2.lin.bias.grad,
-            atol=ATOL,
-        )
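The SAGE test above likewise implies a fused linear layer whose weight columns concatenate the neighbor and root transforms. A sketch of that column layout; the slice boundaries come from the test, the shapes are illustrative:

    import torch

    in_channels, out_channels = 5, 4

    # Upstream SAGEConv: lin_l acts on aggregated neighbor features, lin_r on
    # the root (self) features; torch.nn.Linear stores weights as (out, in).
    lin_l_weight = torch.rand(out_channels, in_channels)
    lin_r_weight = torch.rand(out_channels, in_channels)

    # Fused layout implied by the test: columns are [neighbor | root].
    fused = torch.cat([lin_l_weight, lin_r_weight], dim=1)

    assert torch.equal(fused[:, :in_channels], lin_l_weight)
    assert torch.equal(fused[:, in_channels:], lin_r_weight)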
diff --git a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py b/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
deleted file mode 100644
index d207a4d..0000000
--- a/python/cugraph-pyg/cugraph_pyg/tests/nn/test_transformer_conv.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from cugraph_pyg.nn import TransformerConv as CuGraphTransformerConv
-
-ATOL = 1e-6
-
-
-@pytest.mark.parametrize("use_edge_index", [True, False])
-@pytest.mark.parametrize("use_edge_attr", [True, False])
-@pytest.mark.parametrize("bipartite", [True, False])
-@pytest.mark.parametrize("concat", [True, False])
-@pytest.mark.parametrize("heads", [1, 2, 3, 5, 10, 16])
-@pytest.mark.parametrize("graph", ["basic_pyg_graph_1", "basic_pyg_graph_2"])
-@pytest.mark.sg
-def test_transformer_conv_equality(
-    use_edge_index, use_edge_attr, bipartite, concat, heads, graph, request
-):
-    pytest.importorskip("torch_geometric", reason="PyG not available")
-    import torch
-    from torch_geometric import EdgeIndex
-    from torch_geometric.nn import TransformerConv
-
-    torch.manual_seed(12345)
-    edge_index, size = request.getfixturevalue(graph)
-    edge_index = edge_index.cuda()
-
-    if bipartite:
-        in_channels = (5, 3)
-        x = (
-            torch.rand(size[0], in_channels[0], device="cuda"),
-            torch.rand(size[1], in_channels[1], device="cuda"),
-        )
-    else:
-        in_channels = 5
-        x = torch.rand(size[0], in_channels, device="cuda")
-    out_channels = 2
-
-    if use_edge_attr:
-        edge_dim = 3
-        edge_attr = torch.rand(edge_index.size(1), edge_dim).cuda()
-    else:
-        edge_dim = edge_attr = None
-
-    if use_edge_index:
-        csc = EdgeIndex(edge_index, sparse_size=size)
-    else:
-        if use_edge_attr:
-            csc, edge_attr_perm = CuGraphTransformerConv.to_csc(
-                edge_index, size, edge_attr=edge_attr
-            )
-        else:
-            csc = CuGraphTransformerConv.to_csc(edge_index, size)
-            edge_attr_perm = None
-
-    kwargs = dict(concat=concat, bias=False, edge_dim=edge_dim, root_weight=False)
-
-    conv1 = TransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
-    conv2 = CuGraphTransformerConv(in_channels, out_channels, heads, **kwargs).cuda()
-
-    with torch.no_grad():
-        conv2.lin_query.weight.copy_(conv1.lin_query.weight)
-        conv2.lin_key.weight.copy_(conv1.lin_key.weight)
-        conv2.lin_value.weight.copy_(conv1.lin_value.weight)
-        conv2.lin_query.bias.copy_(conv1.lin_query.bias)
-        conv2.lin_key.bias.copy_(conv1.lin_key.bias)
-        conv2.lin_value.bias.copy_(conv1.lin_value.bias)
-        if use_edge_attr:
-            conv2.lin_edge.weight.copy_(conv1.lin_edge.weight)
-
-    out1 = conv1(x, edge_index, edge_attr=edge_attr)
-    if use_edge_index:
-        out2 = conv2(x, csc, edge_attr=edge_attr)
-    else:
-        out2 = conv2(x, csc, edge_attr=edge_attr_perm)
-
-    assert torch.allclose(out1, out2, atol=ATOL)
-
-    grad_output = torch.rand_like(out1)
-    out1.backward(grad_output)
-    out2.backward(grad_output)
-
-    assert torch.allclose(
-        conv1.lin_query.weight.grad, conv2.lin_query.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_key.weight.grad, conv2.lin_key.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_value.weight.grad, conv2.lin_value.weight.grad, atol=ATOL
-    )
-    assert torch.allclose(
-        conv1.lin_query.bias.grad, conv2.lin_query.bias.grad, atol=ATOL
-    )
-    assert torch.allclose(conv1.lin_key.bias.grad, conv2.lin_key.bias.grad, atol=ATOL)
-    assert torch.allclose(
-        conv1.lin_value.bias.grad, conv2.lin_value.bias.grad, atol=ATOL
-    )
-
-    if use_edge_attr:
-        assert torch.allclose(
-            conv1.lin_edge.weight.grad, conv2.lin_edge.weight.grad, atol=ATOL
-        )
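With the cugraph-ops-backed layers and their tests removed, the upstream torch_geometric layers that served as the references in the equality tests above remain available. A minimal usage sketch on a hypothetical toy graph (requires a CUDA-enabled torch and torch_geometric; the basic_pyg_graph_* fixtures are not reproduced here):

    import torch
    from torch_geometric.nn import GATConv

    torch.manual_seed(12345)

    # Hypothetical toy graph standing in for the test fixtures:
    # 4 nodes, 4 directed edges.
    edge_index = torch.tensor([[0, 1, 2, 3], [1, 2, 3, 0]]).cuda()
    x = torch.rand(4, 5).cuda()

    conv = GATConv(5, 2, heads=2, add_self_loops=False).cuda()
    out = conv(x, edge_index)            # (num_nodes, heads * out_channels) == (4, 4)
    out.backward(torch.rand_like(out))   # gradients flow as in the deleted tests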