Fix sparse utilities
viclafargue committed Jan 14, 2025
1 parent 1b62c41 commit 9c85366
Showing 18 changed files with 94 additions and 94 deletions.
2 changes: 1 addition & 1 deletion cpp/include/raft/cluster/detail/connectivities.cuh
@@ -95,7 +95,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
   });

   raft::sparse::convert::sorted_coo_to_csr(
-    knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);
+    knn_graph_coo.rows(), (value_idx)knn_graph_coo.nnz, indptr.data(), m + 1, stream);

   // TODO: Wouldn't need to copy here if we could compute knn
   // graph directly on the device uvectors
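Note on the new casts at call sites like this one: `COO::nnz` is widened to `uint64_t` in this commit (see `coo.cuh` below), while `sorted_coo_to_csr` now deduces its offset type `outT` from both the `nnz` argument and the `row_ind` pointer. A minimal sketch of the deduction conflict the cast avoids, using toy declarations rather than the real headers:

```cpp
#include <cstdint>
#include <cuda_runtime.h>

// Toy stand-in for the signature this commit introduces; the real
// implementation (detail/csr.cuh) builds the CSR indptr on the GPU.
template <typename T, typename outT>
void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
{
  (void)rows; (void)nnz; (void)row_ind; (void)m; (void)stream;
}

void call_site(const int* rows, uint64_t nnz, int* indptr, int m, cudaStream_t stream)
{
  // sorted_coo_to_csr(rows, nnz, indptr, m + 1, stream);
  //   ^ would not compile: outT deduces as uint64_t from `nnz`
  //     but as int from `indptr`
  sorted_coo_to_csr(rows, (int)nnz, indptr, m + 1, stream);  // OK: outT = int
}
```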
4 changes: 2 additions & 2 deletions cpp/include/raft/cluster/detail/mst.cuh
@@ -98,7 +98,7 @@ void connect_knn_graph(

   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
   raft::sparse::convert::sorted_coo_to_csr(
-    connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream);
+    connected_edges.rows(), (value_idx)connected_edges.nnz, indptr2.data(), m + 1, stream);

   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
@@ -204,4 +204,4 @@ void build_sorted_mst(
   raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
 }

-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
10 changes: 5 additions & 5 deletions cpp/include/raft/sparse/convert/csr.cuh
@@ -52,8 +52,8 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   detail::sorted_coo_to_csr(rows, nnz, row_ind, m, stream);
 }
@@ -65,10 +65,10 @@ void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t s
  * @param row_ind: output row indices array
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(COO<T>* coo, outT* row_ind, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
+  detail::sorted_coo_to_csr(coo->rows(), (outT)coo->nnz, row_ind, coo->n_rows, stream);
 }

 /**
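For orientation, a minimal usage sketch of the retemplated first overload (buffer names are hypothetical; it mirrors the call sites in `connectivities.cuh` and `mst.cuh` above, which pass `m + 1` so the output holds `m + 1` offsets):

```cpp
#include <raft/sparse/convert/csr.cuh>
#include <rmm/device_uvector.hpp>

// rows: one (sorted) row id per nonzero; indptr: m + 1 CSR offsets.
// T = int is deduced from rows, outT = uint64_t from nnz and indptr.
void build_indptr(const int* rows,
                  uint64_t nnz,
                  rmm::device_uvector<uint64_t>& indptr,  // sized m + 1
                  int m,
                  cudaStream_t stream)
{
  raft::sparse::convert::sorted_coo_to_csr(rows, nnz, indptr.data(), m + 1, stream);
}
```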
8 changes: 4 additions & 4 deletions cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -84,18 +84,18 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   rmm::device_uvector<T> row_counts(m, stream);

-  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(T), stream));

   linalg::coo_degree(rows, nnz, row_counts.data(), stream);

   // create csr compressed row index from row counts
   thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<T> c_ind_d      = thrust::device_pointer_cast(row_ind);
+  thrust::device_ptr<outT> c_ind_d   = thrust::device_pointer_cast(row_ind);
   exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d);
 }
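The routine above is a two-step construction: count each row's nonzeros (`coo_degree`), then exclusive-scan the counts into offsets. A host-side analogue for illustration only (standard C++17, not part of RAFT):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Host analogue of sorted_coo_to_csr: per-row degree count + exclusive scan.
std::vector<uint64_t> coo_rows_to_indptr(const std::vector<int>& rows, int m)
{
  std::vector<uint64_t> counts(m, 0);
  for (int r : rows) counts[r]++;  // degree of each row (coo_degree)
  std::vector<uint64_t> indptr(m);
  std::exclusive_scan(counts.begin(), counts.end(), indptr.begin(), uint64_t{0});
  return indptr;  // rows = {0,0,1,3}, m = 4  ->  indptr = {0,2,3,3}
}
```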
15 changes: 7 additions & 8 deletions cpp/include/raft/sparse/detail/coo.cuh
@@ -52,7 +52,7 @@ class COO {
   rmm::device_uvector<T> vals_arr;

  public:
-  Index_Type nnz;
+  uint64_t nnz;
   Index_Type n_rows;
   Index_Type n_cols;
@@ -75,7 +75,7 @@
   COO(rmm::device_uvector<Index_Type>& rows,
       rmm::device_uvector<Index_Type>& cols,
       rmm::device_uvector<T>& vals,
-      Index_Type nnz,
+      uint64_t nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0)
     : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
@@ -90,7 +90,7 @@
    * @param init: initialize arrays with zeros
    */
   COO(cudaStream_t stream,
-      Index_Type nnz,
+      uint64_t nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0,
       bool init = true)
@@ -121,7 +121,7 @@
    */
   bool validate_size() const
   {
-    if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
+    if (this->nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
     return true;
   }
@@ -204,7 +204,7 @@
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(Index_Type nnz, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, 0, init, stream);
   }
@@ -216,7 +216,7 @@
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -229,8 +229,7 @@
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(
-    Index_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
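Two things are worth noting in this file. First, `nnz` and the `allocate` overloads now take `uint64_t`, so a COO can describe more than `INT_MAX` nonzeros while its row and column indices stay 32-bit. Second, since an unsigned `nnz` can never be negative, `validate_size` was tightened from `< 0` to `<= 0`, which now also reports zero-sized COOs as invalid. A construction sketch with illustrative sizes (it assumes `raft/sparse/coo.hpp` is the public header wrapping this detail file):

```cpp
#include <raft/sparse/coo.hpp>

void big_coo(cudaStream_t stream)
{
  // More nonzeros than a 32-bit int can count, with 32-bit indices.
  // Purely illustrative: this allocates roughly 24 GB of device buffers.
  uint64_t nnz = (uint64_t{1} << 31) + 1;
  raft::sparse::COO<float> coo(stream, nnz, /*n_rows=*/1 << 20, /*n_cols=*/1 << 20);
  // coo.nnz is uint64_t; coo.rows() and coo.cols() remain int*.
}
```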
6 changes: 3 additions & 3 deletions cpp/include/raft/sparse/detail/utils.h
@@ -103,10 +103,10 @@ void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_
   iota_fill_block_kernel<<<nrows, blockdim, 0, stream>>>(indices, ncols);
 }

-template <typename T>
-__device__ int get_stop_idx(T row, T m, T nnz, const T* ind)
+template <typename T, typename indT>
+__device__ indT get_stop_idx(T row, T m, indT nnz, const indT* ind)
 {
-  int stop_idx = 0;
+  indT stop_idx = 0;
   if (row < (m - 1))
     stop_idx = ind[row + 1];
   else
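A host-side restatement of this helper, for illustration (the real one is `__device__`): the end offset of row `row` is the next row's start, except for the last row, which is closed by `nnz` itself.

```cpp
#include <cstdint>

template <typename T, typename indT>
indT get_stop_idx_host(T row, T m, indT nnz, const indT* ind)
{
  return (row < m - 1) ? ind[row + 1] : nnz;
}

// ind = {0, 2, 3, 3}, m = 4, nnz = 6:
//   row 1 spans [ind[1], ind[2]) = [2, 3)
//   row 3 spans [ind[3], nnz)    = [3, 6)
```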
6 changes: 3 additions & 3 deletions cpp/include/raft/sparse/linalg/degree.cuh
@@ -34,7 +34,7 @@ namespace linalg {
  * @param stream: cuda stream to use
  */
 template <typename T = int>
-void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
+void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
 {
   detail::coo_degree<64, T>(rows, nnz, results, stream);
 }
@@ -66,7 +66,7 @@ void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
  */
 template <typename T>
 void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
 {
   detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
 }
@@ -120,4 +120,4 @@ void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft

-#endif
\ No newline at end of file
+#endif
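A short usage sketch of the widened public API (buffer names are hypothetical; `results` must be zero-initialized, since the underlying kernel accumulates with atomics):

```cpp
#include <raft/sparse/linalg/degree.cuh>

// Count nonzeros per row of a COO matrix whose nnz may exceed INT_MAX.
void row_degrees(const int* rows, uint64_t nnz, int* results, cudaStream_t stream)
{
  raft::sparse::linalg::coo_degree(rows, nnz, results, stream);
}
```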
20 changes: 10 additions & 10 deletions cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -39,10 +39,10 @@ namespace detail {
  * @param nnz the size of the rows array
  * @param results array to place results
  */
-template <int TPB_X = 64, typename T = int>
-RAFT_KERNEL coo_degree_kernel(const T* rows, int nnz, T* results)
+template <uint64_t TPB_X = 64, typename T = int>
+RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, T* results)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz) { atomicAdd(results + rows[row], (T)1); }
 }
@@ -54,8 +54,8 @@ RAFT_KERNEL coo_degree_kernel(const T* rows, int nnz, T* results)
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T = int>
-void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
+template <uint64_t TPB_X = 64, typename T = int>
+void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
@@ -71,11 +71,11 @@ RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* r
   if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }

-template <int TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T>
 RAFT_KERNEL coo_degree_scalar_kernel(
-  const int* rows, const T* vals, int nnz, T scalar, int* results)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
 }
@@ -90,9 +90,9 @@ RAFT_KERNEL coo_degree_scalar_kernel(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T>
 void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
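The thread-block template parameter `TPB_X` is widened alongside `nnz` because `raft::ceildiv` is, to my reading, templated on a single integer type, so both of its arguments must agree; mixing a `uint64_t` nnz with an `int` TPB_X would no longer deduce. The grid arithmetic itself, as a plain host sketch:

```cpp
#include <cstdint>

// Equivalent of raft::ceildiv(nnz, TPB_X) in the launch configuration above.
constexpr uint64_t ceildiv_u64(uint64_t a, uint64_t b) { return (a + b - 1) / b; }

// e.g. nnz = 3'000'000'000 entries at TPB_X = 64 threads per block:
static_assert(ceildiv_u64(3'000'000'000, 64) == 46'875'000, "blocks in grid");
```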
26 changes: 13 additions & 13 deletions cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -40,15 +40,15 @@ namespace sparse {
 namespace linalg {
 namespace detail {

-template <int TPB_X = 64, typename T>
+template <int TPB_X = 64, typename T, typename indT>
 RAFT_KERNEL csr_row_normalize_l1_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int* ia,  // csr row ex_scan (sorted by row)
+  const indT* ia,  // csr row ex_scan (sorted by row)
   const T* vals,
-  int nnz,  // array of values and number of non-zeros
-  int m,    // num rows in csr
+  indT nnz,  // array of values and number of non-zeros
+  int m,     // num rows in csr
   T* result)
 {  // output array
@@ -57,19 +57,19 @@ RAFT_KERNEL csr_row_normalize_l1_kernel(

   // sum all vals_arr for row and divide each val by sum
   if (row < m) {
-    int start_idx = ia[row];
-    int stop_idx  = 0;
+    indT start_idx = ia[row];
+    indT stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
       stop_idx = nnz;

     T sum = T(0.0);
-    for (int j = start_idx; j < stop_idx; j++) {
+    for (indT j = start_idx; j < stop_idx; j++) {
       sum = sum + fabs(vals[j]);
     }

-    for (int j = start_idx; j < stop_idx; j++) {
+    for (indT j = start_idx; j < stop_idx; j++) {
       if (sum != 0.0) {
         T val     = vals[j];
         result[j] = val / sum;
@@ -90,11 +90,11 @@ RAFT_KERNEL csr_row_normalize_l1_kernel(
  * @param result: l1 normalized data array
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+template <int TPB_X = 64, typename T, typename indT>
+void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,  // array of values and number of non-zeros
-                          int m,    // num rows in csr
+                          indT nnz,  // array of values and number of non-zeros
+                          int m,     // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
@@ -232,4 +232,4 @@ void rowNormCsrCaller(const IdxType* ia,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
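A worked example of what the L1 kernel computes, on a 2 x 3 CSR matrix with three stored values:

```cpp
// ia   = {0, 2}               // row offsets (m = 2); nnz = 3 closes row 1
// vals = {1.0f, -3.0f, 2.0f}
//
// row 0: sum = |1| + |-3| = 4  ->  result[0] = 0.25, result[1] = -0.75
// row 1: sum = |2|        = 2  ->  result[2] = 1.0
```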
22 changes: 11 additions & 11 deletions cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -48,39 +48,39 @@ namespace detail {
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
 template <int TPB_X = 128, typename T, typename Lambda>
-RAFT_KERNEL coo_symmetrize_kernel(int* row_ind,
+RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
                                   int* rows,
                                   int* cols,
                                   T* vals,
                                   int* orows,
                                   int* ocols,
                                   T* ovals,
                                   int n,
-                                  int cnnz,
+                                  uint64_t cnnz,
                                   Lambda reduction_op)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;

   if (row < n) {
-    int start_idx = row_ind[row];  // each thread processes one row
-    int stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
+    uint64_t start_idx = row_ind[row];  // each thread processes one row
+    uint64_t stop_idx  = get_stop_idx(row, n, cnnz, row_ind);

-    int row_nnz       = 0;
-    int out_start_idx = start_idx * 2;
+    int row_nnz            = 0;
+    uint64_t out_start_idx = start_idx * 2;

     for (int idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[idx + start_idx];
       int cur_col = cols[idx + start_idx];
       T cur_val   = vals[idx + start_idx];

-      int lookup_row = cur_col;
-      int t_start    = row_ind[lookup_row];  // Start at
-      int t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int lookup_row   = cur_col;
+      uint64_t t_start = row_ind[lookup_row];  // Start at
+      uint64_t t_stop  = get_stop_idx(lookup_row, n, cnnz, row_ind);

       T transpose = 0.0;

       bool found_match = false;
-      for (int t_idx = t_start; t_idx < t_stop; t_idx++) {
+      for (uint64_t t_idx = t_start; t_idx < t_stop; t_idx++) {
         // If we find a match, let's get out of the loop. We won't
         // need to modify the transposed value, since that will be
         // done in a different thread.
@@ -142,7 +142,7 @@ void coo_symmetrize(COO<T>* in,

   ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");

-  rmm::device_uvector<int> in_row_ind(in->n_rows, stream);
+  rmm::device_uvector<uint64_t> in_row_ind(in->n_rows, stream);

   convert::sorted_coo_to_csr(in, in_row_ind.data(), stream);
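The widening matters here in particular because the symmetrized output interleaves up to two entries per input entry, so `out_start_idx = start_idx * 2` can exceed 32 bits even when `start_idx` itself still fits. A numeric sketch:

```cpp
// start_idx     = 1'500'000'000   // fits in int32
// start_idx * 2 = 3'000'000'000   // overflows int32 (undefined behavior
//                                 // if computed in int); exact as uint64_t
```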
10 changes: 5 additions & 5 deletions cpp/include/raft/sparse/linalg/norm.cuh
@@ -36,11 +36,11 @@ namespace linalg {
  * @param result: l1 normalized data array
  * @param stream: cuda stream to use
  */
-template <typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+template <typename T, typename indT>
+void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,  // array of values and number of non-zeros
-                          int m,    // num rows in csr
+                          indT nnz,  // array of values and number of non-zeros
+                          int m,     // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
@@ -104,4 +104,4 @@ void rowNormCsr(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft

-#endif
\ No newline at end of file
+#endif
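A usage sketch of the retemplated public wrapper (names are hypothetical): with `indT` split out from the value type, the row offsets and `nnz` may now be 64-bit while the values stay `float`.

```cpp
#include <raft/sparse/linalg/norm.cuh>

// ia: m CSR row offsets (exclusive scan of row counts); nnz closes the
// last row; result receives the L1-normalized copy of vals.
void normalize_rows(const uint64_t* ia, const float* vals, uint64_t nnz,
                    int m, float* result, cudaStream_t stream)
{
  raft::sparse::linalg::csr_row_normalize_l1(ia, vals, nnz, m, result, stream);
}
```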
@@ -242,7 +242,7 @@ void perform_1nn(raft::resources const& handle,
   // the color components.
   auto colors_group_idxs = raft::make_device_vector<value_idx, value_idx>(handle, n_components + 1);
   raft::sparse::convert::sorted_coo_to_csr(
-    colors, n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);
+    colors, (value_idx)n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);

   auto group_idxs_view = raft::make_device_vector_view<const value_idx, value_idx>(
     colors_group_idxs.data_handle() + 1, n_components);