From 8f707407e28c41ad9995955c38322acd00939404 Mon Sep 17 00:00:00 2001 From: Gerald Paul Bowen Collom Date: Tue, 27 Jun 2023 23:06:19 -0700 Subject: [PATCH] Use config flag for locality-aware mpi --- src/parcsr_ls/par_cycle.c | 11 + src/parcsr_mv/_hypre_parcsr_mv.h | 16 ++ src/parcsr_mv/new_commpkg.c | 40 +++ src/parcsr_mv/par_csr_communication.c | 392 +++++++++++++++++++++++++- src/parcsr_mv/par_csr_communication.h | 16 ++ src/test/ij.c | 16 ++ src/utilities/_hypre_utilities.h | 11 + src/utilities/general.c | 5 + src/utilities/handle.h | 11 + 9 files changed, 514 insertions(+), 4 deletions(-) diff --git a/src/parcsr_ls/par_cycle.c b/src/parcsr_ls/par_cycle.c index 8819de400d..d5f32999e3 100644 --- a/src/parcsr_ls/par_cycle.c +++ b/src/parcsr_ls/par_cycle.c @@ -286,6 +286,17 @@ hypre_BoomerAMGCycle( void *amg_vdata, hypre_GpuProfilingPushRange(nvtx_name); while (Not_Finished) { +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (level >= hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())) + { + if (my_id == 0) { printf("LVL %d using node aware, th: %d\n", level, hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())); } + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 1; + } else + { + if (my_id == 0) { printf("LVL %d NOT using node aware\n", level); } + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 0; + } +#endif if (num_levels > 1) { local_size = hypre_VectorSize(hypre_ParVectorLocalVector(F_array[level])); diff --git a/src/parcsr_mv/_hypre_parcsr_mv.h b/src/parcsr_mv/_hypre_parcsr_mv.h index 56c61b3c75..d93a800f03 100644 --- a/src/parcsr_mv/_hypre_parcsr_mv.h +++ b/src/parcsr_mv/_hypre_parcsr_mv.h @@ -23,6 +23,10 @@ extern "C" { #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER #define HYPRE_PAR_CSR_COMMUNICATION_HEADER +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*-------------------------------------------------------------------------- * hypre_ParCSRCommPkg: * Structure containing information for doing communications @@ -59,6 +63,9 @@ typedef struct void *recv_data_buffer; HYPRE_Int num_requests; hypre_MPI_Request *requests; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Request *Xrequest; +#endif } hypre_ParCSRCommHandle; typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; @@ -66,6 +73,10 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; typedef struct _hypre_ParCSRCommPkg { MPI_Comm comm; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Comm *neighbor_comm; + MPIX_Comm *neighborT_comm; +#endif HYPRE_Int num_components; HYPRE_Int num_sends; HYPRE_Int *send_procs; @@ -75,6 +86,11 @@ typedef struct _hypre_ParCSRCommPkg HYPRE_Int num_recvs; HYPRE_Int *recv_procs; HYPRE_Int *recv_vec_starts; + HYPRE_Int use_neighbor; +#ifdef HYPRE_USING_NODE_AWARE_MPI + long *global_send_indices; + long *global_recv_indices; +#endif /* remote communication information */ hypre_MPI_Datatype *send_mpi_types; hypre_MPI_Datatype *recv_mpi_types; diff --git a/src/parcsr_mv/new_commpkg.c b/src/parcsr_mv/new_commpkg.c index b12c2112ec..a81f2f7d6d 100644 --- a/src/parcsr_mv/new_commpkg.c +++ b/src/parcsr_mv/new_commpkg.c @@ -11,6 +11,9 @@ *-----------------------------------------------------*/ #include "_hypre_parcsr_mv.h" +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include +#endif /* some debugging tools*/ #define mydebug 0 @@ -546,6 +549,43 @@ hypre_ParCSRCommPkgCreateApart num_sends, send_procs, send_map_starts, send_map_elmts, &comm_pkg); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (comm_pkg->use_neighbor) { + int *sendcounts = (int *)malloc(num_sends * sizeof(int)); + int 
*recvcounts = (int *)malloc(num_recvs * sizeof(int)); + for (int i = 0; i < num_sends; i++) { + sendcounts[i] = send_map_starts[i+1] - send_map_starts[i]; + } + for (int i = 0; i < num_recvs; i++) { + recvcounts[i] = recv_vec_starts[i+1] - recv_vec_starts[i]; + } + MPIX_Dist_graph_create_adjacent( comm, num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg), + recvcounts, + num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg), + sendcounts, + MPI_INFO_NULL, 0, &(comm_pkg->neighbor_comm)); + MPIX_Dist_graph_create_adjacent( comm, num_sends, hypre_ParCSRCommPkgSendProcs(comm_pkg), + sendcounts, + num_recvs, hypre_ParCSRCommPkgRecvProcs(comm_pkg), + recvcounts, + MPI_INFO_NULL, 0, &(comm_pkg->neighborT_comm)); + + HYPRE_Int num_send_elmts = send_map_starts[num_sends]; + comm_pkg->global_send_indices = hypre_CTAlloc(long, num_send_elmts, HYPRE_MEMORY_HOST); + for (int i = 0; i < num_sends; i++) { + for (int j = send_map_starts[i]; j < send_map_starts[i+1]; j++) { + comm_pkg->global_send_indices[j] = send_map_elmts[j] + first_col_diag; + } + } + HYPRE_Int num_recv_elmts = recv_vec_starts[num_recvs]; + comm_pkg->global_recv_indices = hypre_CTAlloc(long, num_recv_elmts, HYPRE_MEMORY_HOST); + for (int i = 0; i < num_recvs; i++) { + for (int j = recv_vec_starts[i]; j < recv_vec_starts[i+1]; j++) { + comm_pkg->global_recv_indices[j] = col_map_off_d[j]; + } + } + } +#endif return hypre_error_flag; } diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c index 70a355f1ba..5aa590fe65 100644 --- a/src/parcsr_mv/par_csr_communication.c +++ b/src/parcsr_mv/par_csr_communication.c @@ -7,6 +7,10 @@ #include "_hypre_parcsr_mv.h" +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*==========================================================================*/ #ifdef HYPRE_USING_PERSISTENT_COMM @@ -66,6 +70,23 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests; hypre_ParCSRCommHandleRequests(comm_handle) = requests; +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (comm_pkg->neighbor_comm == NULL) { + hypre_printf("Trying to communicate with a NULL communicator\n"); + hypre_assert(1 == 0); + } else if (!comm_pkg->use_neighbor) { + hypre_printf("Trying to use neighbor collectives without proper setup"); + hypre_assert(1 == 0); + } + MPIX_Request *Xrequest; + + HYPRE_Int *send_sizes = hypre_TAlloc(HYPRE_Int, num_sends, HYPRE_MEMORY_HOST); + HYPRE_Int *recv_sizes = hypre_TAlloc(HYPRE_Int, num_recvs, HYPRE_MEMORY_HOST); + + HYPRE_BigInt num_send_elmts = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends); + HYPRE_BigInt num_recv_elmts = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs); +#endif + void *send_buff = NULL, *recv_buff = NULL; switch (job_type) @@ -77,6 +98,52 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (hypre_HandleUsingNodeAwareMPI(hypre_handle()) == 0) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; 
++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + num_recvs + i ); + } + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //if (rank == 0) { hypre_printf("Standard init done\n"); } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_Complex *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_COMPLEX, + (HYPRE_Complex *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_COMPLEX, comm_pkg->neighbor_comm, + MPI_INFO_NULL, &Xrequest); + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //if (rank == 0) { hypre_printf("Node-aware init done\n"); } + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -93,6 +160,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, ip, 0, comm, requests + num_recvs + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_COMPLEX_TRANSPOSE: @@ -102,6 +170,47 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_Complex, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Complex *)recv_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, + ip, 0, comm, requests + num_sends + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_Complex *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_COMPLEX, + (HYPRE_Complex *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_COMPLEX, comm_pkg->neighborT_comm, + MPI_INFO_NULL, &Xrequest); + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //if (rank == 0) { hypre_printf("Node-aware transpose 
init done\n"); } + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -118,6 +227,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Complex *)send_buff + vec_start, vec_len, HYPRE_MPI_COMPLEX, ip, 0, comm, requests + num_sends + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_INT: @@ -127,6 +237,46 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + num_recvs + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_Int *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_INT, + (HYPRE_Int *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_INT, comm_pkg->neighbor_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -143,6 +293,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, ip, 0, comm, requests + num_recvs + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_INT_TRANSPOSE: @@ -152,6 +303,44 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_Int, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_Int *)recv_buff + vec_start, vec_len, HYPRE_MPI_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, 
HYPRE_MPI_INT, + ip, 0, comm, requests + num_sends + i ); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_Int *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_INT, + (HYPRE_Int *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_INT, comm_pkg->neighborT_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -168,6 +357,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_MPI_Send_init( (HYPRE_Int *)send_buff + vec_start, vec_len, HYPRE_MPI_INT, ip, 0, comm, requests + num_sends + i ); } +#endif break; case HYPRE_COMM_PKG_JOB_BIGINT: @@ -177,6 +367,48 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); recv_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_sends; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + num_recvs + i); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_locality_alltoallv_init( (HYPRE_BigInt *)send_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + comm_pkg->global_send_indices, + HYPRE_MPI_BIG_INT, + (HYPRE_BigInt *)recv_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + comm_pkg->global_recv_indices, + HYPRE_MPI_BIG_INT, comm_pkg->neighbor_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_recvs; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); @@ -195,6 +427,7 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MPI_BIG_INT, ip, 0, comm, requests + num_recvs + i); } +#endif break; case HYPRE_COMM_PKG_JOB_BIGINT_TRANSPOSE: @@ -204,6 +437,46 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_MEMORY_HOST); send_buff = hypre_TAlloc(HYPRE_BigInt, hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs), HYPRE_MEMORY_HOST); +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + for (i = 0; i < num_sends; 
++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Recv_init( (HYPRE_BigInt *)recv_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + i ); + } + for (i = 0; i < num_recvs; ++i) + { + HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); + HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); + HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; + hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, + HYPRE_MPI_BIG_INT, + ip, 0, comm, requests + num_sends + i); + } + } else { + for (i = 0; i < num_recvs; ++i) + { + recv_sizes[i] = (hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i)); + } + for (i = 0; i < num_sends; ++i) + { + send_sizes[i] = (hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - + hypre_ParCSRCommPkgSendMapStart(comm_pkg, i)); + } + MPIX_Neighbor_part_locality_alltoallv_init( (HYPRE_BigInt *)send_buff, recv_sizes, + hypre_ParCSRCommPkgRecvVecStarts(comm_pkg), + HYPRE_MPI_BIG_INT, + (HYPRE_BigInt *)recv_buff, send_sizes, + hypre_ParCSRCommPkgSendMapStarts(comm_pkg), + HYPRE_MPI_BIG_INT, comm_pkg->neighborT_comm, + 0, &Xrequest); + } +#else for (i = 0; i < num_sends; ++i) { HYPRE_Int ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i); @@ -218,11 +491,11 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm HYPRE_Int ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i); HYPRE_Int vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i); HYPRE_Int vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start; - hypre_MPI_Send_init( (HYPRE_BigInt *)send_buff + (HYPRE_BigInt)vec_start, vec_len, HYPRE_MPI_BIG_INT, ip, 0, comm, requests + num_sends + i); } +#endif break; default: hypre_assert(1 == 0); @@ -234,6 +507,14 @@ hypre_ParCSRPersistentCommHandleCreate( HYPRE_Int job, hypre_ParCSRCommPkg *comm hypre_ParCSRCommHandleNumSendBytes(comm_handle) = num_bytes_send; hypre_ParCSRCommHandleNumRecvBytes(comm_handle) = num_bytes_recv; + +#ifdef HYPRE_USING_NODE_AWARE_MPI + comm_handle->Xrequest = Xrequest; + + hypre_TFree(send_sizes, HYPRE_MEMORY_HOST); + hypre_TFree(recv_sizes, HYPRE_MEMORY_HOST); +#endif + return ( comm_handle ); } @@ -266,8 +547,14 @@ hypre_ParCSRPersistentCommHandleDestroy( hypre_ParCSRPersistentCommHandle *comm_ { hypre_TFree(hypre_ParCSRCommHandleSendDataBuffer(comm_handle), HYPRE_MEMORY_HOST); hypre_TFree(hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), HYPRE_MEMORY_HOST); - hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST); - + if (comm_handle->requests) { + hypre_TFree(comm_handle->requests, HYPRE_MEMORY_HOST); + } +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (comm_handle->Xrequest && hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + MPIX_Request_free(comm_handle->Xrequest); + } +#endif hypre_TFree(comm_handle, HYPRE_MEMORY_HOST); } } @@ -284,6 +571,50 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha hypre_ParCSRCommHandleSendData(comm_handle) = send_data; hypre_ParCSRCommHandleSendMemoryLocation(comm_handle) = send_memory_location; +#ifdef HYPRE_USING_NODE_AWARE_MPI + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) + { + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //if (rank == 0) { 
hypre_printf("Standard starting\n"); } + hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), + send_data, + char, + hypre_ParCSRCommHandleNumSendBytes(comm_handle), + HYPRE_MEMORY_HOST, + send_memory_location ); + HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle), + hypre_ParCSRCommHandleRequests(comm_handle)); + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + } + } else { + if (comm_handle->Xrequest) + { + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + //if (rank == 0) { hypre_printf("Node-aware starting\n"); } + hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), + send_data, + char, + hypre_ParCSRCommHandleNumSendBytes(comm_handle), + HYPRE_MEMORY_HOST, + send_memory_location ); + //if (rank == 0) { hypre_printf("Node-aware start copied\n"); } + HYPRE_Int ret = (HYPRE_Int) MPIX_Start(comm_handle->Xrequest); + //if (rank == 0) { hypre_printf("Node-aware started\n"); } + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + } + } +#else if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) { hypre_TMemcpy( hypre_ParCSRCommHandleSendDataBuffer(comm_handle), @@ -292,7 +623,6 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha hypre_ParCSRCommHandleNumSendBytes(comm_handle), HYPRE_MEMORY_HOST, send_memory_location ); - HYPRE_Int ret = hypre_MPI_Startall(hypre_ParCSRCommHandleNumRequests(comm_handle), hypre_ParCSRCommHandleRequests(comm_handle)); if (hypre_MPI_SUCCESS != ret) @@ -301,6 +631,7 @@ hypre_ParCSRPersistentCommHandleStart( hypre_ParCSRPersistentCommHandle *comm_ha /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ } } +#endif } /*------------------------------------------------------------------ @@ -315,6 +646,52 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han hypre_ParCSRCommHandleRecvData(comm_handle) = recv_data; hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle) = recv_memory_location; +#ifdef HYPRE_USING_NODE_AWARE_MPI + //int rank; + //MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!hypre_HandleUsingNodeAwareMPI(hypre_handle())) { + //if (rank == 0) { hypre_printf("wait not using node aware\n"); } + if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) + { + HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle), + hypre_ParCSRCommHandleRequests(comm_handle), + hypre_MPI_STATUSES_IGNORE); + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, __FUNCTION__, __FILE__, __LINE__);*/ + } + + hypre_TMemcpy(recv_data, + hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), + char, + hypre_ParCSRCommHandleNumRecvBytes(comm_handle), + recv_memory_location, + HYPRE_MEMORY_HOST); + } + //if (rank == 0) { hypre_printf("wait done not using node aware\n"); } + } else { + if (comm_handle->Xrequest) + { + //if (rank == 0) { hypre_printf("Node-aware waiting\n"); } + HYPRE_Int ret = (HYPRE_Int) MPIX_Wait(comm_handle->Xrequest, + MPI_STATUS_IGNORE); + + if (hypre_MPI_SUCCESS != ret) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "MPI error\n"); + /*hypre_printf("MPI error %d in %s (%s, line %u)\n", ret, 
__FUNCTION__, __FILE__, __LINE__);*/ + } + + hypre_TMemcpy(recv_data, + hypre_ParCSRCommHandleRecvDataBuffer(comm_handle), + char, + hypre_ParCSRCommHandleNumRecvBytes(comm_handle), + recv_memory_location, + HYPRE_MEMORY_HOST); + } + } +#else if (hypre_ParCSRCommHandleNumRequests(comm_handle) > 0) { HYPRE_Int ret = hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle), @@ -333,6 +710,7 @@ hypre_ParCSRPersistentCommHandleWait( hypre_ParCSRPersistentCommHandle *comm_han recv_memory_location, HYPRE_MEMORY_HOST); } +#endif } #endif // HYPRE_USING_PERSISTENT_COMM @@ -1190,6 +1568,11 @@ hypre_MatvecCommPkgCreate ( hypre_ParCSRMatrix *A ) comm_pkg = hypre_TAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST); hypre_ParCSRMatrixCommPkg(A) = comm_pkg; +#ifdef HYPRE_USING_NODE_AWARE_MPI + comm_pkg->use_neighbor = 1; +#else + comm_pkg->use_neighbor = 0; +#endif hypre_ParCSRCommPkgCreateApart( comm, col_map_offd, first_col_diag, num_cols_offd, global_num_cols, apart, @@ -1268,6 +1651,7 @@ hypre_ParCSRFindExtendCommPkg(MPI_Comm comm, hypre_ParCSRCommPkg **extend_comm_pkg) { hypre_ParCSRCommPkg *new_comm_pkg = hypre_TAlloc(hypre_ParCSRCommPkg, 1, HYPRE_MEMORY_HOST); + new_comm_pkg->use_neighbor = 0; hypre_assert(apart != NULL); hypre_ParCSRCommPkgCreateApart(comm, indices, my_first, indices_len, diff --git a/src/parcsr_mv/par_csr_communication.h b/src/parcsr_mv/par_csr_communication.h index 13f5ea0719..4fa13ac618 100644 --- a/src/parcsr_mv/par_csr_communication.h +++ b/src/parcsr_mv/par_csr_communication.h @@ -8,6 +8,10 @@ #ifndef HYPRE_PAR_CSR_COMMUNICATION_HEADER #define HYPRE_PAR_CSR_COMMUNICATION_HEADER +#ifdef HYPRE_USING_NODE_AWARE_MPI +#include "mpi_advance.h" +#endif + /*-------------------------------------------------------------------------- * hypre_ParCSRCommPkg: * Structure containing information for doing communications @@ -44,6 +48,9 @@ typedef struct void *recv_data_buffer; HYPRE_Int num_requests; hypre_MPI_Request *requests; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Request *Xrequest; +#endif } hypre_ParCSRCommHandle; typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; @@ -51,6 +58,10 @@ typedef hypre_ParCSRCommHandle hypre_ParCSRPersistentCommHandle; typedef struct _hypre_ParCSRCommPkg { MPI_Comm comm; +#ifdef HYPRE_USING_NODE_AWARE_MPI + MPIX_Comm *neighbor_comm; + MPIX_Comm *neighborT_comm; +#endif HYPRE_Int num_components; HYPRE_Int num_sends; HYPRE_Int *send_procs; @@ -60,6 +71,11 @@ typedef struct _hypre_ParCSRCommPkg HYPRE_Int num_recvs; HYPRE_Int *recv_procs; HYPRE_Int *recv_vec_starts; + HYPRE_Int use_neighbor; +#ifdef HYPRE_USING_NODE_AWARE_MPI + long *global_send_indices; + long *global_recv_indices; +#endif /* remote communication information */ hypre_MPI_Datatype *send_mpi_types; hypre_MPI_Datatype *recv_mpi_types; diff --git a/src/test/ij.c b/src/test/ij.c index b77e08e799..4480868af3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -1561,6 +1561,19 @@ main( hypre_int argc, arg_index++; snprintf(mem_tracker_name, HYPRE_MAX_FILE_NAME_LEN, "%s", argv[arg_index++]); } +#endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + else if ( strcmp(argv[arg_index], "-node_aware_lvl_threshold") == 0 ) + { + arg_index++; + hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) = atoi(argv[arg_index++]); + if (hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) > 0) { + hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 0; + } + //if (myid == 0) { + // printf("Set LVL Thresh: %d\n", hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle())); + //} + } #endif else { @@ 
-2588,6 +2601,9 @@ main( hypre_int argc, hypre_printf(" -umpire_pinned_pool_size : pinned memory pool size (GiB)\n"); hypre_printf(" -umpire_host_pool_size : host memory pool size (GiB)\n"); /* end umpire options */ +#endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + hypre_printf(" -node_aware_lvl_threshold : Min. level in AMG hierarchy to use node-aware MPI\n"); #endif } diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index a7cd43133e..2898d5bd82 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1594,6 +1594,14 @@ typedef struct #if defined(HYPRE_USING_MAGMA) magma_queue_t magma_queue; #endif + +#if defined(HYPRE_USING_NODE_AWARE_MPI) + /* level at which to begin using node aware optimization */ + HYPRE_Int node_aware_switchover_threshold; + /* flag for using node aware optimization */ + HYPRE_Int using_node_aware_mpi; +#endif + } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -1658,6 +1666,9 @@ typedef struct #define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) +#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold) +#define hypre_HandleUsingNodeAwareMPI(hypre_handle) ((hypre_handle) -> using_node_aware_mpi) + #endif /****************************************************************************** * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/general.c b/src/utilities/general.c index c2a81c88fa..d7542b2d49 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -44,6 +44,11 @@ hypre_HandleCreate(void) hypre_HandleDeviceGSMethod(hypre_handle_) = 1; /* CPU: 0; Cusparse: 1 */ #endif +#if defined(HYPRE_USING_NODE_AWARE_MPI) + hypre_HandleUsingNodeAwareMPI(hypre_handle_) = 1; + hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle_) = 0; +#endif + return hypre_handle_; } diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 6ff16bf228..f86d495e5c 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -58,6 +58,14 @@ typedef struct #if defined(HYPRE_USING_MAGMA) magma_queue_t magma_queue; #endif + +#if defined(HYPRE_USING_NODE_AWARE_MPI) + /* level at which to begin using node aware optimization */ + HYPRE_Int node_aware_switchover_threshold; + /* flag for using node aware optimization */ + HYPRE_Int using_node_aware_mpi; +#endif + } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -122,4 +130,7 @@ typedef struct #define hypre_HandleMagmaQueue(hypre_handle) ((hypre_handle) -> magma_queue) +#define hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle) ((hypre_handle) -> node_aware_switchover_threshold) +#define hypre_HandleUsingNodeAwareMPI(hypre_handle) ((hypre_handle) -> using_node_aware_mpi) + #endif
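
Notes on usage (illustrative; not part of the patch itself):

When hypre is configured with HYPRE_USING_NODE_AWARE_MPI and linked against MPI Advance, the locality-aware path is selected per AMG level. hypre_HandleCreate defaults the switchover threshold to 0 and the flag to 1, so every level uses the MPIX path; hypre_BoomerAMGCycle then sets hypre_HandleUsingNodeAwareMPI to 1 on levels at or above the threshold and to 0 below it. The ij driver exposes the threshold as -node_aware_lvl_threshold; the Laplacian problem flags in the run below are only an illustration:

    mpirun -np 64 ./ij -laplacian -n 100 100 100 -node_aware_lvl_threshold 2

With a threshold of 2, levels 0 and 1 keep hypre's standard persistent point-to-point exchange, while level 2 and coarser use MPIX_Neighbor_locality_alltoallv.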
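
The same threshold can be set programmatically before solver setup. A minimal sketch, assuming the internal header _hypre_utilities.h is in scope; the macros are the ones this patch adds and the wrapper function is illustrative:

    #include "_hypre_utilities.h"

    void set_node_aware_threshold(HYPRE_Int threshold)
    {
    #if defined(HYPRE_USING_NODE_AWARE_MPI)
       /* Levels below the threshold keep standard persistent point-to-point
        * communication; hypre_BoomerAMGCycle enables the MPIX path at and
        * above it (mirrors the -node_aware_lvl_threshold handling in ij.c). */
       hypre_HandleNodeAwareSwitchoverThreshold(hypre_handle()) = threshold;
       if (threshold > 0)
       {
          hypre_HandleUsingNodeAwareMPI(hypre_handle()) = 0;
       }
    #endif
    }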
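
For reference, the MPI Advance call sequence this patch wires into hypre_ParCSRPersistentCommHandleCreate/Start/Wait, reduced to a standalone sketch. The MPIX_* names and argument orders follow their usage in the patch; the wrapper function, its parameters, and MPI_DOUBLE are placeholders, and MPIX_Comm cleanup is omitted because the patch keeps the communicator alive inside the comm pkg:

    #include <mpi.h>
    #include "mpi_advance.h"

    /* Persistent locality-aware neighbor alltoallv: create the adjacent graph
     * topology once, initialize the persistent collective once, then Start/Wait
     * for every exchange (e.g. once per matvec). */
    void example_locality_exchange(MPI_Comm comm,
                                   int num_recvs, int *recv_procs, int *recv_sizes, int *rdispls,
                                   int num_sends, int *send_procs, int *send_sizes, int *sdispls,
                                   long *global_send_idx, long *global_recv_idx,
                                   double *send_buf, double *recv_buf)
    {
       MPIX_Comm    *neighbor_comm;
       MPIX_Request *req;

       /* Sources are the processes we receive from, destinations the ones we
        * send to; message counts double as edge weights, as in
        * hypre_ParCSRCommPkgCreateApart above. */
       MPIX_Dist_graph_create_adjacent(comm,
                                       num_recvs, recv_procs, recv_sizes,
                                       num_sends, send_procs, send_sizes,
                                       MPI_INFO_NULL, 0, &neighbor_comm);

       /* The global column indices let MPI Advance aggregate messages by node. */
       MPIX_Neighbor_locality_alltoallv_init(send_buf, send_sizes, sdispls, global_send_idx, MPI_DOUBLE,
                                             recv_buf, recv_sizes, rdispls, global_recv_idx, MPI_DOUBLE,
                                             neighbor_comm, MPI_INFO_NULL, &req);

       /* Pack send_buf, then run the exchange; Start/Wait can be repeated until
        * the request is freed. */
       MPIX_Start(req);
       MPIX_Wait(req, MPI_STATUS_IGNORE);

       MPIX_Request_free(req);
    }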