From f374fe7f024d5b71f4899d2e61637685a771f1af Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Mon, 28 Oct 2024 18:28:26 -0400 Subject: [PATCH 1/2] test without Open MPI --- .github/workflows/setup_env.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/setup_env.sh b/.github/workflows/setup_env.sh index 5fe6034d1..883cf10eb 100755 --- a/.github/workflows/setup_env.sh +++ b/.github/workflows/setup_env.sh @@ -80,15 +80,17 @@ ${CC} --version ${FC} --version #----------------------------------------------------------------- MPI -# Test Open MPI with CPU and CUDA. +# Test Open MPI with CPU and CUDA. --> temporarily use Intel MPI. # Test Intel MPI with ROCm and SYCL. # Note: Open MPI hides SYCL devices, at least in our current CI. -if [ "${device}" = "cpu" -o "${device}" = "gpu_nvidia" ]; then +if false; then +#if [ "${device}" = "cpu" -o "${device}" = "gpu_nvidia" ]; then print "======================================== Load Open MPI" quiet module load openmpi export OMPI_CXX=${CXX} export OMPI_CC=${CC} export OMPI_FC=${FC} + unset UCX_NET_DEVICES # complains about unavailable interface mlx5_0:1 echo "mkl_blacs = openmpi" >> make.inc else From 071f0dd841ebc91ca8566d04e4a05218c4a92820 Mon Sep 17 00:00:00 2001 From: Mark Gates Date: Mon, 28 Oct 2024 20:53:21 -0400 Subject: [PATCH 2/2] more robust test for idle GPUs --- test/idle_gpus.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/idle_gpus.py b/test/idle_gpus.py index 73228946f..242844617 100755 --- a/test/idle_gpus.py +++ b/test/idle_gpus.py @@ -48,13 +48,15 @@ if (s): gpu = s.group( 1 ) - # If using > half the memory, assume it is not idle. + # If using >= 10 MiB or 5% utilization, assume it is not idle. + # Typically idle is 1 MiB and 0% utilization. # Docker can't see processes in section 2. - s = re.search( '^\| +N/A +\d+C +\w+ +\d+W +/ +\d+W *\| +(\d+)MiB +/ +(\d+)MiB', line ) + s = re.search( '^\| +N/A +\d+C +\w+ +\d+W +/ +\d+W *\| +(\d+)MiB +/ +(\d+)MiB *\| +(\d+)%', line ) if (s): used_mem = int( s.group( 1 ) ) total_mem = int( s.group( 2 ) ) - if (used_mem > 0.5*total_mem): + percent = int( s.group( 3 ) ) + if (used_mem >= 10 or percent >= 5): gpus[ gpu ] = 0 else: # Match process lines: