ROCm workaround: Use ParallelFor instead of Reduce (#2749)

Assuming the failure is not often, we can use ParallelFor with atomicAdd to obtain the number of failures. With this change, the ROCm memory issue seems to be gone.
AMReX-Astro · Feb 10, 2024 · 1f3bfff · 1f3bfff
1 parent 6401d25
commit 1f3bfff
Showing 1 changed file with 55 additions and 30 deletions.
diff --git a/Source/reactions/Castro_react.cpp b/Source/reactions/Castro_react.cpp
@@ -186,12 +186,14 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
     MultiFab tmp_mask_mf;
     const MultiFab& mask_mf = mask_covered_zones ? getLevel(level+1).build_fine_mask() : tmp_mask_mf;
 
-    ReduceOps<ReduceOpSum> reduce_op;
-    ReduceData<Real> reduce_data(reduce_op);
-    using ReduceTuple = typename decltype(reduce_data)::Type;
+#if defined(AMREX_USE_GPU)
+    Gpu::Buffer<int> d_num_failed({0});
+    auto* p_num_failed = d_num_failed.data();
+#endif
+    int num_failed = 0;
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel reduction(+:num_failed)
 #endif
     for (MFIter mfi(s, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
@@ -208,8 +210,11 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
         const auto problo = geom.ProbLoArray();
 #endif
 
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple
+#if defined(AMREX_USE_GPU)
+        ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+#else
+        LoopOnCpu(bx, [&] (int i, int j, int k) mutable
+#endif
         {
 
             burn_t burn_state;
@@ -230,7 +235,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
 
             bool do_burn = true;
             burn_state.success = true;
-            Real burn_failed = 0.0_rt;
+            int burn_failed = 0;
 
             // Don't burn on zones inside shock regions, if the relevant option is set.
 
@@ -329,7 +334,7 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
                 // If we were unsuccessful, update the failure count.
 
                 if (!burn_state.success) {
-                    burn_failed = 1.0_rt;
+                    burn_failed = 1;
                 }
 
                 // Add burning rates to reactions MultiFab, but be
@@ -399,19 +404,25 @@ Castro::react_state(MultiFab& s, MultiFab& r, Real time, Real dt, const int stra
 
             }
 
-
-            return {burn_failed};
-
+#if defined(AMREX_USE_GPU)
+            if (burn_failed) {
+                Gpu::Atomic::Add(p_num_failed, burn_failed);
+            }
+#else
+            num_failed += burn_failed;
+#endif
         });
 
+#if defined(AMREX_USE_HIP)
+        Gpu::streamSynchronize(); // otherwise HIP may faile to allocate the necessary resources.
+#endif
     }
 
-    ReduceTuple hv = reduce_data.value();
-    Real burn_failed = amrex::get<0>(hv);
+#if defined(AMREX_USE_GPU)
+    num_failed = *(d_num_failed.copyToHost());
+#endif
 
-    if (burn_failed != 0.0) {
-      burn_success = 0;
-    }
+    burn_success = !num_failed;
 
     ParallelDescriptor::ReduceIntMin(burn_success);
 
@@ -516,11 +527,13 @@ Castro::react_state(Real time, Real dt)
 
     int burn_success = 1;
 
-    ReduceOps<ReduceOpSum> reduce_op;
-    ReduceData<Real> reduce_data(reduce_op);
-
-    using ReduceTuple = typename decltype(reduce_data)::Type;
+#if defined(AMREX_USE_GPU)
+    Gpu::Buffer<int> d_num_failed({0});
+    auto* p_num_failed = d_num_failed.data();
+#endif
+    int num_failed = 0;
 
+    // why no omp here?
     for (MFIter mfi(S_new, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
         const Box& bx = mfi.growntilebox(ng);
@@ -542,8 +555,11 @@ Castro::react_state(Real time, Real dt)
         const auto dx = geom.CellSizeArray();
         const auto problo = geom.ProbLoArray();
 
-        reduce_op.eval(bx, reduce_data,
-        [=] AMREX_GPU_HOST_DEVICE (int i, int j, int k) -> ReduceTuple
+#if defined(AMREX_USE_GPU)
+        ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
+#else
+        LoopOnCpu(bx, [&] (int i, int j, int k) mutable
+#endif
         {
             burn_t burn_state;
 
@@ -563,7 +579,7 @@ Castro::react_state(Real time, Real dt)
 
             bool do_burn = true;
             burn_state.success = true;
-            Real burn_failed = 0.0_rt;
+            int burn_failed = 0;
 
             // Don't burn on zones inside shock regions, if the
             // relevant option is set.
@@ -687,7 +703,7 @@ Castro::react_state(Real time, Real dt)
                 // If we were unsuccessful, update the failure count.
 
                 if (!burn_state.success) {
-                    burn_failed = 1.0_rt;
+                    burn_failed = 1;
                 }
 
                 // update the state data.
@@ -780,16 +796,25 @@ Castro::react_state(Real time, Real dt)
                 }
             }
 
-            return {burn_failed};
+#if defined(AMREX_USE_GPU)
+            if (burn_failed) {
+                Gpu::Atomic::Add(p_num_failed, burn_failed);
+            }
+#else
+            num_failed += burn_failed;
+#endif
         });
+
+#if defined(AMREX_USE_HIP)
+        Gpu::streamSynchronize(); // otherwise HIP may faile to allocate the necessary resources.
+#endif
     }
 
-    ReduceTuple hv = reduce_data.value();
-    Real burn_failed = amrex::get<0>(hv);
+#if defined(AMREX_USE_GPU)
+    num_failed = *(d_num_failed.copyToHost());
+#endif
 
-    if (burn_failed != 0.0) {
-        burn_success = 0;
-    }
+    burn_success = !num_failed;
 
     ParallelDescriptor::ReduceIntMin(burn_success);