RAJA Model Update #4

Merged (7 commits, Feb 7, 2024)
99 changes: 61 additions & 38 deletions src/raja/fasten.hpp
@@ -4,6 +4,8 @@
#include <string>

#include "RAJA/RAJA.hpp"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "camp/resource.hpp"

#ifdef IMPL_CLS
@@ -16,31 +18,31 @@

template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

using launch_policy = RAJA::expt::LaunchPolicy< //
using launch_policy = RAJA::LaunchPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::expt::omp_launch_t
RAJA::omp_launch_t
#else
RAJA::expt::seq_launch_t
RAJA::seq_launch_t
#endif
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::expt::cuda_launch_t<false>
RAJA::cuda_launch_t<false>
#endif
#if defined(RAJA_ENABLE_HIP)
,
RAJA::expt::hip_launch_t<false>
RAJA::hip_launch_t<false>
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::expt::sycl_launch_t<false>
RAJA::sycl_launch_t<false>
#endif
>;

using teams_x = RAJA::expt::LoopPolicy< //
using teams_x = RAJA::LoopPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::omp_parallel_for_exec
#else
RAJA::loop_exec
RAJA::seq_exec
#endif
#if defined(RAJA_ENABLE_CUDA)
,
@@ -52,8 +54,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
#endif
>;

using threads_x = RAJA::expt::LoopPolicy< //
RAJA::loop_exec
using threads_x = RAJA::LoopPolicy< //
RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::cuda_thread_x_loop
@@ -81,11 +83,11 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
global = int(std::ceil(double(global) / double(wgsize)));
size_t local = int(wgsize);

RAJA::expt::launch<launch_policy>( //
static_cast<RAJA::expt::ExecPlace>(device), //
RAJA::expt::Grid(RAJA::expt::Teams(global), RAJA::expt::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { //
RAJA::expt::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
RAJA::launch<launch_policy>( //
static_cast<RAJA::ExecPlace>(device), //
RAJA::LaunchParams(RAJA::Teams(global), RAJA::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { //
RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
#ifdef USE_LOCAL_ARRAY
#error RAJA does not appear to support dynamically allocated LocalArray w/ the shared memory policy
RAJA_TEAM_SHARED FFParams *local_forcefield;
@@ -95,7 +97,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
float etot[PPWI];
float transform[3][4][PPWI];

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
size_t ix = gid * local * PPWI + lid;
ix = ix < nposes ? ix : nposes - PPWI;

@@ -135,9 +137,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
local_forcefield = forcefields;
#endif
});

ctx.teamSync();

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
// Loop over ligand atoms
size_t il = 0;
do {
@@ -227,37 +230,54 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
});
}

template <typename T> static void *registerAllocation(const std::vector<T> &xs) {
auto &rm = umpire::ResourceManager::getInstance();
auto host_alloc = rm.getAllocator("HOST");
auto strategy = host_alloc.getAllocationStrategy();
// Not allowed to create a record without casting away the const
auto host_data = const_cast<void*>(static_cast<const void*>(std::data(xs)));
umpire::util::AllocationRecord record{host_data, sizeof(T) * std::size(xs), strategy};
rm.registerAllocation(host_data, record);
return host_data;
}

template <typename T> static T *allocate(const std::vector<T> &xs) {
auto data = allocate<T>(xs.size());
std::copy(xs.begin(), xs.end(), data);
auto &rm = umpire::ResourceManager::getInstance();
auto host_data = registerAllocation(xs);
auto data = allocate<T>(std::size(xs));
rm.copy(data, host_data);
return data;
}

template <typename T> static T *allocate(const size_t size) {
#ifndef RAJA_DEVICE_ACTIVE
return static_cast<T *>(std::malloc(sizeof(T) * size));
auto &rm = umpire::ResourceManager::getInstance();
#ifndef RAJA_TARGET_GPU
auto alloc = rm.getAllocator("HOST");
#else
T *ptr;
cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal);
return ptr;
#ifdef BUDE_MANAGED_ALLOC
auto alloc = rm.getAllocator("UM");
#else
auto alloc = rm.getAllocator("DEVICE");
#endif
#endif
return static_cast<T *>(alloc.allocate(sizeof(T) * size));
}

template <typename T> static void deallocate(T *ptr) {
#ifndef RAJA_DEVICE_ACTIVE
std::free(ptr);
#else
cudaFree(ptr);
#endif
auto &rm = umpire::ResourceManager::getInstance();
rm.getAllocator(ptr).deallocate(ptr);
}

static void synchronise() {
// nothing to do for host devices
#if defined(RAJA_ENABLE_CUDA)
cudaDeviceSynchronize();
RAJA::synchronize<RAJA::cuda_synchronize>();
#endif
#if defined(RAJA_ENABLE_HIP)
hipDeviceSynchronize();
RAJA::synchronize<RAJA::hip_synchronize>();
#endif
#if defined(RAJA_ENABLE_SYCL)
RAJA::synchronize<RAJA::sycl_synchronize>();
#endif
}

@@ -267,25 +287,26 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
[[nodiscard]] std::string name() { return "raja"; };

[[nodiscard]] std::vector<Device> enumerateDevices() override {
std::vector<Device> devices{{RAJA::expt::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_DEVICE_ACTIVE)
std::vector<Device> devices{{(size_t) RAJA::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_TARGET_GPU)
#if defined(RAJA_ENABLE_CUDA)
const auto deviceName = "RAJA CUDA device";
#endif
#if defined(RAJA_ENABLE_HIP)
const auto deviceName = "Raja HIP device";
const auto deviceName = "RAJA HIP device";
#endif
#if defined(RAJA_ENABLE_SYCL)
const auto deviceName = "Raja SYCL device";
const auto deviceName = "RAJA SYCL device";
#endif
devices.template emplace_back(RAJA::expt::ExecPlace::DEVICE, deviceName);
devices.template emplace_back((size_t) RAJA::ExecPlace::DEVICE, deviceName);
#endif
return devices;
};

[[nodiscard]] Sample fasten(const Params &p, size_t wgsize, size_t device) const override {

Sample sample(PPWI, wgsize, p.nposes());
auto &rm = umpire::ResourceManager::getInstance();

auto contextStart = now();

@@ -302,6 +323,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

synchronise();

auto host_energies = registerAllocation(sample.energies);

auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

@@ -315,7 +338,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

std::copy(results, results + p.nposes(), sample.energies.begin());
rm.copy(host_energies, results);

deallocate(protein);
deallocate(ligand);
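For orientation, the sketch below (not part of this PR) strings the new pieces together: a host std::vector is registered with Umpire's ResourceManager so rm.copy can resolve its size, a buffer is taken from a named allocator, a kernel is launched through the post-expt RAJA::launch / RAJA::loop API, and the result is copied back. The allocator name ("HOST"; "UM" and "DEVICE" only exist in GPU builds), the toy kernel, and the sequential policies are illustrative assumptions rather than code from fasten.hpp.

// Illustrative sketch only (assumptions noted above); mirrors the Umpire + RAJA
// pattern used in the updated fasten.hpp but is not code from this PR.
#include <vector>

#include "RAJA/RAJA.hpp"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

// Make an existing host vector visible to Umpire so ResourceManager::copy works on it.
template <typename T> void *registerHost(const std::vector<T> &xs) {
  auto &rm = umpire::ResourceManager::getInstance();
  auto strategy = rm.getAllocator("HOST").getAllocationStrategy();
  auto ptr = const_cast<void *>(static_cast<const void *>(xs.data()));
  umpire::util::AllocationRecord record{ptr, sizeof(T) * xs.size(), strategy};
  rm.registerAllocation(ptr, record);
  return ptr;
}

int main() {
  auto &rm = umpire::ResourceManager::getInstance();

  std::vector<float> host(1024, 1.0f);
  void *hostPtr = registerHost(host);
  const int n = static_cast<int>(host.size());

  // "HOST" is always available; GPU builds would pick "DEVICE" or "UM" instead.
  auto alloc = rm.getAllocator("HOST");
  auto *data = static_cast<float *>(alloc.allocate(sizeof(float) * host.size()));
  rm.copy(data, hostPtr); // size comes from the registered allocation records

  // Post-expt spelling: RAJA::launch / LaunchParams / loop replace the RAJA::expt:: forms.
  using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
  RAJA::launch<launch_policy>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, n),
                                [&](int i) { data[i] *= 2.0f; });
      });

  rm.copy(hostPtr, data); // results land back in the registered vector
  alloc.deallocate(data);
  return 0;
}

Device builds follow the same shape, only with the cuda/hip/sycl launch policies and a device-side allocator, which is exactly the switch the diff above makes.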
115 changes: 31 additions & 84 deletions src/raja/model.cmake
@@ -1,93 +1,40 @@
register_flag_optional(RAJA_BACK_END "Specify whether we target CPU/CUDA/HIP/SYCL" "CPU")

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")

register_flag_required(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:

-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...

See https://raja.readthedocs.io/en/v0.14.0/sphinx/user_guide/config_options.html#available-raja-options-and-defaults for all available options
")

#register_flag_optional(TARGET
# "Target offload device, implemented values are CPU, NVIDIA, HIP"
# CPU)

register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[ENABLE_CUDA=ON only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH
"[ENABLE_CUDA=ON only] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")

register_flag_optional(CUDA_EXTRA_FLAGS
"[ENABLE_CUDA=ON only] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")

# compiler vendor and arch specific flags
set(RAJA_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)"
"OFF")

macro(setup)
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()

set(CMAKE_CXX_STANDARD 17)

find_package(RAJA REQUIRED)
find_package(umpire REQUIRED)

if (EXISTS "${RAJA_IN_TREE}")

message(STATUS "Building using in-tree RAJA source at `${RAJA_IN_TREE}`")

set(CMAKE_CUDA_STANDARD 17)

# don't build anything that isn't the RAJA library itself, by default their cmake def builds everything, whyyy?
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(ENABLE_DOCUMENTATION OFF CACHE BOOL "")

if (ENABLE_CUDA)

set(ENABLE_CUDA ON CACHE BOOL "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104)
set(CMAKE_POLICY_DEFAULT_CMP0104 OLD) # so that it propagates to RAJA's CMakeLists as well
cmake_policy(SET CMP0104 OLD)
endif()

# RAJA needs all the cuda stuff setup before including!
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})

message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif ()


add_subdirectory(${RAJA_IN_TREE} ${CMAKE_BINARY_DIR}/raja)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
endif ()

if (ENABLE_CUDA)
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
register_link_library(RAJA umpire)
if (${RAJA_BACK_END} STREQUAL "CUDA")
enable_language(CUDA)
set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -extended-lambda --expt-relaxed-constexpr --restrict --keep")

set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CUDA)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "HIP")
# Set CMAKE_CXX_COMPILER to hipcc
find_package(hip REQUIRED)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "SYCL")
register_definitions(RAJA_TARGET_GPU)
else()
register_definitions(RAJA_TARGET_CPU)
message(STATUS "Falling Back to CPU")
endif ()

if (MANAGED_ALLOC)
register_definitions(BUDE_MANAGED_ALLOC)
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
RAJA_FLAGS_CPU
${CMAKE_CXX_COMPILER_ID}
${CMAKE_SYSTEM_PROCESSOR}
)

endmacro()
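As a usage note rather than part of the change: with the model now resolving RAJA and Umpire through find_package, a configure line might look like the following. The -DMODEL=raja selector and CMAKE_CUDA_ARCHITECTURES value are assumptions about the surrounding build harness, and the install prefixes are placeholders.

cmake -Bbuild -H. \
  -DMODEL=raja \
  -DRAJA_BACK_END=CUDA \
  -DMANAGED_ALLOC=ON \
  -DCMAKE_PREFIX_PATH="/path/to/raja/install;/path/to/umpire/install" \
  -DCMAKE_CUDA_ARCHITECTURES=70
cmake --build build -j

Leaving RAJA_BACK_END at its CPU default skips the GPU definitions entirely, and MANAGED_ALLOC only changes which Umpire allocator ("UM" vs "DEVICE") backs the buffers.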
