diff --git a/src/raja/fasten.hpp b/src/raja/fasten.hpp
index 07b42f3..8ce8eb2 100644
--- a/src/raja/fasten.hpp
+++ b/src/raja/fasten.hpp
@@ -4,6 +4,8 @@
 #include
 
 #include "RAJA/RAJA.hpp"
+#include "umpire/Allocator.hpp"
+#include "umpire/ResourceManager.hpp"
 #include "camp/resource.hpp"
 
 #ifdef IMPL_CLS
@@ -16,31 +18,31 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
 
-  using launch_policy = RAJA::expt::LaunchPolicy< //
+  using launch_policy = RAJA::LaunchPolicy< //
 #if defined(RAJA_ENABLE_OPENMP)
-      RAJA::expt::omp_launch_t
+      RAJA::omp_launch_t
 #else
-      RAJA::expt::seq_launch_t
+      RAJA::seq_launch_t
 #endif
 #if defined(RAJA_ENABLE_CUDA)
       ,
-      RAJA::expt::cuda_launch_t
+      RAJA::cuda_launch_t
 #endif
 #if defined(RAJA_ENABLE_HIP)
       ,
-      RAJA::expt::hip_launch_t
+      RAJA::hip_launch_t
 #endif
 #if defined(RAJA_ENABLE_SYCL)
       ,
-      RAJA::expt::sycl_launch_t
+      RAJA::sycl_launch_t
 #endif
       >;
 
-  using teams_x = RAJA::expt::LoopPolicy< //
+  using teams_x = RAJA::LoopPolicy< //
 #if defined(RAJA_ENABLE_OPENMP)
       RAJA::omp_parallel_for_exec
 #else
-      RAJA::loop_exec
+      RAJA::seq_exec
 #endif
 #if defined(RAJA_ENABLE_CUDA)
       ,
@@ -52,8 +54,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
 #endif
       >;
 
-  using threads_x = RAJA::expt::LoopPolicy< //
-      RAJA::loop_exec
+  using threads_x = RAJA::LoopPolicy< //
+      RAJA::seq_exec
 #if defined(RAJA_ENABLE_CUDA)
       ,
       RAJA::cuda_thread_x_loop
@@ -81,11 +83,11 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
     global = int(std::ceil(double(global) / double(wgsize)));
     size_t local = int(wgsize);
 
-    RAJA::expt::launch<launch_policy>(                                           //
-        static_cast<RAJA::expt::ExecPlace>(device),                              //
-        RAJA::expt::Grid(RAJA::expt::Teams(global), RAJA::expt::Threads(local)), //
-        [=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) {                    //
-          RAJA::expt::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
+    RAJA::launch<launch_policy>(                                        //
+        static_cast<RAJA::ExecPlace>(device),                           //
+        RAJA::LaunchParams(RAJA::Teams(global), RAJA::Threads(local)),  //
+        [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {                 //
+          RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
 #ifdef USE_LOCAL_ARRAY
 #error RAJA does not appear to support dynamically allocated LocalArray w/ the shared memory policy
             RAJA_TEAM_SHARED FFParams *local_forcefield;
@@ -95,7 +97,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
             float etot[PPWI];
             float transform[3][4][PPWI];
 
-            RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
+            RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
               size_t ix = gid * local * PPWI + lid;
               ix = ix < nposes ? ix : nposes - PPWI;
@@ -135,9 +137,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
               local_forcefield = forcefields;
 #endif
             });
+            ctx.teamSync();
 
-            RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
+            RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
               // Loop over ligand atoms
               size_t il = 0;
               do {
@@ -232,32 +235,32 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
     std::copy(xs.begin(), xs.end(), data);
     return data;
   }
-  
+
   template <typename T> static T *allocate(const size_t size) {
-#ifndef RAJA_DEVICE_ACTIVE
-    return static_cast<T *>(std::malloc(sizeof(T) * size));
+    auto &rm = umpire::ResourceManager::getInstance();
+#ifndef RAJA_TARGET_GPU
+    auto alloc = rm.getAllocator("HOST");
 #else
-    T *ptr;
-    cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal);
-    return ptr;
+    auto alloc = rm.getAllocator("UM");
 #endif
+    return static_cast<T *>(alloc.allocate(sizeof(T) * size));
   }
 
   template <typename T> static void deallocate(T *ptr) {
-#ifndef RAJA_DEVICE_ACTIVE
-    std::free(ptr);
-#else
-    cudaFree(ptr);
-#endif
+    auto &rm = umpire::ResourceManager::getInstance();
+    rm.getAllocator(ptr).deallocate(ptr);
   }
 
   static void synchronise() {
     // nothing to do for host devices
 #if defined(RAJA_ENABLE_CUDA)
-    cudaDeviceSynchronize();
+    RAJA::synchronize();
 #endif
 #if defined(RAJA_ENABLE_HIP)
-    hipDeviceSynchronize();
+    RAJA::synchronize();
+#endif
+#if defined(RAJA_ENABLE_SYCL)
+    RAJA::synchronize();
 #endif
   }
 
@@ -267,18 +270,18 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
   [[nodiscard]] std::string name() { return "raja"; };
 
   [[nodiscard]] std::vector<Device> enumerateDevices() override {
-    std::vector<Device> devices{{RAJA::expt::ExecPlace::HOST, "RAJA Host device"}};
-#if defined(RAJA_DEVICE_ACTIVE)
+    std::vector<Device> devices{{(size_t) RAJA::ExecPlace::HOST, "RAJA Host device"}};
+#if defined(RAJA_TARGET_GPU)
 #if defined(RAJA_ENABLE_CUDA)
     const auto deviceName = "RAJA CUDA device";
 #endif
 #if defined(RAJA_ENABLE_HIP)
-    const auto deviceName = "Raja HIP device";
+    const auto deviceName = "RAJA HIP device";
 #endif
 #if defined(RAJA_ENABLE_SYCL)
-    const auto deviceName = "Raja SYCL device";
+    const auto deviceName = "RAJA SYCL device";
 #endif
-    devices.template emplace_back(RAJA::expt::ExecPlace::DEVICE, deviceName);
+    devices.template emplace_back((size_t) RAJA::ExecPlace::DEVICE, deviceName);
 #endif
     return devices;
   };
@@ -286,7 +289,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude {
   [[nodiscard]] Sample fasten(const Params &p, size_t wgsize, size_t device) const override {
 
     Sample sample(PPWI, wgsize, p.nposes());
-    
+
     auto contextStart = now();
     auto protein = allocate(p.protein);
diff --git a/src/raja/model.cmake b/src/raja/model.cmake
index e8d270f..edffa61 100644
--- a/src/raja/model.cmake
+++ b/src/raja/model.cmake
@@ -1,93 +1,40 @@
+register_flag_optional(RAJA_BACK_END "Specify whether we target CPU/CUDA/HIP/SYCL" "CPU")
 
-register_flag_optional(CMAKE_CXX_COMPILER
-        "Any CXX compiler that is supported by CMake detection and RAJA.
-         See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
-        "c++")
-register_flag_required(RAJA_IN_TREE
-        "Absolute path to the *source* distribution directory of RAJA.
-         Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
-         Remember to append RAJA specific flags as well, for example:
-
-           -DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...
-
-         See https://raja.readthedocs.io/en/v0.14.0/sphinx/user_guide/config_options.html#available-raja-options-and-defaults for all available options
-")
-#register_flag_optional(TARGET
-#        "Target offload device, implemented values are CPU, NVIDIA, HIP"
-#        CPU)
-register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
-        "[ENABLE_CUDA=ON only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")
-# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
-register_flag_optional(CUDA_ARCH
-        "[ENABLE_CUDA=ON only] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
-        "")
-register_flag_optional(CUDA_EXTRA_FLAGS
-        "[ENABLE_CUDA=ON only] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
-        "")
-
-# compiler vendor and arch specific flags
 set(RAJA_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
+register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)"
+        "OFF")
 
 macro(setup)
+    if (POLICY CMP0104)
+        cmake_policy(SET CMP0104 OLD)
+    endif ()
+    set(CMAKE_CXX_STANDARD 17)
+    find_package(RAJA REQUIRED)
+    find_package(umpire REQUIRED)
 
-    if (EXISTS "${RAJA_IN_TREE}")
-
-        message(STATUS "Building using in-tree RAJA source at `${RAJA_IN_TREE}`")
-
-        set(CMAKE_CUDA_STANDARD 17)
-
-        # don't build anything that isn't the RAJA library itself, by default their cmake def builds everything, whyyy?
-        set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
-        set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
-        set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
-        set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
-        set(ENABLE_REPRODUCERS OFF CACHE BOOL "")
-        set(ENABLE_DOCUMENTATION OFF CACHE BOOL "")
-        if (ENABLE_CUDA)
-
-            set(ENABLE_CUDA ON CACHE BOOL "")
-
-            # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
-            if(POLICY CMP0104)
-                set(CMAKE_POLICY_DEFAULT_CMP0104 OLD) # so that propogates to RAJA's CMakeList as well
-                cmake_policy(SET CMP0104 OLD)
-            endif()
-
-            # RAJA needs all the cuda stuff setup before including!
-            set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
-            set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})
-
-            message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
-        endif ()
-
-
-        add_subdirectory(${RAJA_IN_TREE} ${CMAKE_BINARY_DIR}/raja)
-        register_link_library(RAJA)
-        # RAJA's cmake screws with where the binary will end up, resetting it here:
-        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
-    else ()
-        message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
-    endif ()
-
-    if (ENABLE_CUDA)
-        # RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
+    register_link_library(RAJA umpire)
+    if (${RAJA_BACK_END} STREQUAL "CUDA")
         enable_language(CUDA)
-        set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
+        set(CMAKE_CUDA_STANDARD 17)
+        set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+        set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -extended-lambda --expt-relaxed-constexpr --restrict --keep")
+
+        set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CUDA)
+        register_definitions(RAJA_TARGET_GPU)
+    elseif (${RAJA_BACK_END} STREQUAL "HIP")
+        # Set CMAKE_CXX_COMPILER to hipcc
+        find_package(hip REQUIRED)
+        register_definitions(RAJA_TARGET_GPU)
+    elseif (${RAJA_BACK_END} STREQUAL "SYCL")
+        register_definitions(RAJA_TARGET_GPU)
+    else()
+        register_definitions(RAJA_TARGET_CPU)
+        message(STATUS "Falling Back to CPU")
+    endif ()
+
+    if (MANAGED_ALLOC)
+        register_definitions(BUDE_MANAGED_ALLOC)
     endif ()
-
-    register_append_compiler_and_arch_specific_cxx_flags(
-            RAJA_FLAGS_CPU
-            ${CMAKE_CXX_COMPILER_ID}
-            ${CMAKE_SYSTEM_PROCESSOR}
-    )
-
 endmacro()
-