RAJA Model Update #4

Merged (7 commits, Feb 7, 2024)
99 changes: 61 additions & 38 deletions src/raja/fasten.hpp
@@ -4,6 +4,8 @@
#include <string>

#include "RAJA/RAJA.hpp"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "camp/resource.hpp"

#ifdef IMPL_CLS
@@ -16,31 +18,31 @@

template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

using launch_policy = RAJA::expt::LaunchPolicy< //
using launch_policy = RAJA::LaunchPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::expt::omp_launch_t
RAJA::omp_launch_t
#else
RAJA::expt::seq_launch_t
RAJA::seq_launch_t
#endif
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::expt::cuda_launch_t<false>
RAJA::cuda_launch_t<false>
#endif
#if defined(RAJA_ENABLE_HIP)
,
RAJA::expt::hip_launch_t<false>
RAJA::hip_launch_t<false>
#endif
#if defined(RAJA_ENABLE_SYCL)
,
RAJA::expt::sycl_launch_t<false>
RAJA::sycl_launch_t<false>
#endif
>;

using teams_x = RAJA::expt::LoopPolicy< //
using teams_x = RAJA::LoopPolicy< //
#if defined(RAJA_ENABLE_OPENMP)
RAJA::omp_parallel_for_exec
#else
RAJA::loop_exec
RAJA::seq_exec
#endif
#if defined(RAJA_ENABLE_CUDA)
,
@@ -52,8 +54,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
#endif
>;

using threads_x = RAJA::expt::LoopPolicy< //
RAJA::loop_exec
using threads_x = RAJA::LoopPolicy< //
RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,
RAJA::cuda_thread_x_loop
@@ -81,11 +83,11 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
global = int(std::ceil(double(global) / double(wgsize)));
size_t local = int(wgsize);

RAJA::expt::launch<launch_policy>( //
static_cast<RAJA::expt::ExecPlace>(device), //
RAJA::expt::Grid(RAJA::expt::Teams(global), RAJA::expt::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::expt::LaunchContext ctx) { //
RAJA::expt::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
RAJA::launch<launch_policy>( //
static_cast<RAJA::ExecPlace>(device), //
RAJA::LaunchParams(RAJA::Teams(global), RAJA::Threads(local)), //
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { //
RAJA::loop<teams_x>(ctx, RAJA::RangeSegment(0, global), [&](int gid) {
#ifdef USE_LOCAL_ARRAY
#error RAJA does not appear to support dynamically allocated LocalArray w/ the shared memory policy
RAJA_TEAM_SHARED FFParams *local_forcefield;
@@ -95,7 +97,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
float etot[PPWI];
float transform[3][4][PPWI];

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
size_t ix = gid * local * PPWI + lid;
ix = ix < nposes ? ix : nposes - PPWI;

@@ -135,9 +137,10 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
local_forcefield = forcefields;
#endif
});

ctx.teamSync();

RAJA::expt::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
RAJA::loop<threads_x>(ctx, RAJA::RangeSegment(0, local), [&](int lid) {
// Loop over ligand atoms
size_t il = 0;
do {
@@ -227,37 +230,54 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
});
}

template <typename T> static void *registerAllocation(const std::vector<T> &xs) {
auto &rm = umpire::ResourceManager::getInstance();
auto host_alloc = rm.getAllocator("HOST");
auto strategy = host_alloc.getAllocationStrategy();
// Not allowed to create a record without casting away the const
auto host_data = const_cast<void*>(static_cast<const void*>(std::data(xs)));
umpire::util::AllocationRecord record{host_data, sizeof(T) * std::size(xs), strategy};
rm.registerAllocation(host_data, record);
return host_data;
}

template <typename T> static T *allocate(const std::vector<T> &xs) {
auto data = allocate<T>(xs.size());
std::copy(xs.begin(), xs.end(), data);
auto &rm = umpire::ResourceManager::getInstance();
auto host_data = registerAllocation(xs);
auto data = allocate<T>(std::size(xs));
rm.copy(data, host_data);
return data;
}

template <typename T> static T *allocate(const size_t size) {
#ifndef RAJA_DEVICE_ACTIVE
return static_cast<T *>(std::malloc(sizeof(T) * size));
auto &rm = umpire::ResourceManager::getInstance();
#ifndef RAJA_TARGET_GPU
auto alloc = rm.getAllocator("HOST");
#else
T *ptr;
cudaMallocManaged((void **)&ptr, sizeof(T) * size, cudaMemAttachGlobal);
return ptr;
#ifdef BUDE_MANAGED_ALLOC
auto alloc = rm.getAllocator("UM");
#else
auto alloc = rm.getAllocator("DEVICE");
#endif
#endif
return static_cast<T *>(alloc.allocate(sizeof(T) * size));
}

template <typename T> static void deallocate(T *ptr) {
#ifndef RAJA_DEVICE_ACTIVE
std::free(ptr);
#else
cudaFree(ptr);
#endif
auto &rm = umpire::ResourceManager::getInstance();
rm.getAllocator(ptr).deallocate(ptr);
}

static void synchronise() {
// nothing to do for host devices
#if defined(RAJA_ENABLE_CUDA)
cudaDeviceSynchronize();
RAJA::synchronize<RAJA::cuda_synchronize>();
#endif
#if defined(RAJA_ENABLE_HIP)
hipDeviceSynchronize();
RAJA::synchronize<RAJA::hip_synchronize>();
#endif
#if defined(RAJA_ENABLE_SYCL)
RAJA::synchronize<RAJA::sycl_synchronize>();
#endif
}

@@ -267,25 +287,26 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
[[nodiscard]] std::string name() { return "raja"; };

[[nodiscard]] std::vector<Device> enumerateDevices() override {
std::vector<Device> devices{{RAJA::expt::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_DEVICE_ACTIVE)
std::vector<Device> devices{{(size_t) RAJA::ExecPlace::HOST, "RAJA Host device"}};
#if defined(RAJA_TARGET_GPU)
#if defined(RAJA_ENABLE_CUDA)
const auto deviceName = "RAJA CUDA device";
#endif
#if defined(RAJA_ENABLE_HIP)
const auto deviceName = "Raja HIP device";
const auto deviceName = "RAJA HIP device";
#endif
#if defined(RAJA_ENABLE_SYCL)
const auto deviceName = "Raja SYCL device";
const auto deviceName = "RAJA SYCL device";
#endif
devices.template emplace_back(RAJA::expt::ExecPlace::DEVICE, deviceName);
devices.template emplace_back((size_t) RAJA::ExecPlace::DEVICE, deviceName);
#endif
return devices;
};

[[nodiscard]] Sample fasten(const Params &p, size_t wgsize, size_t device) const override {

Sample sample(PPWI, wgsize, p.nposes());
auto &rm = umpire::ResourceManager::getInstance();

auto contextStart = now();

@@ -302,6 +323,8 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

synchronise();

auto host_energies = registerAllocation(sample.energies);

auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

@@ -315,7 +338,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

std::copy(results, results + p.nposes(), sample.energies.begin());
rm.copy(host_energies, results);

deallocate(protein);
deallocate(ligand);
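For orientation, the sketch below (not part of this PR) strings the new pieces together: a host std::vector is registered with Umpire's ResourceManager so rm.copy can resolve its size, a buffer is taken from a named allocator, a kernel is launched through the post-expt RAJA::launch / RAJA::loop API, and the result is copied back. The allocator name ("HOST"; "UM" and "DEVICE" only exist in GPU builds), the toy kernel, and the sequential policies are illustrative assumptions rather than code from fasten.hpp.

// Illustrative sketch only (assumptions noted above); mirrors the Umpire + RAJA
// pattern used in the updated fasten.hpp but is not code from this PR.
#include <vector>

#include "RAJA/RAJA.hpp"
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

// Make an existing host vector visible to Umpire so ResourceManager::copy works on it.
template <typename T> void *registerHost(const std::vector<T> &xs) {
  auto &rm = umpire::ResourceManager::getInstance();
  auto strategy = rm.getAllocator("HOST").getAllocationStrategy();
  auto ptr = const_cast<void *>(static_cast<const void *>(xs.data()));
  umpire::util::AllocationRecord record{ptr, sizeof(T) * xs.size(), strategy};
  rm.registerAllocation(ptr, record);
  return ptr;
}

int main() {
  auto &rm = umpire::ResourceManager::getInstance();

  std::vector<float> host(1024, 1.0f);
  void *hostPtr = registerHost(host);
  const int n = static_cast<int>(host.size());

  // "HOST" is always available; GPU builds would pick "DEVICE" or "UM" instead.
  auto alloc = rm.getAllocator("HOST");
  auto *data = static_cast<float *>(alloc.allocate(sizeof(float) * host.size()));
  rm.copy(data, hostPtr); // size comes from the registered allocation records

  // Post-expt spelling: RAJA::launch / LaunchParams / loop replace the RAJA::expt:: forms.
  using launch_policy = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
  using loop_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
  RAJA::launch<launch_policy>(
      RAJA::LaunchParams(RAJA::Teams(1), RAJA::Threads(1)),
      [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
        RAJA::loop<loop_policy>(ctx, RAJA::RangeSegment(0, n),
                                [&](int i) { data[i] *= 2.0f; });
      });

  rm.copy(hostPtr, data); // results land back in the registered vector
  alloc.deallocate(data);
  return 0;
}

Device builds follow the same shape, only with the cuda/hip/sycl launch policies and a device-side allocator, which is exactly the switch the diff above makes.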
115 changes: 31 additions & 84 deletions src/raja/model.cmake
@@ -1,93 +1,40 @@
register_flag_optional(RAJA_BACK_END "Specify whether we target CPU/CUDA/HIP/SYCL" "CPU")

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection and RAJA.
See https://raja.readthedocs.io/en/main/getting_started.html#build-and-install"
"c++")

register_flag_required(RAJA_IN_TREE
"Absolute path to the *source* distribution directory of RAJA.
Make sure to use the release version of RAJA or clone RAJA recursively with submodules.
Remember to append RAJA specific flags as well, for example:

-DRAJA_IN_TREE=... -DENABLE_OPENMP=ON -DENABLE_CUDA=ON ...

See https://raja.readthedocs.io/en/v0.14.0/sphinx/user_guide/config_options.html#available-raja-options-and-defaults for all available options
")

#register_flag_optional(TARGET
# "Target offload device, implemented values are CPU, NVIDIA, HIP"
# CPU)

register_flag_optional(CUDA_TOOLKIT_ROOT_DIR
"[ENABLE_CUDA=ON only] Path to the CUDA toolkit directory (e.g `/opt/cuda-11.2`) if the ENABLE_CUDA flag is specified for RAJA" "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
register_flag_optional(CUDA_ARCH
"[ENABLE_CUDA=ON only] Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc"
"")

register_flag_optional(CUDA_EXTRA_FLAGS
"[ENABLE_CUDA=ON only] Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`"
"")

# compiler vendor and arch specific flags
set(RAJA_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)"
"OFF")

macro(setup)
if (POLICY CMP0104)
cmake_policy(SET CMP0104 OLD)
endif ()

set(CMAKE_CXX_STANDARD 17)

find_package(RAJA REQUIRED)
find_package(umpire REQUIRED)

if (EXISTS "${RAJA_IN_TREE}")

message(STATUS "Building using in-tree RAJA source at `${RAJA_IN_TREE}`")

set(CMAKE_CUDA_STANDARD 17)

# don't build anything that isn't the RAJA library itself, by default their cmake def builds everything, whyyy?
set(RAJA_ENABLE_TESTS OFF CACHE BOOL "")
set(RAJA_ENABLE_EXAMPLES OFF CACHE BOOL "")
set(RAJA_ENABLE_EXERCISES OFF CACHE BOOL "")
set(RAJA_ENABLE_BENCHMARKS OFF CACHE BOOL "")
set(ENABLE_REPRODUCERS OFF CACHE BOOL "")
set(ENABLE_DOCUMENTATION OFF CACHE BOOL "")

if (ENABLE_CUDA)

set(ENABLE_CUDA ON CACHE BOOL "")

# XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes
if(POLICY CMP0104)
set(CMAKE_POLICY_DEFAULT_CMP0104 OLD) # so that it propagates to RAJA's CMakeLists as well
cmake_policy(SET CMP0104 OLD)
endif()

# RAJA needs all the cuda stuff setup before including!
set(CMAKE_CUDA_COMPILER ${CUDA_TOOLKIT_ROOT_DIR}/bin/nvcc)
set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-forward-unknown-to-host-compiler -extended-lambda -arch=${CUDA_ARCH}" ${CUDA_EXTRA_FLAGS})

message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS}")
endif ()


add_subdirectory(${RAJA_IN_TREE} ${CMAKE_BINARY_DIR}/raja)
register_link_library(RAJA)
# RAJA's cmake screws with where the binary will end up, resetting it here:
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
else ()
message(FATAL_ERROR "`${RAJA_IN_TREE}` does not exist")
endif ()

if (ENABLE_CUDA)
# RAJA needs the codebase to be compiled with nvcc, so we tell cmake to treat sources as *.cu
register_link_library(RAJA umpire)
if (${RAJA_BACK_END} STREQUAL "CUDA")
enable_language(CUDA)
set_source_files_properties(src/main.cpp PROPERTIES LANGUAGE CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -extended-lambda --expt-relaxed-constexpr --restrict --keep")

set_source_files_properties(${IMPL_SOURCES} PROPERTIES LANGUAGE CUDA)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "HIP")
# Set CMAKE_CXX_COMPILER to hipcc
find_package(hip REQUIRED)
register_definitions(RAJA_TARGET_GPU)
elseif (${RAJA_BACK_END} STREQUAL "SYCL")
register_definitions(RAJA_TARGET_GPU)
else()
register_definitions(RAJA_TARGET_CPU)
message(STATUS "Falling Back to CPU")
endif ()

if (MANAGED_ALLOC)
register_definitions(BUDE_MANAGED_ALLOC)
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
RAJA_FLAGS_CPU
${CMAKE_CXX_COMPILER_ID}
${CMAKE_SYSTEM_PROCESSOR}
)

endmacro()
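As a usage note rather than part of the change: with the model now resolving RAJA and Umpire through find_package, a configure line might look like the following. The -DMODEL=raja selector and CMAKE_CUDA_ARCHITECTURES value are assumptions about the surrounding build harness, and the install prefixes are placeholders.

cmake -Bbuild -H. \
  -DMODEL=raja \
  -DRAJA_BACK_END=CUDA \
  -DMANAGED_ALLOC=ON \
  -DCMAKE_PREFIX_PATH="/path/to/raja/install;/path/to/umpire/install" \
  -DCMAKE_CUDA_ARCHITECTURES=70
cmake --build build -j

Leaving RAJA_BACK_END at its CPU default skips the GPU definitions entirely, and MANAGED_ALLOC only changes which Umpire allocator ("UM" vs "DEVICE") backs the buffers.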
