Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SYCL USM Implementation #6

Merged
merged 4 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ register_model(std-ranges STD_RANGES fasten.hpp) # TODO
register_model(hip HIP fasten.hpp)
register_model(cuda CUDA fasten.hpp)
register_model(kokkos KOKKOS fasten.hpp)
register_model(sycl SYCL fasten.hpp)
register_model(sycl-acc SYCL_ACC fasten.hpp)
register_model(sycl-usm SYCL_USM fasten.hpp)
register_model(acc ACC fasten.hpp)
# defining RAJA collides with the RAJA namespace so USE_RAJA
register_model(raja USE_RAJA fasten.hpp)
Expand Down Expand Up @@ -290,4 +291,4 @@ endif ()

set_target_properties(${EXE_NAME} PROPERTIES OUTPUT_NAME "${BIN_NAME}")

install(TARGETS ${EXE_NAME} DESTINATION bin)
install(TARGETS ${EXE_NAME} DESTINATION bin)
6 changes: 4 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,10 @@
#include "kokkos/fasten.hpp"
#elif defined(ACC)
#include "acc/fasten.hpp"
#elif defined(SYCL)
#include "sycl/fasten.hpp"
#elif defined(SYCL_ACC)
#include "sycl-acc/fasten.hpp"
#elif defined(SYCL_USM)
#include "sycl-usm/fasten.hpp"
#elif defined(OMP)
#include "omp/fasten.hpp"
#elif defined(THRUST)
Expand Down
4 changes: 2 additions & 2 deletions src/sycl/fasten.hpp → src/sycl-acc/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ using namespace cl;
#ifdef IMPL_CLS
#error IMPL_CLS was already defined
#endif
#define IMPL_CLS SyclBude
#define IMPL_CLS SyclACCBude

template <size_t N> class bude_kernel_ndrange;

Expand Down Expand Up @@ -234,4 +234,4 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

return sample;
};
};
};
File renamed without changes.
239 changes: 239 additions & 0 deletions src/sycl-usm/fasten.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
#pragma once

#include "../bude.h"
#include <sycl/sycl.hpp>
#include <cstdint>
#include <iostream>
#include <string>

#ifdef IMPL_CLS
#error IMPL_CLS was already defined
#endif
#define IMPL_CLS SyclUSMBude

template <size_t N> class bude_kernel_ndrange;

template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {


static void fasten_main(sycl::handler &h,
size_t wgsize, size_t ntypes, size_t nposes,
size_t natlig, size_t natpro,
const Atom *proteins,
const Atom *ligands,
const FFParams *forcefields,
const float *transforms_0, const float *transforms_1,
const float *transforms_2, const float *transforms_3,
const float *transforms_4, const float *transforms_5,
float *energies) {
size_t global = std::ceil(double(nposes) / PPWI);
global = wgsize * size_t(std::ceil(double(global) / double(wgsize)));

sycl::local_accessor<FFParams> local_forcefield(sycl::range<1>(ntypes), h);

h.parallel_for<bude_kernel_ndrange<PPWI>>(sycl::nd_range<1>(global, wgsize), [=] (sycl::nd_item<1> item) {
const size_t lid = item.get_local_id(0);
const size_t gid = item.get_group(0);
const size_t lrange = item.get_local_range(0);

float etot[PPWI];
sycl::float4 transform[PPWI][3];

size_t ix = gid * lrange * PPWI + lid;
ix = ix < nposes ? ix : nposes - PPWI;

sycl::device_event event = item.async_work_group_copy(
sycl::local_ptr<FFParams>(local_forcefield.get_pointer()),
sycl::global_ptr<FFParams>(std::remove_const_t<FFParams*>(forcefields)),
ntypes);

const size_t lsz = lrange;
for (size_t i = 0; i < PPWI; i++) {
size_t index = ix + i * lsz;

const float sx = sycl::sin(transforms_0[index]);
const float cx = sycl::cos(transforms_0[index]);
const float sy = sycl::sin(transforms_1[index]);
const float cy = sycl::cos(transforms_1[index]);
const float sz = sycl::sin(transforms_2[index]);
const float cz = sycl::cos(transforms_2[index]);

transform[i][0].x() = cy * cz;
transform[i][0].y() = sx * sy * cz - cx * sz;
transform[i][0].z() = cx * sy * cz + sx * sz;
transform[i][0].w() = transforms_3[index];
transform[i][1].x() = cy * sz;
transform[i][1].y() = sx * sy * sz + cx * cz;
transform[i][1].z() = cx * sy * sz - sx * cz;
transform[i][1].w() = transforms_4[index];
transform[i][2].x() = -sy;
transform[i][2].y() = sx * cy;
transform[i][2].z() = cx * cy;
transform[i][2].w() = transforms_5[index];

etot[i] = ZERO;
}

item.wait_for(event);

// Loop over ligand atoms
for (size_t il = 0; il < natlig; il++) {
// Load ligand atom data
const Atom l_atom = ligands[il];
const FFParams l_params = local_forcefield[l_atom.type];
const bool lhphb_ltz = l_params.hphb < ZERO;
const bool lhphb_gtz = l_params.hphb > ZERO;

const sycl::float4 linitpos(l_atom.x, l_atom.y, l_atom.z, ONE);
sycl::float3 lpos[PPWI];
for (size_t i = 0; i < PPWI; i++) {
// Transform ligand atom
lpos[i].x() = transform[i][0].w() + linitpos.x() * transform[i][0].x() + linitpos.y() * transform[i][0].y() +
linitpos.z() * transform[i][0].z();
lpos[i].y() = transform[i][1].w() + linitpos.x() * transform[i][1].x() + linitpos.y() * transform[i][1].y() +
linitpos.z() * transform[i][1].z();
lpos[i].z() = transform[i][2].w() + linitpos.x() * transform[i][2].x() + linitpos.y() * transform[i][2].y() +
linitpos.z() * transform[i][2].z();
}

// Loop over protein atoms
for (size_t ip = 0; ip < natpro; ip++) {
// Load protein atom data
const Atom p_atom = proteins[ip];
const FFParams p_params = local_forcefield[p_atom.type];

const float radij = p_params.radius + l_params.radius;
const float r_radij = ONE / (radij);

const float elcdst = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F) ? FOUR : TWO;
const float elcdst1 = (p_params.hbtype == HBTYPE_F && l_params.hbtype == HBTYPE_F) ? QUARTER : HALF;
const bool type_E = ((p_params.hbtype == HBTYPE_E || l_params.hbtype == HBTYPE_E));

const bool phphb_ltz = p_params.hphb < ZERO;
const bool phphb_gtz = p_params.hphb > ZERO;
const bool phphb_nz = p_params.hphb != ZERO;
const float p_hphb = p_params.hphb * (phphb_ltz && lhphb_gtz ? -ONE : ONE);
const float l_hphb = l_params.hphb * (phphb_gtz && lhphb_ltz ? -ONE : ONE);
const float distdslv = (phphb_ltz ? (lhphb_ltz ? NPNPDIST : NPPDIST) : (lhphb_ltz ? NPPDIST : -FloatMax));
const float r_distdslv = ONE / (distdslv);

const float chrg_init = l_params.elsc * p_params.elsc;
const float dslv_init = p_hphb + l_hphb;

for (size_t i = 0; i < PPWI; i++) {
// Calculate distance between atoms
const float x = lpos[i].x() - p_atom.x;
const float y = lpos[i].y() - p_atom.y;
const float z = lpos[i].z() - p_atom.z;

// XXX as of oneapi-2021.1-beta10, the sycl::native::sqrt variant is significantly slower for no apparent
// reason
const float distij = sycl::sqrt(x * x + y * y + z * z);

// XXX as of oneapi-2021.1-beta10, the following variant is significantly slower for no apparent reason
// const float distij = sycl::distance(lpos[i], sycl::float3(p_atom.x, p_atom.y, p_atom.z));

// Calculate the sum of the sphere radii
const float distbb = distij - radij;
const bool zone1 = (distbb < ZERO);

// Calculate steric energy
etot[i] += (ONE - (distij * r_radij)) * (zone1 ? TWO * HARDNESS : ZERO);

// Calculate formal and dipole charge interactions
float chrg_e = chrg_init * ((zone1 ? ONE : (ONE - distbb * elcdst1)) * (distbb < elcdst ? ONE : ZERO));
const float neg_chrg_e = -sycl::fabs(chrg_e);
chrg_e = type_E ? neg_chrg_e : chrg_e;
etot[i] += chrg_e * CNSTNT;

// Calculate the two cases for Nonpolar-Polar repulsive interactions
const float coeff = (ONE - (distbb * r_distdslv));
float dslv_e = dslv_init * ((distbb < distdslv && phphb_nz) ? ONE : ZERO);
dslv_e *= (zone1 ? ONE : coeff);
etot[i] += dslv_e;
}
};
}

// Write results
const size_t td_base = gid * lrange * PPWI + lid;

if (td_base < nposes) {
for (size_t i = 0; i < PPWI; i++) {
energies[td_base + i * lrange] = etot[i] * HALF;
}
}
});
}

std::vector<sycl::device> devices;

public:
IMPL_CLS() : devices(sycl::device::get_devices()) {}

[[nodiscard]] std::string name() override { return "sycl"; };

[[nodiscard]] std::vector<Device> enumerateDevices() override {
std::vector<Device> xs;
for (size_t i = 0; i < devices.size(); i++)
xs.emplace_back(i, devices[i].template get_info<sycl::info::device::name>());
return xs;
};


template<typename T>[[nodiscard]] static T *allocate(size_t size, sycl::queue &queue) {
#ifdef BUDE_MANAGED_ALLOC
T *data = sycl::malloc_shared<T>(size, queue);
#else
T *data = sycl::malloc_device<T>(size, queue);
#endif
return data;
}

template<typename T>[[nodiscard]] static T *allocate(const std::vector<T> &xs, sycl::queue &queue) {
T *data = allocate<T>(std::size(xs), queue);
queue.memcpy(data, std::data(xs), sizeof(T) * std::size(xs));
return data;
}

[[nodiscard]] Sample fasten(const Params &p, size_t wgsize, size_t deviceIdx) const override {
auto device = devices[deviceIdx];

Sample sample(PPWI, wgsize, p.nposes());
sycl::queue queue(device);

auto contextStart = now();
auto proteins = allocate(p.protein, queue);
auto ligands = allocate(p.ligand, queue);
auto transforms_0 = allocate(p.poses[0], queue);
auto transforms_1 = allocate(p.poses[1], queue);
auto transforms_2 = allocate(p.poses[2], queue);
auto transforms_3 = allocate(p.poses[3], queue);
auto transforms_4 = allocate(p.poses[4], queue);
auto transforms_5 = allocate(p.poses[5], queue);
auto forcefields = allocate(p.forcefield, queue);
auto energies = allocate<float>(std::size(sample.energies), queue);
queue.wait_and_throw();
auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
auto kernelStart = now();
queue.submit([&](sycl::handler &h) {
fasten_main(h, wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(),
proteins, ligands, forcefields,
transforms_0, transforms_1, transforms_2,
transforms_3, transforms_4, transforms_5,
energies);
});
queue.wait_and_throw();
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

queue.memcpy(std::data(sample.energies), energies, sizeof(float) * std::size(sample.energies));
queue.wait_and_throw();

return sample;
};
};
106 changes: 106 additions & 0 deletions src/sycl-usm/model.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@

register_flag_optional(CMAKE_CXX_COMPILER
"Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler"
"c++")

register_flag_required(SYCL_COMPILER
"Compile using the specified SYCL compiler implementation
Supported values are
ONEAPI-ICPX - icpx as a standalone compiler
ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`)
DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm)
HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)
COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)")

register_flag_optional(SYCL_COMPILER_DIR
"Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`:
ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first)
ONEAPI-Clang - set to the directory that contains the Intel clang++ binary.
HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
"")

register_flag_optional(OpenCL_LIBRARY
"[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so"
"${OpenCL_LIBRARY}")

register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)"
"OFF")

macro(setup)
set(CMAKE_CXX_STANDARD 17)


if (${SYCL_COMPILER} STREQUAL "HIPSYCL")


set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL)

if (NOT EXISTS "${hipSYCL_DIR}")
message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure")
set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake)
endif ()
if (NOT EXISTS "${hipSYCL_DIR}")
message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL")
endif ()

# register_definitions(_GLIBCXX_USE_CXX11_ABI=0)
find_package(hipSYCL CONFIG REQUIRED)
message(STATUS "ok")

elseif (${SYCL_COMPILER} STREQUAL "ACPP")
find_package(AdaptiveCpp CONFIG REQUIRED)
elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP")

list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
set(ComputeCpp_DIR ${SYCL_COMPILER_DIR})

# don't point to the CL dir as the imports already have the CL prefix
set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}")

register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0)
# ComputeCpp needs OpenCL
find_package(ComputeCpp REQUIRED)

# this must come after FindComputeCpp (!)
set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop)

elseif (${SYCL_COMPILER} STREQUAL "DPCPP")
set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++)
include_directories(${SYCL_COMPILER_DIR}/include/sycl)
register_append_cxx_flags(ANY -fsycl)
register_append_link_flags(-fsycl)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX")
set(CMAKE_CXX_COMPILER icpx)
set(CMAKE_C_COMPILER icx)
register_append_cxx_flags(ANY -fsycl)
register_append_link_flags(-fsycl)
elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang")
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_C_COMPILER clang)
register_append_cxx_flags(ANY -fsycl)
register_append_link_flags(-fsycl)
else ()
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
endif ()

if (MANAGED_ALLOC)
register_definitions(BUDE_MANAGED_ALLOC)
endif ()

endmacro()


macro(setup_target NAME)
if (
(${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR
(${SYCL_COMPILER} STREQUAL "HIPSYCL") OR
(${SYCL_COMPILER} STREQUAL "ACPP"))

# so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their
# own custom integration header flags AFTER the target has been specified
# hence this macro here
add_sycl_to_target(
TARGET ${NAME}
SOURCES ${IMPL_SOURCES})
endif ()
endmacro()