Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new timers #2

Merged
merged 14 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 62 additions & 63 deletions src/acc/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,78 +158,77 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();

std::array<float *, 6> poses{};
auto protein = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natpro()));
auto ligand = static_cast<Atom *>(std::malloc(sizeof(Atom) * p.natlig()));
auto forcefield = static_cast<FFParams *>(std::malloc(sizeof(FFParams) * p.ntypes()));
auto energies = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));

for (size_t i = 0; i < 6; i++) {
poses[i] = static_cast<float *>(std::malloc(sizeof(float) * p.nposes()));
std::copy(p.poses[i].begin(), p.poses[i].end(), poses[i]);
}

std::copy(p.protein.begin(), p.protein.end(), protein);
std::copy(p.ligand.begin(), p.ligand.end(), ligand);
std::copy(p.forcefield.begin(), p.forcefield.end(), forcefield);
std::fill(energies, energies + p.nposes(), 0.0);

auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

auto poses_0 = poses[0];
auto poses_1 = poses[1];
auto poses_2 = poses[2];
auto poses_3 = poses[3];
auto poses_4 = poses[4];
auto poses_5 = poses[5];
const auto ntypes = p.ntypes();
const auto nposes = p.nposes();
const auto natlig = p.natlig();
const auto natpro = p.natpro();

// clang-format off
#pragma acc data \
copyin( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes]) \
copyout(energies[:nposes])
{
auto poses = p.poses.data();
auto protein = p.protein.data();
auto ligand = p.ligand.data();
auto forcefield = p.forcefield.data();
auto energies = static_cast<float *>(std::calloc(nposes, sizeof(float)));

auto poses_0 = poses[0].data();
auto poses_1 = poses[1].data();
auto poses_2 = poses[2].data();
auto poses_3 = poses[3].data();
auto poses_4 = poses[4].data();
auto poses_5 = poses[5].data();

// clang-format off
auto hostToDeviceStart = now();

#pragma acc enter data \
copyin( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes]) \
create(energies[:nposes])

auto hostToDeviceEnd = now();
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

// clang-format on
for (size_t i = 0; i < p.totalIterations(); ++i) {
auto kernelStart = now();

// clang-format off
#pragma acc parallel loop \
present( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes], energies[:nposes])
// clang-format on
for (size_t i = 0; i < p.totalIterations(); ++i) {
auto kernelStart = now();

// clang-format off
#pragma acc parallel loop \
present( \
protein[:natpro], ligand[:natlig], \
forcefield[:ntypes], poses_0[:nposes], \
poses_1[:nposes], poses_2[:nposes], \
poses_3[:nposes], poses_4[:nposes], \
poses_5[:nposes], energies[:nposes]) wait
// clang-format on
for (size_t group = 0; group < (nposes / PPWI); group++) {
fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro, //
protein, ligand, //
poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
forcefield, energies);
}
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
for (size_t group = 0; group < (nposes / PPWI); group++) {
fasten_main<PPWI>(group, ntypes, nposes, natlig, natpro, //
protein, ligand, //
poses_0, poses_1, poses_2, poses_3, poses_4, poses_5, //
forcefield, energies);
}
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();

#pragma acc update \
host(energies[:nposes])

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

#pragma acc exit data \
delete(protein, ligand, forcefield, \
poses_0, poses_1, poses_2, \
poses_3, poses_4, poses_5, \
energies)

std::copy(energies, energies + p.nposes(), sample.energies.begin());
std::free(protein);
std::free(ligand);
std::free(forcefield);
std::free(energies);
for (auto &pose : poses)
std::free(pose);

return sample;
};
Expand Down
5 changes: 3 additions & 2 deletions src/bude.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,10 @@ struct Sample {
size_t ppwi, wgsize;
std::vector<float> energies;
std::vector<std::pair<TimePoint, TimePoint>> kernelTimes;
std::optional<std::pair<TimePoint, TimePoint>> contextTime;
std::optional<std::pair<TimePoint, TimePoint>> hostToDevice;
std::optional<std::pair<TimePoint, TimePoint>> deviceToHost;
Sample(size_t ppwi, size_t wgsize, size_t nposes)
: ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), contextTime() {}
: ppwi(ppwi), wgsize(wgsize), energies(nposes), kernelTimes(), hostToDevice(), deviceToHost() {}
};

using Device = std::pair<size_t, std::string>;
Expand Down
11 changes: 8 additions & 3 deletions src/cuda/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();
auto hostToDeviceStart = now();
auto protein = allocate(p.protein);
auto ligand = allocate(p.ligand);
auto transforms_0 = allocate(p.poses[0]);
Expand All @@ -265,9 +265,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
auto forcefield = allocate(p.forcefield);
auto results = allocate<float>(sample.energies.size());
checkError(cudaDeviceSynchronize());
auto contextEnd = now();
auto hostToDeviceEnd = now();

sample.contextTime = {contextStart, contextEnd};
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

size_t global = std::ceil(double(p.nposes()) / PPWI);
global = std::ceil(double(global) / double(wgsize));
Expand All @@ -285,8 +285,13 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();

checkError(
cudaMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), cudaMemcpyDeviceToHost));

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

free(protein);
free(ligand);
Expand Down
10 changes: 7 additions & 3 deletions src/hip/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();
auto hostToDeviceStart = now();
auto protein = allocate(p.protein);
auto ligand = allocate(p.ligand);
auto transforms_0 = allocate(p.poses[0]);
Expand All @@ -227,9 +227,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
auto forcefield = allocate(p.forcefield);
auto results = allocate<float>(sample.energies.size());
checkError(hipDeviceSynchronize());
auto contextEnd = now();
auto hostToDeviceEnd = now();

sample.contextTime = {contextStart, contextEnd};
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

size_t global = std::ceil(double(p.nposes()) / PPWI);
global = std::ceil(double(global) / double(wgsize));
Expand All @@ -247,8 +247,12 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();
checkError(
hipMemcpy(sample.energies.data(), results, sample.energies.size() * sizeof(float), hipMemcpyDeviceToHost));
auto deviceToHostEnd = now();

sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

free(protein);
free(ligand);
Expand Down
76 changes: 44 additions & 32 deletions src/kokkos/fasten.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,10 +172,9 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
}

template <typename T> static Kokkos::View<T *> mkView(const std::string &name, const std::vector<T> &xs) {
Kokkos::View<T *> view(name, xs.size());
auto mirror = Kokkos::create_mirror_view(view);
for (size_t i = 0; i < xs.size(); i++)
mirror[i] = xs[i];
Kokkos::View<const T *, Kokkos::LayoutLeft, Kokkos::HostSpace,
Kokkos::MemoryTraits<Kokkos::Unmanaged>> mirror (std::data(xs), std::size(xs));
Kokkos::View<T *> view (name, std::size(xs));
Kokkos::deep_copy(view, mirror);
return view;
}
Expand All @@ -195,39 +194,52 @@ template <size_t PPWI> class IMPL_CLS final : public Bude<PPWI> {
if (!Kokkos::is_initialized()) {
Kokkos::initialize();
}

Sample sample(PPWI, wgsize, p.nposes());

auto contextStart = now();

auto protein = mkView("protein", p.protein);
auto ligand = mkView("ligand", p.ligand);
auto transforms_0 = mkView("transforms_0", p.poses[0]);
auto transforms_1 = mkView("transforms_1", p.poses[1]);
auto transforms_2 = mkView("transforms_2", p.poses[2]);
auto transforms_3 = mkView("transforms_3", p.poses[3]);
auto transforms_4 = mkView("transforms_4", p.poses[4]);
auto transforms_5 = mkView("transforms_5", p.poses[5]);
auto forcefield = mkView("forcefield", p.forcefield);
Kokkos::View<float *> results("results", sample.energies.size());
Kokkos::fence();
auto contextEnd = now();
sample.contextTime = {contextStart, contextEnd};

for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
auto kernelStart = now();
fasten_main(wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), //
protein, ligand, forcefield, //
transforms_0, transforms_1, transforms_2, transforms_3, transforms_4, transforms_5, results);
{
auto hostToDeviceStart = now();

auto protein = mkView("protein", p.protein);
auto ligand = mkView("ligand", p.ligand);
auto transforms_0 = mkView("transforms_0", p.poses[0]);
auto transforms_1 = mkView("transforms_1", p.poses[1]);
auto transforms_2 = mkView("transforms_2", p.poses[2]);
auto transforms_3 = mkView("transforms_3", p.poses[3]);
auto transforms_4 = mkView("transforms_4", p.poses[4]);
auto transforms_5 = mkView("transforms_5", p.poses[5]);
auto forcefield = mkView("forcefield", p.forcefield);
Kokkos::View<float *> results(Kokkos::ViewAllocateWithoutInitializing("results"), sample.energies.size());
Kokkos::fence();
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);

auto hostToDeviceEnd = now();
sample.hostToDevice = {hostToDeviceStart, hostToDeviceEnd};

for (size_t i = 0; i < p.iterations + p.warmupIterations; ++i) {
auto kernelStart = now();
fasten_main(wgsize, p.ntypes(), p.nposes(), p.natlig(), p.natpro(), //
protein, ligand, forcefield, //
transforms_0, transforms_1, transforms_2, transforms_3, transforms_4, transforms_5, results);
Kokkos::fence();
auto kernelEnd = now();
sample.kernelTimes.emplace_back(kernelStart, kernelEnd);
}

auto deviceToHostStart = now();

auto result_mirror = Kokkos::create_mirror_view(results);
Kokkos::deep_copy(result_mirror, results);

auto deviceToHostEnd = now();
sample.deviceToHost = {deviceToHostStart, deviceToHostEnd};

for (size_t i = 0; i < results.size(); i++) {
sample.energies[i] = result_mirror[i];
}
}

auto result_mirror = Kokkos::create_mirror_view(results);
Kokkos::deep_copy(result_mirror, results);
for (size_t i = 0; i < results.size(); i++) {
sample.energies[i] = result_mirror[i];
if (!Kokkos::is_finalized()) {
Kokkos::finalize();
}

return sample;
Expand Down
21 changes: 15 additions & 6 deletions src/kokkos/model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ register_flag_optional(CMAKE_CXX_COMPILER
See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are"
"c++")

register_flag_required(KOKKOS_IN_TREE
register_flag_optional(KOKKOS_IN_TREE
"Absolute path to the *source* distribution directory of Kokkos.
Remember to append Kokkos specific flags as well, for example:

-DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ...
See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "")

See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options")
register_flag_optional(KOKKOS_IN_PACKAGE
"Absolute path to package R-Path containing Kokkos libs.
Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." "")

# compiler vendor and arch specific flags
set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always)
Expand All @@ -19,15 +21,22 @@ macro(setup)

cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)

message(STATUS "Building using in-tree Kokkos source at `${KOKKOS_IN_TREE}`")

if (EXISTS "${KOKKOS_IN_TREE}")
if (KOKKOS_IN_TREE)
add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos)
register_link_library(Kokkos::kokkos)
elseif (KOKKOS_IN_PACKAGE)
message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`")
find_package(Kokkos REQUIRED)
register_link_library(Kokkos::kokkos)
else ()
message(FATAL_ERROR "`${KOKKOS_IN_TREE}` does not exist")
message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!")
endif ()

register_append_compiler_and_arch_specific_cxx_flags(
KOKKOS_FLAGS_CPU
${CMAKE_CXX_COMPILER_ID}
Expand Down
Loading
Loading