From 9ae49179837e4c6427c7852b5632150806cebb2a Mon Sep 17 00:00:00 2001 From: Christian von Elm Date: Mon, 12 Feb 2024 16:40:44 +0100 Subject: [PATCH] feat(nvidia): Add cupti support This commit adds a --nvidia option, which injects a library into the program under measurement, which records entry and exit into CUDA kernels via CUPTI --- CMakeLists.txt | 54 ++++ include/lo2s/build_config.hpp.in | 4 + include/lo2s/config.hpp | 4 + include/lo2s/cupti/events.hpp | 50 ++++ include/lo2s/cupti/reader.hpp | 98 +++++++ include/lo2s/measurement_scope.hpp | 8 + .../lo2s/monitor/abstract_process_monitor.hpp | 2 +- include/lo2s/monitor/process_monitor.hpp | 3 +- include/lo2s/monitor/scope_monitor.hpp | 7 +- .../lo2s/monitor/system_process_monitor.hpp | 4 +- include/lo2s/ringbuf.hpp | 245 ++++++++++++++++++ include/lo2s/shared_memory.hpp | 2 + include/lo2s/trace/reg_keys.hpp | 4 +- include/lo2s/trace/trace.hpp | 5 + include/lo2s/util.hpp | 2 + man/lo2s.1.pod | 19 ++ src/config.cpp | 68 ++++- src/cupti/lib.cpp | 220 ++++++++++++++++ src/monitor/poll_monitor.cpp | 16 +- src/monitor/process_monitor.cpp | 11 +- src/monitor/process_monitor_main.cpp | 47 +++- src/monitor/scope_monitor.cpp | 18 +- src/monitor/system_process_monitor.cpp | 3 +- src/perf/counter/userspace/reader.cpp | 19 +- src/trace/trace.cpp | 25 +- src/util.cpp | 24 ++ 26 files changed, 898 insertions(+), 64 deletions(-) create mode 100644 include/lo2s/cupti/events.hpp create mode 100644 include/lo2s/cupti/reader.hpp create mode 100644 include/lo2s/ringbuf.hpp create mode 100644 src/cupti/lib.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index cba44f13..8c3f91f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,6 +34,7 @@ IfUpdatedUnsetAll(lo2s_USE_STATIC_LIBS Libpfm_USE_STATIC_LIBS X86Adapt_STATIC x86_energy_STATIC + CUDA_USE_STATIC_LIBS ) if(lo2s_USE_STATIC_LIBS STREQUAL "OFF") @@ -45,6 +46,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "OFF") set(x86_energy_STATIC OFF CACHE BOOL "") set(Sensors_USE_STATIC_LIBS OFF 
CACHE BOOL "") set(Libpfm_USE_STATIC_LIBS OFF CACHE BOOL "") + set(CUDA_USE_STATIC_LIBS OFF CACHE BOOL "") endif() if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY") @@ -56,6 +58,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY") set(x86_energy_STATIC ON CACHE BOOL "") set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "") set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "") + set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc") endif() @@ -68,6 +71,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "ALL") set(x86_energy_STATIC ON CACHE BOOL "") set(Sensors_USE_STATIC_LIBS ON CACHE BOOL "") set(Libpfm_USE_STATIC_LIBS ON CACHE BOOL "") + set(CUDA_USE_STATIC_LIBS ON CACHE BOOL "") # Doesn't seem to work with clang, even though it should, # but at least it doesn't complain about it either @@ -107,6 +111,7 @@ find_package(Sensors) find_package(Veosinfo) find_package(Libpfm) find_package(PkgConfig) +find_package(CUDAToolkit) if(PkgConfig_FOUND) pkg_check_modules(Audit audit) @@ -129,6 +134,8 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.") CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF) add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards.") +CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." 
ON "CUDAToolkit_FOUND" OFF) +add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.") # system configuration checks CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H) CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID) @@ -144,6 +151,13 @@ if(NOT CLOCK_GETTIME_FOUND) unset(CMAKE_REQUIRED_LIBRARIES) endif() +check_function_exists(shm_open SHM_OPEN_FOUND) +if(NOT SHM_OPEN_FOUND) + set(CMAKE_REQUIRED_LIBRARIES "rt") + check_function_exists(shm_open SHM_OPEN_FOUND_WITH_RT) + unset(CMAKE_REQUIRED_LIBRARIES) +endif() + CHECK_STRUCT_HAS_BITFIELD("struct perf_event_attr" context_switch linux/perf_event.h HAVE_PERF_RECORD_SWITCH) if(NOT HAVE_PERF_RECORD_SWITCH) @@ -226,6 +240,14 @@ if(NOT CLOCK_GETTIME_FOUND) endif() endif() +if(NOT SHM_OPEN_FOUND) + if(SHM_OPEN_FOUND_WITH_RT) + target_link_libraries(lo2s PRIVATE rt) + else() + message(SEND_ERROR "Could not find the function shm_open(), but it is required.") + endif() +endif() + # handle x86_adapt dependency if(X86Adapt_FOUND) target_sources(lo2s PRIVATE @@ -306,6 +328,38 @@ if (USE_LIBAUDIT) endif() endif() +set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblo2s_injection.so") +if(USE_CUPTI) + if(CUDAToolkit_FOUND) + add_library(lo2s_injection SHARED src/cupti/lib.cpp) + target_include_directories(lo2s_injection PRIVATE include + ${CMAKE_CURRENT_BINARY_DIR}/include) + + if (CUDA_USE_STATIC_LIBS) + target_link_libraries(lo2s_injection PRIVATE CUDA::cupti_static) + else() + target_link_libraries(lo2s_injection PRIVATE CUDA::cupti) + endif() + + target_link_libraries(lo2s_injection PRIVATE fmt::fmt + Nitro::log + Nitro::env + Nitro::dl + Nitro::options + otf2xx::Writer) + + if(SHM_OPEN_FOUND_WITH_RT) + target_link_libraries(lo2s_injection PRIVATE rt) + endif() + + target_compile_definitions(lo2s PUBLIC HAVE_CUDA) + install(TARGETS lo2s_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) + else() + 
message(SEND_ERROR "Cupti not found but requested.") + endif() +endif() + + # generate version string used in lo2s if(Git_FOUND) diff --git a/include/lo2s/build_config.hpp.in b/include/lo2s/build_config.hpp.in index 4186c657..b2e438df 100644 --- a/include/lo2s/build_config.hpp.in +++ b/include/lo2s/build_config.hpp.in @@ -40,3 +40,7 @@ #cmakedefine LO2S_COPYRIGHT_YEAR "@LO2S_COPYRIGHT_YEAR@" + +// The CUDA injection library installation path + +#cmakedefine LO2S_CUDA_INJECTIONLIB_PATH "@LO2S_CUDA_INJECTIONLIB_PATH@" diff --git a/include/lo2s/config.hpp b/include/lo2s/config.hpp index 76427bf8..53fd6f36 100644 --- a/include/lo2s/config.hpp +++ b/include/lo2s/config.hpp @@ -98,6 +98,10 @@ struct Config bool use_nec; std::chrono::microseconds nec_read_interval; std::chrono::milliseconds nec_check_interval; + // Nvidia CUPTI + bool use_nvidia; + std::string cuda_injectionlib_path; + uint64_t nvidia_ringbuf_size; }; const Config& config(); diff --git a/include/lo2s/cupti/events.hpp b/include/lo2s/cupti/events.hpp new file mode 100644 index 00000000..ba994458 --- /dev/null +++ b/include/lo2s/cupti/events.hpp @@ -0,0 +1,50 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2024, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . 
+ */ + +#pragma once + +#include + +namespace lo2s +{ +namespace cupti +{ +enum class EventType : uint64_t +{ + CUPTI_KERNEL = 1, +}; + +struct event_header +{ + EventType type; + uint64_t size; +}; + +struct event_kernel +{ + struct event_header header; + uint64_t start; + uint64_t end; + char name[1]; +}; + +} // namespace cupti +} // namespace lo2s diff --git a/include/lo2s/cupti/reader.hpp b/include/lo2s/cupti/reader.hpp new file mode 100644 index 00000000..5c474b67 --- /dev/null +++ b/include/lo2s/cupti/reader.hpp @@ -0,0 +1,98 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2016, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +extern "C" +{ +#include +#include +} + +namespace lo2s +{ +namespace cupti +{ + +class Reader +{ +public: + Reader(trace::Trace& trace, Process process) + : process_(process), trace_(trace), time_converter_(perf::time::Converter::instance()), + ringbuf_reader_("cupti", process.as_pid_t(), true, config().nvidia_ringbuf_size), + timer_fd_(timerfd_from_ns(config().userspace_read_interval)), + executable_name_(get_process_exe(process)) + { + } + + void read() + { + struct event_header* header = nullptr; + + while ((header = reinterpret_cast( + ringbuf_reader_.get(sizeof(struct event_header)))) != nullptr) + { + if (header->type == EventType::CUPTI_KERNEL) + { + struct event_kernel* kernel = + reinterpret_cast(ringbuf_reader_.get(header->size)); + + auto& writer = trace_.cuda_writer(Thread(process_.as_thread())); + + std::string kernel_name = kernel->name; + auto& cu_cctx = trace_.cuda_calling_context(executable_name_, kernel_name); + + writer.write_calling_context_enter(time_converter_(kernel->start), cu_cctx.ref(), + 2); + writer.write_calling_context_leave(time_converter_(kernel->end), cu_cctx.ref()); + } + + ringbuf_reader_.pop(header->size); + } + } + + int fd() + { + return timer_fd_; + } + +private: + Process process_; + trace::Trace& trace_; + perf::time::Converter& time_converter_; + RingBufReader ringbuf_reader_; + int timer_fd_; + std::string executable_name_; +}; +} // namespace cupti +} // namespace lo2s diff --git a/include/lo2s/measurement_scope.hpp b/include/lo2s/measurement_scope.hpp index bc3a2979..f3841815 100644 --- a/include/lo2s/measurement_scope.hpp +++ b/include/lo2s/measurement_scope.hpp @@ -33,6 +33,7 @@ enum class MeasurementScopeType NEC_METRIC, BIO, SYSCALL, + CUDA, UNKNOWN }; @@ -79,6 +80,11 @@ struct MeasurementScope return { MeasurementScopeType::SYSCALL, s }; } + static MeasurementScope cuda(ExecutionScope s) + 
{ + return { MeasurementScopeType::CUDA, s }; + } + friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs) { return (lhs.scope == rhs.scope) && lhs.type == rhs.type; @@ -111,6 +117,8 @@ struct MeasurementScope return fmt::format("block layer I/O events for {}", scope.name()); case MeasurementScopeType::SYSCALL: return fmt::format("syscall events for {}", scope.name()); + case lo2s::MeasurementScopeType::CUDA: + return fmt::format("cuda kernel events for {}", scope.name()); default: throw new std::runtime_error("Unknown ExecutionScopeType!"); } diff --git a/include/lo2s/monitor/abstract_process_monitor.hpp b/include/lo2s/monitor/abstract_process_monitor.hpp index bcc93a79..18bb6859 100644 --- a/include/lo2s/monitor/abstract_process_monitor.hpp +++ b/include/lo2s/monitor/abstract_process_monitor.hpp @@ -41,7 +41,7 @@ class AbstractProcessMonitor virtual void insert_process(Process parent, Process process, std::string proc_name, bool spawn = false) = 0; virtual void insert_thread(Process process, Thread thread, std::string name = "", - bool spawn = false) = 0; + bool spawn = false, bool is_process = false) = 0; virtual void exit_thread(Thread thread) = 0; diff --git a/include/lo2s/monitor/process_monitor.hpp b/include/lo2s/monitor/process_monitor.hpp index d5463c63..609f5d42 100644 --- a/include/lo2s/monitor/process_monitor.hpp +++ b/include/lo2s/monitor/process_monitor.hpp @@ -45,7 +45,8 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor ~ProcessMonitor(); void insert_process(Process parent, Process child, std::string proc_name, bool spawn = false) override; - void insert_thread(Process parent, Thread child, std::string name, bool spawn = false) override; + void insert_thread(Process parent, Thread child, std::string name, bool spawn = false, + bool is_process = false) override; void exit_thread(Thread thread) override; diff --git a/include/lo2s/monitor/scope_monitor.hpp b/include/lo2s/monitor/scope_monitor.hpp index 
f68cd6e7..c809d64c 100644 --- a/include/lo2s/monitor/scope_monitor.hpp +++ b/include/lo2s/monitor/scope_monitor.hpp @@ -24,14 +24,15 @@ #include #include +#include #include #include - #include #include #include #include +#include #include #include @@ -50,7 +51,8 @@ namespace monitor class ScopeMonitor : public PollMonitor { public: - ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec); + ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec, + bool is_process = false); void initialize_thread() override; void finalize_thread() override; @@ -74,6 +76,7 @@ class ScopeMonitor : public PollMonitor std::unique_ptr sample_writer_; std::unique_ptr group_counter_writer_; std::unique_ptr userspace_counter_writer_; + std::unique_ptr cupti_reader_; }; } // namespace monitor } // namespace lo2s diff --git a/include/lo2s/monitor/system_process_monitor.hpp b/include/lo2s/monitor/system_process_monitor.hpp index 3162ecde..ed785718 100644 --- a/include/lo2s/monitor/system_process_monitor.hpp +++ b/include/lo2s/monitor/system_process_monitor.hpp @@ -46,8 +46,8 @@ class SystemProcessMonitor : public AbstractProcessMonitor virtual void insert_process(Process parent, Process process, std::string proc_name, bool spawn) override; - virtual void insert_thread(Process process, Thread thread, std::string name, - bool spawn) override; + virtual void insert_thread(Process process, Thread thread, std::string name, bool spawn, + bool is_process) override; virtual void exit_thread(Thread thread) override; diff --git a/include/lo2s/ringbuf.hpp b/include/lo2s/ringbuf.hpp new file mode 100644 index 00000000..60fe592b --- /dev/null +++ b/include/lo2s/ringbuf.hpp @@ -0,0 +1,245 @@ +/* + * This file is part of the lo2s software. 
+ * Linux OTF2 sampling + * + * Copyright (c) 2024, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +extern "C" +{ +#include +#include +#include +#include +} + +namespace lo2s +{ + +// To resolve possible ringbuf format incompatibilities +#define RINGBUF_VERSION 1 + +struct ringbuf_header +{ + uint64_t version; + uint64_t size; + std::atomic_uint64_t head; + std::atomic_uint64_t tail; +}; + +class ShmRingbuf +{ +public: + ShmRingbuf(std::string component, pid_t pid, bool create, size_t pages) + { + std::string filename = "/lo2s-" + component + "-" + std::to_string(pid); + + fd_ = shm_open(filename.c_str(), create ? O_RDWR | O_CREAT | O_EXCL : O_RDWR, 0600); + if (fd_ == -1) + { + throw std::system_error(errno, std::system_category()); + } + + size_t pagesize = sysconf(_SC_PAGESIZE); + size_t size; + + if (create) + { + size = pagesize * pages; + ftruncate(fd_, size + sysconf(_SC_PAGESIZE)); + } + else + { + auto header_map = SharedMemory(fd_, sizeof(struct ringbuf_header), 0); + size = header_map.as()->size; + } + + // To handle events that wrap around the ringbuffer, map it twice into virtual memory + // back-to-back. 
This way events that wrap around the ringbuffer can be read and written + // without noticing the wraparound: + // + // in physical memory: [ent|-----|ev] + // + // in virtual memory: [ent|-----|ev][ent----|ev] + // + // As there is no way to reserve a range of virtual memory, mmap()-ing two adjacent + // ring-buffer without races is tricky. We solve this problem by mmap()-ing an area twice + // the size of the ringbuffer and then overwriting the latter half of this mapping with + // another mapping of the ringbuffer using MMAP_FIXED. This way we only touch mappings we + // control. Also, put the ringbuffer header on a separate page to make life easier. + + first_mapping_ = SharedMemory(fd_, size * 2 + pagesize, 0); + + second_mapping_ = SharedMemory(fd_, size, pagesize, first_mapping_.as() + size); + + header_ = first_mapping_.as(); + start_ = first_mapping_.as() + pagesize; + + if (create) + { + header_->version = RINGBUF_VERSION; + header_->size = size; + header_->tail.store(0); + header_->head.store(0); + } + else + { + if (header_->version != RINGBUF_VERSION) + { + throw new std::runtime_error("Incompatible RingBuffer Version " + + std::to_string(header_->version) + + " detected on other side!"); + } + } + } + + uint64_t head() + { + return header_->head.load(); + } + + uint64_t tail() + { + return header_->tail.load(); + } + + void head(uint64_t new_head) + { + return header_->head.store(new_head); + } + + void tail(uint64_t new_tail) + { + return header_->tail.store(new_tail); + } + + uint64_t size() + { + return header_->size; + } + +protected: + std::byte* start_; + +private: + struct ringbuf_header* header_; + int fd_; + SharedMemory first_mapping_, second_mapping_; +}; + +class RingBufWriter : public ShmRingbuf +{ +public: + RingBufWriter(std::string component, pid_t pid, bool create, size_t pages = 0) + : ShmRingbuf(component, pid, create, pages) + { + } + + std::byte* reserve(size_t ev_size) + { + if (ev_size == 0) + { + return nullptr; + } + + // No 
other reservation can be active! + assert(reserved_size_ == 0); + + if (head() >= tail() && ev_size >= tail() - head() + size()) + { + return nullptr; + } + if (head() < tail() && ev_size >= tail() - head()) + { + return nullptr; + } + + reserved_size_ = ev_size; + return start_ + head(); + } + + void commit() + { + assert(reserved_size_ != 0); + + head((head() + reserved_size_) % size()); + reserved_size_ = 0; + } + +private: + size_t reserved_size_ = 0; +}; + +class RingBufReader : public ShmRingbuf +{ +public: + RingBufReader(std::string component, pid_t pid, bool create, size_t pages = 0) + : ShmRingbuf(component, pid, create, pages) + { + } + + std::byte* get(size_t size) + { + if (size == 0) + { + return nullptr; + } + + if (!can_be_loaded(size)) + { + return nullptr; + } + return start_ + tail(); + } + + void pop(size_t ev_size) + { + if (ev_size == 0) + { + return; + } + + // Calling pop() without trying to get() data from the ringbuffer first is an error + assert(can_be_loaded(ev_size)); + + tail((tail() + ev_size) % size()); + } + +private: + bool can_be_loaded(size_t ev_size) + { + if (tail() <= head()) + { + return tail() + ev_size <= head(); + } + + return tail() + ev_size <= head() + size(); + } +}; +} // namespace lo2s diff --git a/include/lo2s/shared_memory.hpp b/include/lo2s/shared_memory.hpp index ee307439..35a546d8 100644 --- a/include/lo2s/shared_memory.hpp +++ b/include/lo2s/shared_memory.hpp @@ -22,7 +22,9 @@ #pragma once #include +#include +#include #include extern "C" diff --git a/include/lo2s/trace/reg_keys.hpp b/include/lo2s/trace/reg_keys.hpp index ae0ba9d2..542f8945 100644 --- a/include/lo2s/trace/reg_keys.hpp +++ b/include/lo2s/trace/reg_keys.hpp @@ -231,8 +231,8 @@ struct Holder template <> struct Holder { - using type = - otf2::lookup_definition_holder; + using type = otf2::lookup_definition_holder; }; template <> diff --git a/include/lo2s/trace/trace.hpp b/include/lo2s/trace/trace.hpp index ad8e71ce..9075292e 100644 --- 
a/include/lo2s/trace/trace.hpp +++ b/include/lo2s/trace/trace.hpp @@ -19,6 +19,7 @@ * along with lo2s. If not, see . */ #pragma once +#include "otf2xx/definition/calling_context.hpp" #include #include #include @@ -131,12 +132,16 @@ class Trace otf2::definition::mapping_table merge_syscall_contexts(const std::set& used_syscalls); otf2::writer::local& sample_writer(const ExecutionScope& scope); + otf2::writer::local& cuda_writer(const Thread& thread); otf2::writer::local& metric_writer(const MeasurementScope& scope); otf2::writer::local& syscall_writer(const Cpu& cpu); otf2::writer::local& bio_writer(BlockDevice dev); otf2::writer::local& create_metric_writer(const std::string& name); otf2::writer::local& nec_writer(NecDevice device, const Thread& nec_thread); + otf2::definition::calling_context& cuda_calling_context(std::string& exe, + std::string& function); + otf2::definition::io_handle& block_io_handle(BlockDevice dev); otf2::definition::metric_member diff --git a/include/lo2s/util.hpp b/include/lo2s/util.hpp index 2d85569f..efd3fca3 100644 --- a/include/lo2s/util.hpp +++ b/include/lo2s/util.hpp @@ -122,4 +122,6 @@ Thread gettid(); std::set parse_list(std::string list); std::set parse_list_from_file(std::filesystem::path file); + +int timerfd_from_ns(std::chrono::nanoseconds duration); } // namespace lo2s diff --git a/man/lo2s.1.pod b/man/lo2s.1.pod index 7b00d2b8..b1c51d57 100644 --- a/man/lo2s.1.pod +++ b/man/lo2s.1.pod @@ -28,6 +28,7 @@ S<[B<--metric-count> I | B<--metric-frequency> I]> S<[B<-x> I]> S<[B<-X>]> S<[B<-s SYSCALL>]> +S<[B<--accel ACCEL>]> S<{ I | I }> =item I := { I | B<--> I [I...] | B<-p> I } @@ -387,6 +388,24 @@ Record measurements for each sensor found by L. =back +=head2 B options + +=over + +=item B<--accel> I + +Record activity events (instruction samples or kernel execution information) for the given accelerator. Usable accelerators are "nec" for NEC SX-Aurora and "nvidia" for NVidia CUDA accelerators. 
+ +=item B<--nec-readout-interval> I + +Set the interval (in microseconds) between NEC SX-Aurora instruction samples. + +=item B<--nec-check-interval> I + +Set the interval (in milliseconds) between checks for new NEC SX-Aurora processes. + +=back + =head2 Arguments to options =over diff --git a/src/config.cpp b/src/config.cpp index d4399d6d..55505876 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -164,7 +164,7 @@ void parse_program_options(int argc, const char** argv) auto& x86_energy_options = parser.group("x86_energy options"); auto& sensors_options = parser.group("sensors options"); auto& io_options = parser.group("I/O recording options"); - auto& nec_options = parser.group("NEC SX-Aurora Tsubasa recording options"); + auto& accel_options = parser.group("Accelerator options"); lo2s::Config config; @@ -346,16 +346,42 @@ void parse_program_options(int argc, const char** argv) io_options.toggle("block-io", "Enable recording of block I/O events (requires access to debugfs)"); - nec_options.toggle("nec", "Enable NEC Vector Engine sampling"); - nec_options.option("nec-readout-interval", "NEC sampling interval") + std::vector accelerators; + +#ifdef HAVE_CUDA + accelerators.push_back("nvidia"); +#endif +#ifdef HAVE_VEOSINFO + accelerators.push_back("nec"); +#endif + + accel_options + .multi_option( + "accel", + fmt::format("Accelerator to record execution events for. 
Available accelerators: {}", + fmt::join(accelerators, ", "))) + .metavar("ACCEL") + .optional(); + + accel_options.option("nec-readout-interval", "Accelerator sampling interval") .optional() .metavar("USEC") .default_value("1"); - nec_options.option("nec-check-interval", "The interval between checks for new VE processes") + accel_options.option("nec-check-interval", "The interval between checks for new VE processes") .optional() .metavar("MSEC") .default_value("100"); + accel_options.option("nvidia-injection-path", "path to the lo2s cupti injection library") + .optional() + .metavar("PATH") + .default_value(LO2S_CUDA_INJECTIONLIB_PATH); + + accel_options.option("nvidia-ringbuf-size", "Size of the injection library ring-buffer") + .optional() + .metavar("BYTE") + .default_value("65536"); + nitro::options::arguments arguments; try { @@ -382,7 +408,10 @@ void parse_program_options(int argc, const char** argv) config.use_x86_energy = arguments.given("x86-energy"); config.use_sensors = arguments.given("sensors"); config.use_block_io = arguments.given("block-io"); - config.use_nec = arguments.given("nec"); + +#ifdef HAVE_CUDA + config.cuda_injectionlib_path = arguments.get("nvidia-injection-path"); +#endif config.command = arguments.positionals(); if (arguments.given("help")) @@ -514,6 +543,35 @@ void parse_program_options(int argc, const char** argv) } } + for (const auto& accel : arguments.get_all("accel")) + { + if (accel == "nec") + { +#ifdef HAVE_VEOSINFO + config.use_nec = true; +#else + std::cerr << "lo2s was built without support for NEC SX-Aurora sampling\n"; + std::exit(EXIT_FAILURE); +#endif + } + else if (accel == "nvidia") + { +#ifdef HAVE_CUDA + config.use_nvidia = true; + config.nvidia_ringbuf_size = arguments.as("nvidia-ringbuf-size"); +#else + std::cerr << "lo2s was built without support for CUDA kernel recording\n"; + std::exit(EXIT_FAILURE); +#endif + } + else + { + std::cerr << "Unknown Accelerator " << accel << "!"; + parser.usage(); + 
std::exit(EXIT_FAILURE); + } + } + std::vector perf_group_events = arguments.get_all("metric-event"); std::vector perf_userspace_events = arguments.get_all("userspace-metric-event"); diff --git a/src/cupti/lib.cpp b/src/cupti/lib.cpp new file mode 100644 index 00000000..ca100f1e --- /dev/null +++ b/src/cupti/lib.cpp @@ -0,0 +1,220 @@ +/* + * This file is part of the lo2s software. + * Linux OTF2 sampling + * + * Copyright (c) 2016, + * Technische Universitaet Dresden, Germany + * + * lo2s is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * lo2s is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with lo2s. If not, see . 
+ */ + +#include +#include + +#include +#include +#include + +#include +#include +#include + +extern "C" +{ +#include +#include +#include +} + +// Allocate 8 MiB every time CUPTI asks for more event memory +constexpr size_t CUPTI_BUFFER_SIZE = 8 * 1024 * 1024; + +std::unique_ptr rb_writer = nullptr; +CUpti_SubscriberHandle subscriber = nullptr; + +clockid_t clockid = CLOCK_MONOTONIC_RAW; + +static void atExitHandler(void) +{ + // Flush all remaining activity records + cuptiActivityFlushAll(1); +} + +// Through bufferRequested, CUPTI asks for more memory to fill with events +static void CUPTIAPI bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) +{ + assert(buffer != nullptr && size != nullptr && maxNumRecords != nullptr); + + *maxNumRecords = 0; + *size = CUPTI_BUFFER_SIZE; + *buffer = static_cast(malloc(*size)); + + if (*buffer == nullptr) + { + std::cerr << "Error: Out of memory.\n"; + exit(-1); + } +} + +// bufferCompleted is called when a requested buffer (created through bufferRequested) has +// been filled with event data or the end of measurement is reached. 
We then can process the events +// in that CUPTI event buffer and write them to the lo2s ring-buffer +static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size, + size_t validSize) +{ + CUpti_Activity* record = nullptr; + + size_t ringbuf_full_dropped = 0; + while (cuptiActivityGetNextRecord(buffer, validSize, &record) == CUPTI_SUCCESS) + { + switch (record->kind) + { + case CUPTI_ACTIVITY_KIND_KERNEL: + case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: + { + CUpti_ActivityKernel6* kernel = reinterpret_cast(record); + + uint64_t name_len = strlen(kernel->name); + + struct lo2s::cupti::event_kernel* ev = + reinterpret_cast( + rb_writer->reserve(sizeof(struct lo2s::cupti::event_kernel) + name_len)); + + if (ev == nullptr) + { + ringbuf_full_dropped++; + continue; + } + + ev->header.type = lo2s::cupti::EventType::CUPTI_KERNEL; + ev->header.size = sizeof(struct lo2s::cupti::event_kernel) + name_len; + ev->start = kernel->start; + ev->end = kernel->end; + memcpy(ev->name, kernel->name, name_len + 1); + + rb_writer->commit(); + break; + } + default: + break; + } + } + + size_t cupti_dropped; + cuptiActivityGetNumDroppedRecords(ctx, streamId, &cupti_dropped); + if (cupti_dropped != 0) + { + std::cerr << "Dropped " << cupti_dropped << " activity records in CUPTI.\n"; + } + + if (ringbuf_full_dropped != 0) + { + std::cerr << "lo2s Ringbuffer full, dropped" << ringbuf_full_dropped + << " events. Try to increase --nvidia-ringbuf-size!" + + << std::endl; + } + + free(buffer); +} + +// callbackHandler is our universal callback handler for the callback based part of the CUPTI +// event API. 
We attach it to the following events: +// +// - cuProfilerStart -> enable the CUPTI Activity API tracing for the given cuContext +// - cuProfilerStop -> flush event buffers and disable CUPTI Activity API tracing +// - cudaDeviceReset -> flush event buffers +void CUPTIAPI callbackHandler(void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, + void* cbdata) +{ + const CUpti_CallbackData* cbInfo = (CUpti_CallbackData*)cbdata; + + if (domain == CUPTI_CB_DOMAIN_DRIVER_API) + { + if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStart) + { + if (cbInfo->callbackSite == CUPTI_API_EXIT) + { + cuptiActivityEnableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } + } + else if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStop) + { + if (cbInfo->callbackSite == CUPTI_API_ENTER) + { + cuptiActivityFlushAll(0); + cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API, + CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020); + + cuptiActivityDisableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + } + } + } + + // Also flush on CUDA device reset + else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) + { + if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020) + { + if (cbInfo->callbackSite == CUPTI_API_ENTER) + { + cuptiActivityFlushAll(0); + } + } + } +} + +uint64_t timestampfunc() +{ + struct timespec ts; + clock_gettime(clockid, &ts); + uint64_t res = ts.tv_sec * 1000000000 + ts.tv_nsec; + return res; +} + +extern "C" int InitializeInjection(void) +{ + + std::string rb_size_str; + rb_writer = std::make_unique("cupti", getpid(), false); + char* clockid_str = getenv("LO2S_CLOCKID"); + + if (clockid_str != nullptr) + { + clockid = std::stoi(clockid_str); + } + + // Register an atexit() handler for clean-up + atexit(&atExitHandler); + + cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)callbackHandler, nullptr); + + // Supply our own timestamp generation function. 
Saves us the work of converting timestamps + cuptiActivityRegisterTimestampCallback(timestampfunc); + + // Register CUDA API callbacks for us to attach to new CUDA contexts + cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuProfilerStart); + cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API, + CUPTI_DRIVER_TRACE_CBID_cuProfilerStop); + + cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL); + + // Register buffer callbacks. When cupti needs a new buffer for recording date, it calls + // bufferRequested. When the buffer is full, bufferCompleted is used to write the data to the + // lo2s ring-buffer + cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted); + + return 1; +} diff --git a/src/monitor/poll_monitor.cpp b/src/monitor/poll_monitor.cpp index d19effa5..03d0a6e4 100644 --- a/src/monitor/poll_monitor.cpp +++ b/src/monitor/poll_monitor.cpp @@ -22,12 +22,9 @@ #include #include #include +#include #include -extern "C" -{ -#include -} namespace lo2s { @@ -51,18 +48,9 @@ PollMonitor::PollMonitor(trace::Trace& trace, const std::string& name, if (read_interval.count() != 0) { - tspec.it_value.tv_nsec = 1; - - tspec.it_interval.tv_sec = - std::chrono::duration_cast(read_interval).count(); - - tspec.it_interval.tv_nsec = (read_interval % std::chrono::seconds(1)).count(); - - timer_pfd().fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + timer_pfd().fd = timerfd_from_ns(read_interval); timer_pfd().events = POLLIN; timer_pfd().revents = 0; - - timerfd_settime(timer_pfd().fd, TFD_TIMER_ABSTIME, &tspec, NULL); } else { diff --git a/src/monitor/process_monitor.cpp b/src/monitor/process_monitor.cpp index 3b7cbeee..f4d66047 100644 --- a/src/monitor/process_monitor.cpp +++ b/src/monitor/process_monitor.cpp @@ -38,10 +38,11 @@ void ProcessMonitor::insert_process(Process parent, Process process, std::string bool spawn) { trace_.add_process(parent, process, proc_name); - insert_thread(process, 
process.as_thread(), proc_name, spawn); + insert_thread(process, process.as_thread(), proc_name, spawn, true); } -void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn) +void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn, + bool is_process) { trace_.add_thread(thread, name); @@ -56,9 +57,9 @@ void ProcessMonitor::insert_thread(Process process, Thread thread, std::string n { try { - auto inserted = - threads_.emplace(std::piecewise_construct, std::forward_as_tuple(thread), - std::forward_as_tuple(ExecutionScope(thread), *this, spawn)); + auto inserted = threads_.emplace( + std::piecewise_construct, std::forward_as_tuple(thread), + std::forward_as_tuple(ExecutionScope(thread), *this, spawn, is_process)); assert(inserted.second); // actually start thread inserted.first->second.start(); diff --git a/src/monitor/process_monitor_main.cpp b/src/monitor/process_monitor_main.cpp index aa4a4819..7927c801 100644 --- a/src/monitor/process_monitor_main.cpp +++ b/src/monitor/process_monitor_main.cpp @@ -19,6 +19,7 @@ * along with lo2s. If not, see . */ +#include #include #include @@ -26,6 +27,7 @@ #include #include +#include #include #include #include @@ -117,6 +119,19 @@ static void drop_privileges() assert(getgid() != 0); } +std::vector to_vector_of_c_str(const std::vector& vec) +{ + std::vector res; + std::transform(vec.begin(), vec.end(), std::back_inserter(res), [](const std::string& s) { + char* pc = new char[s.size() + 1]; + std::strcpy(pc, s.c_str()); + return pc; + }); + res.push_back(nullptr); + + return res; +} + [[noreturn]] static void run_command(const std::vector& command_and_args) { struct rlimit initial_rlimit = initial_rlimit_fd(); @@ -137,14 +152,35 @@ static void drop_privileges() /* we need ptrace to get fork/clone/...
*/ ptrace(PTRACE_TRACEME, 0, NULL, NULL); - std::vector tmp; - std::transform(command_and_args.begin(), command_and_args.end(), std::back_inserter(tmp), - [](const std::string& s) { - char* pc = new char[s.size() + 1]; - std::strcpy(pc, s.c_str()); - return pc; - }); - tmp.push_back(nullptr); + std::vector env; +#ifdef HAVE_CUDA + if (config().use_nvidia) + { + env = { "CUDA_INJECTION64_PATH=" + config().cuda_injectionlib_path }; + + if (config().use_clockid) + { + env.push_back("LO2S_CLOCKID=" + std::to_string(config().clockid)); + } + } +#endif + // execvpe() hands the child exactly the environment it is given, so the inherited + // variables (PATH, LD_LIBRARY_PATH, ...) must be carried over explicitly. The variables + // set above stay in front and replace inherited ones of the same name. + const std::size_t num_overrides = env.size(); + for (char** var = environ; var != nullptr && *var != nullptr; ++var) + { + const std::string entry(*var); + std::string name(entry, 0, entry.find('=')); + name.push_back('='); + if (std::none_of(env.begin(), std::next(env.begin(), num_overrides), + [&name](const std::string& own) { return own.rfind(name, 0) == 0; })) + { + env.push_back(entry); + } + } + std::vector c_env = to_vector_of_c_str(env); + std::vector c_args = to_vector_of_c_str(command_and_args); Log::debug() << "Execute the command: " << nitro::lang::join(command_and_args); @@ -158,13 +194,19 @@ static void drop_privileges() } // run the application which should be sampled - execvp(tmp[0], &tmp[0]); + execvpe(c_args[0], &c_args[0], &c_env[0]); // should not be executed -> exec failed, let's clean up anyway. - for (auto cp : tmp) + for (auto cp : c_args) + { + delete[] cp; + } + + for (auto cp : c_env) { delete[] cp; } + Log::error() << "Could not execute the command: " << nitro::lang::join(command_and_args); throw_errno(); } diff --git a/src/monitor/scope_monitor.cpp b/src/monitor/scope_monitor.cpp index c97c3d1b..af9f6af9 100644 --- a/src/monitor/scope_monitor.cpp +++ b/src/monitor/scope_monitor.cpp @@ -28,9 +28,6 @@ #include #include -#include - -#include extern "C" { @@ -42,7 +39,8 @@ namespace lo2s namespace monitor { -ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec) +ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec, + bool is_process) : PollMonitor(parent.trace(), scope.name(), config().perf_read_interval), scope_(scope) { if (config().sampling || scope.is_cpu()) @@ -72,6 +70,13 @@ ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enabl add_fd(userspace_counter_writer_->fd()); } + if
(config().use_nvidia && is_process) + { + cupti_reader_ = + std::make_unique(parent.trace(), scope.as_thread().as_process()); + add_fd(cupti_reader_->fd()); + } + // note: start() can now be called } @@ -95,6 +100,11 @@ void ScopeMonitor::monitor(int fd) try_pin_to_scope(scope_); } + if (cupti_reader_ && (fd == cupti_reader_->fd() || fd == stop_pfd().fd)) + { + cupti_reader_->read(); + } + if (syscall_writer_ && (fd == timer_pfd().fd || fd == stop_pfd().fd || syscall_writer_->fd() == fd)) { diff --git a/src/monitor/system_process_monitor.cpp b/src/monitor/system_process_monitor.cpp index b87001c5..a86d07d6 100644 --- a/src/monitor/system_process_monitor.cpp +++ b/src/monitor/system_process_monitor.cpp @@ -34,7 +34,8 @@ void SystemProcessMonitor::insert_process([[maybe_unused]] Process parent, } void SystemProcessMonitor::insert_thread([[maybe_unused]] Process process, Thread thread, - std::string name, [[maybe_unused]] bool spawn) + std::string name, [[maybe_unused]] bool spawn, + [[maybe_unused]] bool is_process) { // in system monitoring, we only need to track the threads spawned from the process lo2s spawned // itself. Without this, these threads end up as "". Sad times.
diff --git a/src/perf/counter/userspace/reader.cpp b/src/perf/counter/userspace/reader.cpp index 32b82f95..985397e3 100644 --- a/src/perf/counter/userspace/reader.cpp +++ b/src/perf/counter/userspace/reader.cpp @@ -31,7 +31,6 @@ extern "C" { -#include #include } @@ -43,26 +42,14 @@ namespace counter { namespace userspace { - template Reader::Reader(ExecutionScope scope) : counter_collection_( CounterProvider::instance().collection_for(MeasurementScope::userspace_metric(scope))), - counter_buffer_(counter_collection_.counters.size()), data_(counter_collection_.counters.size()) + counter_buffer_(counter_collection_.counters.size()), + timer_fd_(timerfd_from_ns(config().userspace_read_interval)), + data_(counter_collection_.counters.size()) { - struct itimerspec tspec; - memset(&tspec, 0, sizeof(struct itimerspec)); - tspec.it_value.tv_nsec = 1; - - tspec.it_interval.tv_sec = - std::chrono::duration_cast(config().userspace_read_interval).count(); - - tspec.it_interval.tv_nsec = - (config().userspace_read_interval % std::chrono::seconds(1)).count(); - timer_fd_ = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); - - timerfd_settime(timer_fd_, TFD_TIMER_ABSTIME, &tspec, NULL); - for (auto& event : counter_collection_.counters) { counter_fds_.emplace_back(perf_event_description_open(scope, event, -1)); diff --git a/src/trace/trace.cpp b/src/trace/trace.cpp index 46df1980..a3f7ecba 100644 --- a/src/trace/trace.cpp +++ b/src/trace/trace.cpp @@ -19,7 +19,6 @@ * along with lo2s. If not, see .
*/ -#include "otf2xx/chrono/duration.hpp" #include #include @@ -411,6 +410,21 @@ otf2::writer::local& Trace::sample_writer(const ExecutionScope& writer_scope) return archive_(location(writer_scope)); } +otf2::writer::local& Trace::cuda_writer(const Thread& thread) +{ + MeasurementScope scope = MeasurementScope::cuda(thread.as_scope()); + + const auto& cuda_location_group = registry_.emplace( + ByMeasurementScope(scope), intern(scope.name()), + otf2::common::location_group_type::accelerator, system_tree_root_node_); + + const auto& intern_location = registry_.emplace( + ByMeasurementScope(scope), intern(scope.name()), cuda_location_group, + otf2::definition::location::location_type::accelerator_stream); + + return archive_(intern_location); +} + otf2::writer::local& Trace::nec_writer(NecDevice device, const Thread& nec_thread) { @@ -489,6 +503,15 @@ otf2::writer::local& Trace::create_metric_writer(const std::string& name) return archive_(location); } +otf2::definition::calling_context& Trace::cuda_calling_context(std::string& file, + std::string& function) +{ + LineInfo info = LineInfo::for_function(file.c_str(), function.c_str(), 0, ""); + + return registry_.emplace( + ByLineInfo(info), intern_region(info), intern_scl(info)); +} + otf2::definition::io_handle& Trace::block_io_handle(BlockDevice dev) { diff --git a/src/util.cpp b/src/util.cpp index ff4fc632..43c9dfaa 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -27,6 +27,7 @@ extern "C" #include #include #include +#include #include #include #include @@ -411,4 +412,33 @@ void bump_rlimit_fd() "resource limit."; } } + +// Creates a non-blocking CLOCK_MONOTONIC timerfd that expires periodically, once per +// `duration`. Failures of timerfd_create()/timerfd_settime() are reported via throw_errno(). +int timerfd_from_ns(std::chrono::nanoseconds duration) +{ + int timerfd; + struct itimerspec tspec; + memset(&tspec, 0, sizeof(struct itimerspec)); + + // An all-zero it_value would leave the timer disarmed. With TFD_TIMER_ABSTIME an expiry of + // 1ns is already in the past, so the timer fires immediately and then every `duration`. + tspec.it_value.tv_nsec = 1; + + tspec.it_interval.tv_sec = std::chrono::duration_cast(duration).count(); + tspec.it_interval.tv_nsec = (duration % std::chrono::seconds(1)).count(); + + timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + + if (timerfd == -1) + { +
throw_errno(); + } + + if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &tspec, NULL) == -1) + { + throw_errno(); + } + return timerfd; +} } // namespace lo2s