From 9ae49179837e4c6427c7852b5632150806cebb2a Mon Sep 17 00:00:00 2001
From: Christian von Elm <christian.von_elm@tu-dresden.de>
Date: Mon, 12 Feb 2024 16:40:44 +0100
Subject: [PATCH] feat(nvidia): Add cupti support

This commit adds an "--accel nvidia" option. When it is given, lo2s injects a library into the program under measurement, which records entry into and exit from CUDA kernels via CUPTI.
---
 CMakeLists.txt                                |  54 ++++
 include/lo2s/build_config.hpp.in              |   4 +
 include/lo2s/config.hpp                       |   4 +
 include/lo2s/cupti/events.hpp                 |  50 ++++
 include/lo2s/cupti/reader.hpp                 |  98 +++++++
 include/lo2s/measurement_scope.hpp            |   8 +
 .../lo2s/monitor/abstract_process_monitor.hpp |   2 +-
 include/lo2s/monitor/process_monitor.hpp      |   3 +-
 include/lo2s/monitor/scope_monitor.hpp        |   7 +-
 .../lo2s/monitor/system_process_monitor.hpp   |   4 +-
 include/lo2s/ringbuf.hpp                      | 245 ++++++++++++++++++
 include/lo2s/shared_memory.hpp                |   2 +
 include/lo2s/trace/reg_keys.hpp               |   4 +-
 include/lo2s/trace/trace.hpp                  |   5 +
 include/lo2s/util.hpp                         |   2 +
 man/lo2s.1.pod                                |  19 ++
 src/config.cpp                                |  68 ++++-
 src/cupti/lib.cpp                             | 220 ++++++++++++++++
 src/monitor/poll_monitor.cpp                  |  16 +-
 src/monitor/process_monitor.cpp               |  11 +-
 src/monitor/process_monitor_main.cpp          |  47 +++-
 src/monitor/scope_monitor.cpp                 |  18 +-
 src/monitor/system_process_monitor.cpp        |   3 +-
 src/perf/counter/userspace/reader.cpp         |  19 +-
 src/trace/trace.cpp                           |  25 +-
 src/util.cpp                                  |  24 ++
 26 files changed, 898 insertions(+), 64 deletions(-)
 create mode 100644 include/lo2s/cupti/events.hpp
 create mode 100644 include/lo2s/cupti/reader.hpp
 create mode 100644 include/lo2s/ringbuf.hpp
 create mode 100644 src/cupti/lib.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cba44f13..8c3f91f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,6 +34,7 @@ IfUpdatedUnsetAll(lo2s_USE_STATIC_LIBS
     Libpfm_USE_STATIC_LIBS
     X86Adapt_STATIC
     x86_energy_STATIC
+    CUDA_USE_STATIC_LIBS
 )
 
 if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
@@ -45,6 +46,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "OFF")
     set(x86_energy_STATIC        OFF CACHE BOOL "")
     set(Sensors_USE_STATIC_LIBS  OFF CACHE BOOL "")
     set(Libpfm_USE_STATIC_LIBS  OFF CACHE BOOL "")
+    set(CUDA_USE_STATIC_LIBS     OFF CACHE BOOL "")
 endif()
 
 if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
@@ -56,6 +58,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "MOSTLY")
     set(x86_energy_STATIC        ON CACHE BOOL "")
     set(Sensors_USE_STATIC_LIBS  ON CACHE BOOL "")
     set(Libpfm_USE_STATIC_LIBS  ON CACHE BOOL "")
+    set(CUDA_USE_STATIC_LIBS     ON CACHE BOOL "")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libstdc++ -static-libgcc")
 endif()
 
@@ -68,6 +71,7 @@ if(lo2s_USE_STATIC_LIBS STREQUAL "ALL")
     set(x86_energy_STATIC        ON CACHE BOOL "")
     set(Sensors_USE_STATIC_LIBS  ON CACHE BOOL "")
     set(Libpfm_USE_STATIC_LIBS  ON CACHE BOOL "")
+    set(CUDA_USE_STATIC_LIBS     ON CACHE BOOL "")
 
     # Doesn't seem to work with clang, even though it should,
     # but at least it doesn't complain about it either
@@ -107,6 +111,7 @@ find_package(Sensors)
 find_package(Veosinfo)
 find_package(Libpfm)
 find_package(PkgConfig)
+find_package(CUDAToolkit)
 
 if(PkgConfig_FOUND)
     pkg_check_modules(Audit audit)
@@ -129,6 +134,8 @@ CMAKE_DEPENDENT_OPTION(USE_LIBAUDIT "Use libaudit for syscall name resolution" O
 add_feature_info("USE_LIBAUDIT" USE_LIBAUDIT "Use libaudit for syscall name resolution.")
 CMAKE_DEPENDENT_OPTION(USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora Tsubasa cards." ON "Veosinfo_FOUND" OFF)
 add_feature_info("USE_VEOSINFO" USE_VEOSINFO "Use libveosinfo to sample NEC SX-Aurora  Tsubasa cards.")
+CMAKE_DEPENDENT_OPTION(USE_CUPTI "Use CUPTI to record CUDA activity." ON "CUDAToolkit_FOUND" OFF)
+add_feature_info("USE_CUPTI" USE_CUPTI "Use CUPTI to record CUDA activity.")
 # system configuration checks
 CHECK_INCLUDE_FILES(linux/hw_breakpoint.h HAVE_HW_BREAKPOINT_H)
 CHECK_STRUCT_HAS_MEMBER("struct perf_event_attr" clockid linux/perf_event.h HAVE_PERF_EVENT_ATTR_CLOCKID)
@@ -144,6 +151,13 @@ if(NOT CLOCK_GETTIME_FOUND)
     unset(CMAKE_REQUIRED_LIBRARIES)
 endif()
 
+check_function_exists(shm_open SHM_OPEN_FOUND)
+if(NOT SHM_OPEN_FOUND)
+    set(CMAKE_REQUIRED_LIBRARIES "rt")
+    check_function_exists(shm_open SHM_OPEN_FOUND_WITH_RT)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+endif()
+
 CHECK_STRUCT_HAS_BITFIELD("struct perf_event_attr" context_switch linux/perf_event.h HAVE_PERF_RECORD_SWITCH)
 
 if(NOT HAVE_PERF_RECORD_SWITCH)
@@ -226,6 +240,14 @@ if(NOT CLOCK_GETTIME_FOUND)
     endif()
 endif()
 
+if(NOT SHM_OPEN_FOUND)
+    if(SHM_OPEN_FOUND_WITH_RT)
+        target_link_libraries(lo2s PRIVATE rt)
+    else()
+        message(SEND_ERROR "Could not find the function shm_open(), but it is required.")
+    endif()
+endif()
+
 # handle x86_adapt dependency
 if(X86Adapt_FOUND)
     target_sources(lo2s PRIVATE
@@ -306,6 +328,38 @@ if (USE_LIBAUDIT)
     endif()
 endif()
 
+set(LO2S_CUDA_INJECTIONLIB_PATH "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/liblo2s_injection.so")
+if(USE_CUPTI)
+    if(CUDAToolkit_FOUND)
+        add_library(lo2s_injection SHARED src/cupti/lib.cpp)
+        target_include_directories(lo2s_injection PRIVATE include
+            ${CMAKE_CURRENT_BINARY_DIR}/include)
+        # Honor the static/shared choice derived from lo2s_USE_STATIC_LIBS
+        if (CUDA_USE_STATIC_LIBS)
+            target_link_libraries(lo2s_injection PRIVATE CUDA::cupti_static)
+        else()
+            target_link_libraries(lo2s_injection PRIVATE CUDA::cupti)
+        endif()
+
+        target_link_libraries(lo2s_injection PRIVATE fmt::fmt
+            Nitro::log
+            Nitro::env
+            Nitro::dl
+            Nitro::options
+            otf2xx::Writer)
+        # Link librt only when the configure check found shm_open() there
+        if(SHM_OPEN_FOUND_WITH_RT)
+            target_link_libraries(lo2s_injection PRIVATE rt)
+        endif()
+
+        target_compile_definitions(lo2s PUBLIC HAVE_CUDA)
+        install(TARGETS lo2s_injection LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    else()
+        message(SEND_ERROR "Cupti not found but requested.")
+    endif()
+endif()
+
+
 
 # generate version string used in lo2s
 if(Git_FOUND)
diff --git a/include/lo2s/build_config.hpp.in b/include/lo2s/build_config.hpp.in
index 4186c657..b2e438df 100644
--- a/include/lo2s/build_config.hpp.in
+++ b/include/lo2s/build_config.hpp.in
@@ -40,3 +40,7 @@
 
 
 #cmakedefine LO2S_COPYRIGHT_YEAR "@LO2S_COPYRIGHT_YEAR@"
+
+// The CUDA injection library installation path
+
+#cmakedefine LO2S_CUDA_INJECTIONLIB_PATH "@LO2S_CUDA_INJECTIONLIB_PATH@"
diff --git a/include/lo2s/config.hpp b/include/lo2s/config.hpp
index 76427bf8..53fd6f36 100644
--- a/include/lo2s/config.hpp
+++ b/include/lo2s/config.hpp
@@ -98,6 +98,10 @@ struct Config
     bool use_nec;
     std::chrono::microseconds nec_read_interval;
     std::chrono::milliseconds nec_check_interval;
+    // Nvidia CUPTI
+    bool use_nvidia = false; // only switched on when "--accel nvidia" is given
+    std::string cuda_injectionlib_path;
+    uint64_t nvidia_ringbuf_size = 0; // only filled from the command line when use_nvidia is set
 };
 
 const Config& config();
diff --git a/include/lo2s/cupti/events.hpp b/include/lo2s/cupti/events.hpp
new file mode 100644
index 00000000..ba994458
--- /dev/null
+++ b/include/lo2s/cupti/events.hpp
@@ -0,0 +1,50 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2024,
+ *    Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <cstdint>
+
+namespace lo2s
+{
+namespace cupti
+{
+enum class EventType : uint64_t
+{
+    CUPTI_KERNEL = 1, // the event is an event_kernel
+};
+
+struct event_header
+{
+    EventType type; // selects the event struct that starts with this header
+    uint64_t size;  // bytes the reader pops for this event, counted from the header start
+};
+
+struct event_kernel
+{
+    struct event_header header;
+    uint64_t start; // kernel start timestamp, converted by the reader before writing
+    uint64_t end;   // kernel end timestamp, converted by the reader before writing
+    char name[1];   // kernel name, read as a C string; presumably extends past the struct — confirm
+};
+
+} // namespace cupti
+} // namespace lo2s
diff --git a/include/lo2s/cupti/reader.hpp b/include/lo2s/cupti/reader.hpp
new file mode 100644
index 00000000..5c474b67
--- /dev/null
+++ b/include/lo2s/cupti/reader.hpp
@@ -0,0 +1,98 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2016,
+ *    Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <lo2s/config.hpp>
+#include <lo2s/cupti/events.hpp>
+#include <lo2s/log.hpp>
+#include <lo2s/perf/time/converter.hpp>
+#include <lo2s/ringbuf.hpp>
+#include <lo2s/trace/trace.hpp>
+#include <lo2s/types.hpp>
+
+#include <chrono>
+#include <cstdlib>
+#include <string>
+
+extern "C"
+{
+#include <sys/timerfd.h>
+#include <unistd.h>
+}
+
+namespace lo2s
+{
+namespace cupti
+{
+// Reads the CUPTI kernel events of one process from its ring buffer and writes them to the trace.
+class Reader
+{
+public:
+    Reader(trace::Trace& trace, Process process)
+    : process_(process), trace_(trace), time_converter_(perf::time::Converter::instance()),
+      ringbuf_reader_("cupti", process.as_pid_t(), true, config().nvidia_ringbuf_size),
+      timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
+      executable_name_(get_process_exe(process))
+    {
+    }
+    // Processes every event currently readable; kernel events become an enter/leave pair.
+    void read()
+    {
+        struct event_header* header = nullptr;
+
+        while ((header = reinterpret_cast<struct event_header*>(
+                    ringbuf_reader_.get(sizeof(struct event_header)))) != nullptr)
+        {
+            if (header->type == EventType::CUPTI_KERNEL)
+            {
+                struct event_kernel* kernel =
+                    reinterpret_cast<struct event_kernel*>(ringbuf_reader_.get(header->size));
+                // NOTE(review): get() may return nullptr; kernel is dereferenced unchecked.
+                auto& writer = trace_.cuda_writer(Thread(process_.as_thread()));
+
+                std::string kernel_name = kernel->name;
+                auto& cu_cctx = trace_.cuda_calling_context(executable_name_, kernel_name);
+
+                writer.write_calling_context_enter(time_converter_(kernel->start), cu_cctx.ref(),
+                                                   2);
+                writer.write_calling_context_leave(time_converter_(kernel->end), cu_cctx.ref());
+            }
+            // Advance past this event, whatever its type.
+            ringbuf_reader_.pop(header->size);
+        }
+    }
+    // Timer file descriptor created from config().userspace_read_interval.
+    int fd()
+    {
+        return timer_fd_;
+    }
+
+private:
+    Process process_;
+    trace::Trace& trace_;
+    perf::time::Converter& time_converter_;
+    RingBufReader ringbuf_reader_;
+    int timer_fd_;
+    std::string executable_name_;
+};
+} // namespace cupti
+} // namespace lo2s
diff --git a/include/lo2s/measurement_scope.hpp b/include/lo2s/measurement_scope.hpp
index bc3a2979..f3841815 100644
--- a/include/lo2s/measurement_scope.hpp
+++ b/include/lo2s/measurement_scope.hpp
@@ -33,6 +33,7 @@ enum class MeasurementScopeType
     NEC_METRIC,
     BIO,
     SYSCALL,
+    CUDA,
     UNKNOWN
 };
 
@@ -79,6 +80,11 @@ struct MeasurementScope
         return { MeasurementScopeType::SYSCALL, s };
     }
 
+    static MeasurementScope cuda(ExecutionScope s)
+    {
+        return { MeasurementScopeType::CUDA, s };
+    }
+
     friend bool operator==(const MeasurementScope& lhs, const MeasurementScope& rhs)
     {
         return (lhs.scope == rhs.scope) && lhs.type == rhs.type;
@@ -111,6 +117,8 @@ struct MeasurementScope
             return fmt::format("block layer I/O events for {}", scope.name());
         case MeasurementScopeType::SYSCALL:
             return fmt::format("syscall events for {}", scope.name());
+        case lo2s::MeasurementScopeType::CUDA:
+            return fmt::format("cuda kernel events for {}", scope.name());
         default:
             throw new std::runtime_error("Unknown ExecutionScopeType!");
         }
diff --git a/include/lo2s/monitor/abstract_process_monitor.hpp b/include/lo2s/monitor/abstract_process_monitor.hpp
index bcc93a79..18bb6859 100644
--- a/include/lo2s/monitor/abstract_process_monitor.hpp
+++ b/include/lo2s/monitor/abstract_process_monitor.hpp
@@ -41,7 +41,7 @@ class AbstractProcessMonitor
     virtual void insert_process(Process parent, Process process, std::string proc_name,
                                 bool spawn = false) = 0;
     virtual void insert_thread(Process process, Thread thread, std::string name = "",
-                               bool spawn = false) = 0;
+                               bool spawn = false, bool is_process = false) = 0;
 
     virtual void exit_thread(Thread thread) = 0;
 
diff --git a/include/lo2s/monitor/process_monitor.hpp b/include/lo2s/monitor/process_monitor.hpp
index d5463c63..609f5d42 100644
--- a/include/lo2s/monitor/process_monitor.hpp
+++ b/include/lo2s/monitor/process_monitor.hpp
@@ -45,7 +45,8 @@ class ProcessMonitor : public AbstractProcessMonitor, public MainMonitor
     ~ProcessMonitor();
     void insert_process(Process parent, Process child, std::string proc_name,
                         bool spawn = false) override;
-    void insert_thread(Process parent, Thread child, std::string name, bool spawn = false) override;
+    void insert_thread(Process parent, Thread child, std::string name, bool spawn = false,
+                       bool is_process = false) override;
 
     void exit_thread(Thread thread) override;
 
diff --git a/include/lo2s/monitor/scope_monitor.hpp b/include/lo2s/monitor/scope_monitor.hpp
index f68cd6e7..c809d64c 100644
--- a/include/lo2s/monitor/scope_monitor.hpp
+++ b/include/lo2s/monitor/scope_monitor.hpp
@@ -24,14 +24,15 @@
 #include <lo2s/monitor/main_monitor.hpp>
 #include <lo2s/monitor/poll_monitor.hpp>
 
+#include <lo2s/cupti/reader.hpp>
 #include <lo2s/perf/counter/group/writer.hpp>
 #include <lo2s/perf/counter/userspace/writer.hpp>
-
 #include <lo2s/perf/sample/writer.hpp>
 #include <lo2s/perf/syscall/writer.hpp>
 
 #include <array>
 #include <chrono>
+#include <memory>
 #include <thread>
 
 #include <cstddef>
@@ -50,7 +51,8 @@ namespace monitor
 class ScopeMonitor : public PollMonitor
 {
 public:
-    ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec);
+    ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
+                 bool is_process = false);
 
     void initialize_thread() override;
     void finalize_thread() override;
@@ -74,6 +76,7 @@ class ScopeMonitor : public PollMonitor
     std::unique_ptr<perf::sample::Writer> sample_writer_;
     std::unique_ptr<perf::counter::group::Writer> group_counter_writer_;
     std::unique_ptr<perf::counter::userspace::Writer> userspace_counter_writer_;
+    std::unique_ptr<cupti::Reader> cupti_reader_;
 };
 } // namespace monitor
 } // namespace lo2s
diff --git a/include/lo2s/monitor/system_process_monitor.hpp b/include/lo2s/monitor/system_process_monitor.hpp
index 3162ecde..ed785718 100644
--- a/include/lo2s/monitor/system_process_monitor.hpp
+++ b/include/lo2s/monitor/system_process_monitor.hpp
@@ -46,8 +46,8 @@ class SystemProcessMonitor : public AbstractProcessMonitor
     virtual void insert_process(Process parent, Process process, std::string proc_name,
                                 bool spawn) override;
 
-    virtual void insert_thread(Process process, Thread thread, std::string name,
-                               bool spawn) override;
+    virtual void insert_thread(Process process, Thread thread, std::string name, bool spawn,
+                               bool is_process) override;
 
     virtual void exit_thread(Thread thread) override;
 
diff --git a/include/lo2s/ringbuf.hpp b/include/lo2s/ringbuf.hpp
new file mode 100644
index 00000000..60fe592b
--- /dev/null
+++ b/include/lo2s/ringbuf.hpp
@@ -0,0 +1,245 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2024,
+ *    Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <lo2s/error.hpp>
+#include <lo2s/shared_memory.hpp>
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <string>
+#include <system_error>
+
+extern "C"
+{
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+}
+
+namespace lo2s
+{
+
+// To resolve possible ringbuf format incompatibilities
+#define RINGBUF_VERSION 1
+
+struct ringbuf_header
+{
+    uint64_t version; // RINGBUF_VERSION written by the side that created the buffer
+    uint64_t size;    // size of the data area in bytes, excluding the header page
+    std::atomic_uint64_t head; // write offset into the data area, advanced by commit()
+    std::atomic_uint64_t tail; // read offset into the data area, advanced by pop()
+};
+
+// Ring buffer in POSIX shared memory, named "/lo2s-<component>-<pid>". The creating side sizes
+// it to `pages` data pages plus one header page; an attaching side reads the size from there.
+class ShmRingbuf
+{
+public:
+    ShmRingbuf(std::string component, pid_t pid, bool create, size_t pages)
+    {
+        std::string filename = "/lo2s-" + component + "-" + std::to_string(pid);
+
+        fd_ = shm_open(filename.c_str(), create ? O_RDWR | O_CREAT | O_EXCL : O_RDWR, 0600);
+        if (fd_ == -1)
+        {
+            throw std::system_error(errno, std::system_category());
+        }
+
+        size_t pagesize = sysconf(_SC_PAGESIZE);
+        size_t size;
+
+        if (create)
+        {
+            size = pagesize * pages;
+            if (ftruncate(fd_, size + pagesize) == -1)
+            {
+                throw std::system_error(errno, std::system_category());
+            }
+        }
+        else
+        {
+            auto header_map = SharedMemory(fd_, sizeof(struct ringbuf_header), 0);
+            size = header_map.as<struct ringbuf_header>()->size;
+        }
+
+        // To handle events that wrap around the ringbuffer, map it twice into virtual memory
+        // back-to-back, so wrapping events can be read and written as one contiguous range:
+        //
+        // in physical memory: [ent|-----|ev]
+        //
+        // in virtual memory:  [ent|-----|ev][ent----|ev]
+        //
+        // As there is no way to reserve a range of virtual memory, mmap()-ing two adjacent
+        // ring-buffers without races is tricky. We first map header page + twice the data size,
+        // then overwrite the second data half (at offset pagesize + size) with another mapping
+        // of the data area using MAP_FIXED. This way we only touch mappings we control. The
+        // header lives on its own page in front of the data area.
+        first_mapping_ = SharedMemory(fd_, size * 2 + pagesize, 0);
+        second_mapping_ =
+            SharedMemory(fd_, size, pagesize, first_mapping_.as<std::byte>() + pagesize + size);
+
+        header_ = first_mapping_.as<struct ringbuf_header>();
+        start_ = first_mapping_.as<std::byte>() + pagesize;
+
+        if (create)
+        {
+            header_->version = RINGBUF_VERSION;
+            header_->size = size;
+            header_->tail.store(0);
+            header_->head.store(0);
+        }
+        else if (header_->version != RINGBUF_VERSION)
+        {
+            throw std::runtime_error("Incompatible RingBuffer Version " +
+                                     std::to_string(header_->version) +
+                                     " detected on other side!");
+        }
+    }
+
+    uint64_t head()
+    {
+        return header_->head.load();
+    }
+
+    uint64_t tail()
+    {
+        return header_->tail.load();
+    }
+
+    void head(uint64_t new_head)
+    {
+        return header_->head.store(new_head);
+    }
+
+    void tail(uint64_t new_tail)
+    {
+        return header_->tail.store(new_tail);
+    }
+
+    uint64_t size()
+    {
+        return header_->size;
+    }
+
+protected:
+    std::byte* start_;
+
+private:
+    struct ringbuf_header* header_;
+    int fd_;
+    SharedMemory first_mapping_, second_mapping_;
+};
+// Producer side of the ring buffer: reserve() space at head, fill it, then commit() it.
+class RingBufWriter : public ShmRingbuf
+{
+public:
+    RingBufWriter(std::string component, pid_t pid, bool create, size_t pages = 0)
+    : ShmRingbuf(component, pid, create, pages)
+    {
+    }
+    // Returns a pointer to the data area at head, or nullptr if ev_size bytes do not fit.
+    std::byte* reserve(size_t ev_size)
+    {
+        if (ev_size == 0)
+        {
+            return nullptr;
+        }
+
+        // No other reservation can be active!
+        assert(reserved_size_ == 0);
+        // Refuse events that would make head reach tail (head == tail means "empty").
+        if (head() >= tail() && ev_size >= tail() - head() + size())
+        {
+            return nullptr;
+        }
+        if (head() < tail() && ev_size >= tail() - head())
+        {
+            return nullptr;
+        }
+
+        reserved_size_ = ev_size;
+        return start_ + head();
+    }
+    // Publishes the reserved bytes by advancing head (modulo the buffer size).
+    void commit()
+    {
+        assert(reserved_size_ != 0);
+
+        head((head() + reserved_size_) % size());
+        reserved_size_ = 0;
+    }
+
+private:
+    size_t reserved_size_ = 0;
+};
+// Consumer side of the ring buffer: get() peeks at the data at tail, pop() releases it.
+class RingBufReader : public ShmRingbuf
+{
+public:
+    RingBufReader(std::string component, pid_t pid, bool create, size_t pages = 0)
+    : ShmRingbuf(component, pid, create, pages)
+    {
+    }
+    // Returns a pointer to the data at tail if `size` bytes are readable, nullptr otherwise.
+    std::byte* get(size_t size)
+    {
+        if (size == 0)
+        {
+            return nullptr;
+        }
+
+        if (!can_be_loaded(size))
+        {
+            return nullptr;
+        }
+        return start_ + tail();
+    }
+    // Releases ev_size bytes by advancing tail (modulo the buffer size).
+    void pop(size_t ev_size)
+    {
+        if (ev_size == 0)
+        {
+            return;
+        }
+
+        // Calling pop() without trying to get() data from the ringbuffer first is an error
+        assert(can_be_loaded(ev_size));
+
+        tail((tail() + ev_size) % size());
+    }
+
+private:
+    bool can_be_loaded(size_t ev_size)
+    {
+        if (tail() <= head())
+        {
+            return tail() + ev_size <= head();
+        }
+        // tail > head: head has already wrapped, so add size() to compare linearly.
+        return tail() + ev_size <= head() + size();
+    }
+};
+} // namespace lo2s
diff --git a/include/lo2s/shared_memory.hpp b/include/lo2s/shared_memory.hpp
index ee307439..35a546d8 100644
--- a/include/lo2s/shared_memory.hpp
+++ b/include/lo2s/shared_memory.hpp
@@ -22,7 +22,9 @@
 #pragma once
 
 #include <lo2s/error.hpp>
+#include <lo2s/util.hpp>
 
+#include <cassert>
 #include <utility>
 
 extern "C"
diff --git a/include/lo2s/trace/reg_keys.hpp b/include/lo2s/trace/reg_keys.hpp
index ae0ba9d2..542f8945 100644
--- a/include/lo2s/trace/reg_keys.hpp
+++ b/include/lo2s/trace/reg_keys.hpp
@@ -231,8 +231,8 @@ struct Holder<otf2::definition::region>
 template <>
 struct Holder<otf2::definition::calling_context>
 {
-    using type =
-        otf2::lookup_definition_holder<otf2::definition::calling_context, ByThread, BySyscall>;
+    using type = otf2::lookup_definition_holder<otf2::definition::calling_context, ByThread,
+                                                ByLineInfo, BySyscall>;
 };
 
 template <>
diff --git a/include/lo2s/trace/trace.hpp b/include/lo2s/trace/trace.hpp
index ad8e71ce..9075292e 100644
--- a/include/lo2s/trace/trace.hpp
+++ b/include/lo2s/trace/trace.hpp
@@ -19,6 +19,7 @@
  * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
  */
 #pragma once
+#include "otf2xx/definition/calling_context.hpp"
 #include <chrono>
 #include <lo2s/address.hpp>
 #include <lo2s/bfd_resolve.hpp>
@@ -131,12 +132,16 @@ class Trace
     otf2::definition::mapping_table merge_syscall_contexts(const std::set<int64_t>& used_syscalls);
 
     otf2::writer::local& sample_writer(const ExecutionScope& scope);
+    otf2::writer::local& cuda_writer(const Thread& thread);
     otf2::writer::local& metric_writer(const MeasurementScope& scope);
     otf2::writer::local& syscall_writer(const Cpu& cpu);
     otf2::writer::local& bio_writer(BlockDevice dev);
     otf2::writer::local& create_metric_writer(const std::string& name);
     otf2::writer::local& nec_writer(NecDevice device, const Thread& nec_thread);
 
+    otf2::definition::calling_context& cuda_calling_context(std::string& exe,
+                                                            std::string& function);
+
     otf2::definition::io_handle& block_io_handle(BlockDevice dev);
 
     otf2::definition::metric_member
diff --git a/include/lo2s/util.hpp b/include/lo2s/util.hpp
index 2d85569f..efd3fca3 100644
--- a/include/lo2s/util.hpp
+++ b/include/lo2s/util.hpp
@@ -122,4 +122,6 @@ Thread gettid();
 
 std::set<std::uint32_t> parse_list(std::string list);
 std::set<std::uint32_t> parse_list_from_file(std::filesystem::path file);
+
+int timerfd_from_ns(std::chrono::nanoseconds duration);
 } // namespace lo2s
diff --git a/man/lo2s.1.pod b/man/lo2s.1.pod
index 7b00d2b8..b1c51d57 100644
--- a/man/lo2s.1.pod
+++ b/man/lo2s.1.pod
@@ -28,6 +28,7 @@ S<[B<--metric-count> I<N> | B<--metric-frequency> I<HZ>]>
 S<[B<-x> I<KNOB>]>
 S<[B<-X>]>
 S<[B<-s SYSCALL>]>
+S<[B<--accel ACCEL>]>
 S<{ I<PROCESS_MONITORING> | I<SYSTEM_MONITORING> }>
 
 =item I<PROCESS_MONITORING> := { I<COMMAND> | B<--> I<COMMAND> [I<ARGS>...] | B<-p> I<PID> }
@@ -387,6 +388,24 @@ Record measurements for each sensor found by L<sensors(1)>.
 
 =back
 
+=head2 B<Accelerator> options
+
+=over
+
+=item B<--accel> I<ACCEL>
+
+Record activity events (instruction samples or kernel execution information) for the given accelerator. Usable accelerators are "nec" for NEC SX-Aurora and "nvidia" for NVIDIA CUDA accelerators.
+
+=item B<--nec-readout-interval> I<USEC>
+
+Set the interval (in microseconds) between NEC SX-Aurora instruction samples.
+
+=item B<--nec-check-interval> I<MSEC>
+
+Set the interval (in milliseconds) between checks for new NEC SX-Aurora processes.
+
+=back
+
 =head2 Arguments to options
 
 =over
diff --git a/src/config.cpp b/src/config.cpp
index d4399d6d..55505876 100644
--- a/src/config.cpp
+++ b/src/config.cpp
@@ -164,7 +164,7 @@ void parse_program_options(int argc, const char** argv)
     auto& x86_energy_options = parser.group("x86_energy options");
     auto& sensors_options = parser.group("sensors options");
     auto& io_options = parser.group("I/O recording options");
-    auto& nec_options = parser.group("NEC SX-Aurora Tsubasa recording options");
+    auto& accel_options = parser.group("Accelerator options");
 
     lo2s::Config config;
 
@@ -346,16 +346,42 @@ void parse_program_options(int argc, const char** argv)
     io_options.toggle("block-io",
                       "Enable recording of block I/O events (requires access to debugfs)");
 
-    nec_options.toggle("nec", "Enable NEC Vector Engine sampling");
-    nec_options.option("nec-readout-interval", "NEC sampling interval")
+    std::vector<std::string> accelerators;
+
+#ifdef HAVE_CUDA
+    accelerators.push_back("nvidia");
+#endif
+#ifdef HAVE_VEOSINFO
+    accelerators.push_back("nec");
+#endif
+
+    accel_options
+        .multi_option(
+            "accel",
+            fmt::format("Accelerator to record execution events for. Available accelerators: {}",
+                        fmt::join(accelerators, ", ")))
+        .metavar("ACCEL")
+        .optional();
+
+    accel_options.option("nec-readout-interval", "Accelerator sampling interval")
         .optional()
         .metavar("USEC")
         .default_value("1");
-    nec_options.option("nec-check-interval", "The interval between checks for new VE processes")
+    accel_options.option("nec-check-interval", "The interval between checks for new VE processes")
         .optional()
         .metavar("MSEC")
         .default_value("100");
 
+    accel_options.option("nvidia-injection-path", "path to the lo2s cupti injection library")
+        .optional()
+        .metavar("PATH")
+        .default_value(LO2S_CUDA_INJECTIONLIB_PATH);
+
+    accel_options.option("nvidia-ringbuf-size", "Size of the injection library ring-buffer")
+        .optional()
+        .metavar("PAGES") // the value is handed to the ring buffer as a page count
+        .default_value("16"); // 64 KiB with 4 KiB pages
+
     nitro::options::arguments arguments;
     try
     {
@@ -382,7 +408,10 @@ void parse_program_options(int argc, const char** argv)
     config.use_x86_energy = arguments.given("x86-energy");
     config.use_sensors = arguments.given("sensors");
     config.use_block_io = arguments.given("block-io");
-    config.use_nec = arguments.given("nec");
+    config.use_nec = false; // only switched on by "--accel nec" further down
+#ifdef HAVE_CUDA
+    config.cuda_injectionlib_path = arguments.get("nvidia-injection-path");
+#endif
     config.command = arguments.positionals();
 
     if (arguments.given("help"))
@@ -514,6 +543,35 @@ void parse_program_options(int argc, const char** argv)
         }
     }
 
+    for (const auto& accel : arguments.get_all("accel")) // enable each requested accelerator
+    {
+        if (accel == "nec")
+        {
+#ifdef HAVE_VEOSINFO
+            config.use_nec = true;
+#else
+            std::cerr << "lo2s was built without support for NEC SX-Aurora sampling\n";
+            std::exit(EXIT_FAILURE);
+#endif
+        }
+        else if (accel == "nvidia")
+        {
+#ifdef HAVE_CUDA
+            config.use_nvidia = true;
+            config.nvidia_ringbuf_size = arguments.as<uint64_t>("nvidia-ringbuf-size");
+#else
+            std::cerr << "lo2s was built without support for CUDA kernel recording\n";
+            std::exit(EXIT_FAILURE);
+#endif
+        }
+        else
+        {
+            std::cerr << "Unknown Accelerator " << accel << "!\n"; // end the line before usage()
+            parser.usage();
+            std::exit(EXIT_FAILURE);
+        }
+    }
+
     std::vector<std::string> perf_group_events = arguments.get_all("metric-event");
     std::vector<std::string> perf_userspace_events = arguments.get_all("userspace-metric-event");
 
diff --git a/src/cupti/lib.cpp b/src/cupti/lib.cpp
new file mode 100644
index 00000000..ca100f1e
--- /dev/null
+++ b/src/cupti/lib.cpp
@@ -0,0 +1,220 @@
+/*
+ * This file is part of the lo2s software.
+ * Linux OTF2 sampling
+ *
+ * Copyright (c) 2016,
+ *    Technische Universitaet Dresden, Germany
+ *
+ * lo2s is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * lo2s is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <lo2s/cupti/events.hpp>
+#include <lo2s/ringbuf.hpp>
+
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+extern "C"
+{
+#include <cupti.h>
+#include <time.h>
+#include <unistd.h>
+}
+
+// Allocate 8 MiB every time CUPTI asks for more event memory
+constexpr size_t CUPTI_BUFFER_SIZE = 8 * 1024 * 1024;
+
+std::unique_ptr<lo2s::RingBufWriter> rb_writer = nullptr;
+CUpti_SubscriberHandle subscriber = nullptr;
+
+clockid_t clockid = CLOCK_MONOTONIC_RAW;
+
+static void atExitHandler(void)
+{
+    // atexit() hook: flush all remaining records (NOTE(review): 1 presumably forces the flush)
+    cuptiActivityFlushAll(1);
+}
+
+// Through bufferRequested, CUPTI asks for more memory to fill with events
+static void CUPTIAPI bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)
+{
+    assert(buffer != nullptr && size != nullptr && maxNumRecords != nullptr);
+    // NOTE(review): 0 presumably lifts the limit on records per buffer -- confirm in CUPTI docs
+    *maxNumRecords = 0;
+    *size = CUPTI_BUFFER_SIZE;
+    *buffer = static_cast<uint8_t*>(malloc(*size));
+    // Without event memory nothing can be recorded, so this terminates the whole process
+    if (*buffer == nullptr)
+    {
+        std::cerr << "Error: Out of memory.\n";
+        exit(-1);
+    }
+}
+
+// bufferCompleted is called when a requested buffer (created through bufferRequested) has
+//  been filled with event data or the end of measurement is reached. We then can process the events
+//  in that CUPTI event buffer and write them to the lo2s ring-buffer
+static void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,
+                                     size_t validSize)
+{
+    CUpti_Activity* record = nullptr;
+
+    size_t ringbuf_full_dropped = 0;
+    while (cuptiActivityGetNextRecord(buffer, validSize, &record) == CUPTI_SUCCESS)
+    {
+        switch (record->kind)
+        {
+        case CUPTI_ACTIVITY_KIND_KERNEL:
+        case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
+        {
+            CUpti_ActivityKernel6* kernel = reinterpret_cast<CUpti_ActivityKernel6*>(record);
+
+            uint64_t name_len = strlen(kernel->name);
+            // NOTE(review): name_len bytes are reserved but name_len + 1 are copied below; this
+            // relies on event_kernel already holding room for the NUL -- confirm in events.hpp
+            struct lo2s::cupti::event_kernel* ev =
+                reinterpret_cast<struct lo2s::cupti::event_kernel*>(
+                    rb_writer->reserve(sizeof(struct lo2s::cupti::event_kernel) + name_len));
+
+            if (ev == nullptr)
+            {
+                ringbuf_full_dropped++;
+                continue;
+            }
+
+            ev->header.type = lo2s::cupti::EventType::CUPTI_KERNEL;
+            ev->header.size = sizeof(struct lo2s::cupti::event_kernel) + name_len;
+            ev->start = kernel->start;
+            ev->end = kernel->end;
+            memcpy(ev->name, kernel->name, name_len + 1);
+
+            rb_writer->commit();
+            break;
+        }
+        default:
+            break;
+        }
+    }
+
+    // Zero-initialized so that a failed query reports nothing instead of reading garbage
+    size_t cupti_dropped = 0;
+    cuptiActivityGetNumDroppedRecords(ctx, streamId, &cupti_dropped);
+    if (cupti_dropped != 0)
+    {
+        std::cerr << "Dropped " << cupti_dropped << " activity records in CUPTI.\n";
+    }
+
+    if (ringbuf_full_dropped != 0)
+    {
+        std::cerr << "lo2s Ringbuffer full, dropped " << ringbuf_full_dropped
+                  << " events. Try to increase --nvidia-ringbuf-size!" << std::endl;
+    }
+
+    free(buffer);
+}
+
+// callbackHandler is our universal callback handler for the callback based part of the CUPTI
+//  event API. We attach it to the following events:
+//
+//  - cuProfilerStart -> enable the CUPTI Activity API tracing for the given cuContext
+//  - cuProfilerStop -> flush event buffers and disable CUPTI Activity API tracing
+//  - cudaDeviceReset -> flush event buffers
+void CUPTIAPI callbackHandler(void* userdata, CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
+                              void* cbdata)
+{
+    const CUpti_CallbackData* cbInfo = (CUpti_CallbackData*)cbdata;
+    // userdata is not used; InitializeInjection passes nullptr for it to cuptiSubscribe()
+    if (domain == CUPTI_CB_DOMAIN_DRIVER_API)
+    {
+        if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStart)
+        {
+            if (cbInfo->callbackSite == CUPTI_API_EXIT)
+            {
+                cuptiActivityEnableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+            }
+        }
+        else if (cbid == CUPTI_DRIVER_TRACE_CBID_cuProfilerStop)
+        {
+            if (cbInfo->callbackSite == CUPTI_API_ENTER)
+            {
+                cuptiActivityFlushAll(0);
+                cuptiEnableCallback(0, subscriber, CUPTI_CB_DOMAIN_RUNTIME_API,
+                                    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020);
+                // NOTE(review): 0 above, where InitializeInjection passes 1 -- a disable? intended?
+                cuptiActivityDisableContext(cbInfo->context, CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+            }
+        }
+    }
+    // Also flush on CUDA device reset. NOTE(review): the visible InitializeInjection enables only
+    // the cuProfilerStart/Stop callbacks, so this branch may never run -- confirm.
+    else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API)
+    {
+        if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020)
+        {
+            if (cbInfo->callbackSite == CUPTI_API_ENTER)
+            {
+                cuptiActivityFlushAll(0);
+            }
+        }
+    }
+}
+
+// Timestamp source registered with CUPTI: nanoseconds read from the clock selected by `clockid`
+uint64_t timestampfunc()
+{
+    struct timespec ts = {}; // stays zero if clock_gettime() rejects the clock id
+    clock_gettime(clockid, &ts);
+    return static_cast<uint64_t>(ts.tv_sec) * 1000000000ULL + static_cast<uint64_t>(ts.tv_nsec);
+}
+
+// Injection entry point: creates the ring-buffer writer, picks the clock and hooks up CUPTI
+extern "C" int InitializeInjection(void)
+{
+    rb_writer = std::make_unique<lo2s::RingBufWriter>("cupti", getpid(), false);
+    const char* clockid_str = getenv("LO2S_CLOCKID");
+    if (clockid_str != nullptr)
+    {
+        // strtol() instead of std::stoi(): a bad value must not throw out of this C entry point
+        char* end = nullptr;
+        const long parsed = std::strtol(clockid_str, &end, 10);
+        if (end != clockid_str && *end == '\0')
+        {
+            clockid = static_cast<clockid_t>(parsed);
+        }
+    }
+
+    // Register an atexit() handler for clean-up
+    atexit(&atExitHandler);
+    cuptiSubscribe(&subscriber, (CUpti_CallbackFunc)callbackHandler, nullptr);
+
+    // Supply our own timestamp generation function. Saves us the work of converting timestamps
+    cuptiActivityRegisterTimestampCallback(timestampfunc);
+
+    // Register CUDA API callbacks for us to attach to new CUDA contexts
+    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                        CUPTI_DRIVER_TRACE_CBID_cuProfilerStart);
+    cuptiEnableCallback(1, subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
+                        CUPTI_DRIVER_TRACE_CBID_cuProfilerStop);
+    cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL);
+
+    // Register buffer callbacks: CUPTI calls bufferRequested when it needs a new record buffer
+    // and bufferCompleted once a buffer is full, which copies the data to the lo2s ring-buffer
+    cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted);
+    return 1;
+}
diff --git a/src/monitor/poll_monitor.cpp b/src/monitor/poll_monitor.cpp
index d19effa5..03d0a6e4 100644
--- a/src/monitor/poll_monitor.cpp
+++ b/src/monitor/poll_monitor.cpp
@@ -22,12 +22,9 @@
 #include <lo2s/config.hpp>
 #include <lo2s/error.hpp>
 #include <lo2s/monitor/poll_monitor.hpp>
+#include <lo2s/util.hpp>
 
 #include <cmath>
-extern "C"
-{
-#include <sys/timerfd.h>
-}
 
 namespace lo2s
 {
@@ -51,18 +48,9 @@ PollMonitor::PollMonitor(trace::Trace& trace, const std::string& name,
     if (read_interval.count() != 0)
     {
 
-        tspec.it_value.tv_nsec = 1;
-
-        tspec.it_interval.tv_sec =
-            std::chrono::duration_cast<std::chrono::seconds>(read_interval).count();
-
-        tspec.it_interval.tv_nsec = (read_interval % std::chrono::seconds(1)).count();
-
-        timer_pfd().fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+        timer_pfd().fd = timerfd_from_ns(read_interval);
         timer_pfd().events = POLLIN;
         timer_pfd().revents = 0;
-
-        timerfd_settime(timer_pfd().fd, TFD_TIMER_ABSTIME, &tspec, NULL);
     }
     else
     {
diff --git a/src/monitor/process_monitor.cpp b/src/monitor/process_monitor.cpp
index 3b7cbeee..f4d66047 100644
--- a/src/monitor/process_monitor.cpp
+++ b/src/monitor/process_monitor.cpp
@@ -38,10 +38,11 @@ void ProcessMonitor::insert_process(Process parent, Process process, std::string
                                     bool spawn)
 {
     trace_.add_process(parent, process, proc_name);
-    insert_thread(process, process.as_thread(), proc_name, spawn);
+    insert_thread(process, process.as_thread(), proc_name, spawn, true);
 }
 
-void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn)
+void ProcessMonitor::insert_thread(Process process, Thread thread, std::string name, bool spawn,
+                                   bool is_process)
 {
     trace_.add_thread(thread, name);
 
@@ -56,9 +57,9 @@ void ProcessMonitor::insert_thread(Process process, Thread thread, std::string n
     {
         try
         {
-            auto inserted =
-                threads_.emplace(std::piecewise_construct, std::forward_as_tuple(thread),
-                                 std::forward_as_tuple(ExecutionScope(thread), *this, spawn));
+            auto inserted = threads_.emplace(
+                std::piecewise_construct, std::forward_as_tuple(thread),
+                std::forward_as_tuple(ExecutionScope(thread), *this, spawn, is_process));
             assert(inserted.second);
             // actually start thread
             inserted.first->second.start();
diff --git a/src/monitor/process_monitor_main.cpp b/src/monitor/process_monitor_main.cpp
index aa4a4819..7927c801 100644
--- a/src/monitor/process_monitor_main.cpp
+++ b/src/monitor/process_monitor_main.cpp
@@ -19,6 +19,7 @@
  * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <filesystem>
 #include <lo2s/monitor/process_monitor_main.hpp>
 
 #include <lo2s/monitor/abstract_process_monitor.hpp>
@@ -26,6 +27,7 @@
 #include <lo2s/process_controller.hpp>
 #include <lo2s/util.hpp>
 
+#include <lo2s/build_config.hpp>
 #include <lo2s/config.hpp>
 #include <lo2s/error.hpp>
 #include <lo2s/log.hpp>
@@ -117,6 +119,19 @@ static void drop_privileges()
     assert(getgid() != 0);
 }
 
+// Deep-copies each string into a new[]-allocated C string and appends a terminating nullptr
+// (argv/envp layout). The caller owns the copies and has to delete[] them.
+std::vector<char*> to_vector_of_c_str(const std::vector<std::string>& vec)
+{
+    std::vector<char*> res;
+    for (const std::string& s : vec)
+    {
+        res.push_back(std::strcpy(new char[s.size() + 1], s.c_str()));
+    }
+    res.push_back(nullptr);
+    return res;
+}
+
 [[noreturn]] static void run_command(const std::vector<std::string>& command_and_args)
 {
     struct rlimit initial_rlimit = initial_rlimit_fd();
@@ -137,14 +152,20 @@ static void drop_privileges()
     /* we need ptrace to get fork/clone/... */
     ptrace(PTRACE_TRACEME, 0, NULL, NULL);
 
-    std::vector<char*> tmp;
-    std::transform(command_and_args.begin(), command_and_args.end(), std::back_inserter(tmp),
-                   [](const std::string& s) {
-                       char* pc = new char[s.size() + 1];
-                       std::strcpy(pc, s.c_str());
-                       return pc;
-                   });
-    tmp.push_back(nullptr);
+    // execvpe() replaces the child's whole environment: extend ours, then pass all of it on
+#ifdef HAVE_CUDA
+    if (config().use_nvidia)
+    {
+        setenv("CUDA_INJECTION64_PATH", config().cuda_injectionlib_path.c_str(), 1);
+        if (config().use_clockid)
+            setenv("LO2S_CLOCKID", std::to_string(config().clockid).c_str(), 1);
+    }
+#endif
+    std::vector<std::string> env;
+    for (char** var = environ; var != nullptr && *var != nullptr; ++var)
+        env.emplace_back(*var);
+    std::vector<char*> c_env = to_vector_of_c_str(env);
+    std::vector<char*> c_args = to_vector_of_c_str(command_and_args);
 
     Log::debug() << "Execute the command: " << nitro::lang::join(command_and_args);
 
@@ -158,13 +179,19 @@ static void drop_privileges()
     }
 
     // run the application which should be sampled
-    execvp(tmp[0], &tmp[0]);
+    execvpe(c_args[0], &c_args[0], &c_env[0]);
 
     // should not be executed -> exec failed, let's clean up anyway.
-    for (auto cp : tmp)
+    for (auto cp : c_args)
+    {
+        delete[] cp;
+    }
+
+    for (auto cp : c_env)
     {
         delete[] cp;
     }
+
     Log::error() << "Could not execute the command: " << nitro::lang::join(command_and_args);
     throw_errno();
 }
diff --git a/src/monitor/scope_monitor.cpp b/src/monitor/scope_monitor.cpp
index c97c3d1b..af9f6af9 100644
--- a/src/monitor/scope_monitor.cpp
+++ b/src/monitor/scope_monitor.cpp
@@ -28,9 +28,6 @@
 #include <lo2s/time/time.hpp>
 
 #include <memory>
-#include <string>
-
-#include <cassert>
 
 extern "C"
 {
@@ -42,7 +39,8 @@ namespace lo2s
 namespace monitor
 {
 
-ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec)
+ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enable_on_exec,
+                           bool is_process)
 : PollMonitor(parent.trace(), scope.name(), config().perf_read_interval), scope_(scope)
 {
     if (config().sampling || scope.is_cpu())
@@ -72,6 +70,13 @@ ScopeMonitor::ScopeMonitor(ExecutionScope scope, MainMonitor& parent, bool enabl
         add_fd(userspace_counter_writer_->fd());
     }
 
+    if (config().use_nvidia && is_process)
+    {
+        cupti_reader_ =
+            std::make_unique<cupti::Reader>(parent.trace(), scope.as_thread().as_process());
+        add_fd(cupti_reader_->fd());
+    }
+
     // note: start() can now be called
 }
 
@@ -95,6 +100,11 @@ void ScopeMonitor::monitor(int fd)
         try_pin_to_scope(scope_);
     }
 
+    if (cupti_reader_ && (fd == cupti_reader_->fd() || fd == stop_pfd().fd))
+    {
+        cupti_reader_->read();
+    }
+
     if (syscall_writer_ &&
         (fd == timer_pfd().fd || fd == stop_pfd().fd || syscall_writer_->fd() == fd))
     {
diff --git a/src/monitor/system_process_monitor.cpp b/src/monitor/system_process_monitor.cpp
index b87001c5..a86d07d6 100644
--- a/src/monitor/system_process_monitor.cpp
+++ b/src/monitor/system_process_monitor.cpp
@@ -34,7 +34,8 @@ void SystemProcessMonitor::insert_process([[maybe_unused]] Process parent,
 }
 
 void SystemProcessMonitor::insert_thread([[maybe_unused]] Process process, Thread thread,
-                                         std::string name, [[maybe_unused]] bool spawn)
+                                         std::string name, [[maybe_unused]] bool spawn,
+                                         [[maybe_unused]] bool is_process)
 {
     // in system monitoring, we only need to track the threads spawned from the process lo2s spawned
     // itself. Without this, these threads end up as "<unknown thread>". Sad times.
diff --git a/src/perf/counter/userspace/reader.cpp b/src/perf/counter/userspace/reader.cpp
index 32b82f95..985397e3 100644
--- a/src/perf/counter/userspace/reader.cpp
+++ b/src/perf/counter/userspace/reader.cpp
@@ -31,7 +31,6 @@
 
 extern "C"
 {
-#include <sys/timerfd.h>
 #include <unistd.h>
 }
 
@@ -43,26 +42,14 @@ namespace counter
 {
 namespace userspace
 {
-
 template <class T>
 Reader<T>::Reader(ExecutionScope scope)
 : counter_collection_(
       CounterProvider::instance().collection_for(MeasurementScope::userspace_metric(scope))),
-  counter_buffer_(counter_collection_.counters.size()), data_(counter_collection_.counters.size())
+  counter_buffer_(counter_collection_.counters.size()),
+  timer_fd_(timerfd_from_ns(config().userspace_read_interval)),
+  data_(counter_collection_.counters.size())
 {
-    struct itimerspec tspec;
-    memset(&tspec, 0, sizeof(struct itimerspec));
-    tspec.it_value.tv_nsec = 1;
-
-    tspec.it_interval.tv_sec =
-        std::chrono::duration_cast<std::chrono::seconds>(config().userspace_read_interval).count();
-
-    tspec.it_interval.tv_nsec =
-        (config().userspace_read_interval % std::chrono::seconds(1)).count();
-    timer_fd_ = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-
-    timerfd_settime(timer_fd_, TFD_TIMER_ABSTIME, &tspec, NULL);
-
     for (auto& event : counter_collection_.counters)
     {
         counter_fds_.emplace_back(perf_event_description_open(scope, event, -1));
diff --git a/src/trace/trace.cpp b/src/trace/trace.cpp
index 46df1980..a3f7ecba 100644
--- a/src/trace/trace.cpp
+++ b/src/trace/trace.cpp
@@ -19,7 +19,6 @@
  * along with lo2s.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "otf2xx/chrono/duration.hpp"
 #include <lo2s/trace/trace.hpp>
 
 #include <lo2s/address.hpp>
@@ -411,6 +410,21 @@ otf2::writer::local& Trace::sample_writer(const ExecutionScope& writer_scope)
     return archive_(location(writer_scope));
 }
 
+otf2::writer::local& Trace::cuda_writer(const Thread& thread)
+{
+    MeasurementScope scope = MeasurementScope::cuda(thread.as_scope());
+    // The CUDA scope gets its own accelerator location group below the system tree root ...
+    const auto& cuda_location_group = registry_.emplace<otf2::definition::location_group>(
+        ByMeasurementScope(scope), intern(scope.name()),
+        otf2::common::location_group_type::accelerator, system_tree_root_node_);
+    // ... and one accelerator-stream location in that group; both are keyed by the scope
+    const auto& intern_location = registry_.emplace<otf2::definition::location>(
+        ByMeasurementScope(scope), intern(scope.name()), cuda_location_group,
+        otf2::definition::location::location_type::accelerator_stream);
+    // NOTE(review): presumably emplace() returns the existing definition on repeat calls -- confirm
+    return archive_(intern_location);
+}
+
 otf2::writer::local& Trace::nec_writer(NecDevice device, const Thread& nec_thread)
 {
 
@@ -489,6 +503,15 @@ otf2::writer::local& Trace::create_metric_writer(const std::string& name)
     return archive_(location);
 }
 
+otf2::definition::calling_context& Trace::cuda_calling_context(std::string& file,
+                                                               std::string& function)
+{
+    LineInfo info = LineInfo::for_function(file.c_str(), function.c_str(), 0, "");
+    // Both strings are only read here (c_str()); the calling context is keyed by the line info
+    return registry_.emplace<otf2::definition::calling_context>(
+        ByLineInfo(info), intern_region(info), intern_scl(info));
+}
+
 otf2::definition::io_handle& Trace::block_io_handle(BlockDevice dev)
 {
 
diff --git a/src/util.cpp b/src/util.cpp
index ff4fc632..43c9dfaa 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -27,6 +27,7 @@ extern "C"
 #include <sys/syscall.h>
 #include <sys/sysmacros.h>
 #include <sys/time.h>
+#include <sys/timerfd.h>
 #include <sys/types.h>
 #include <sys/utsname.h>
 #include <unistd.h>
@@ -411,4 +412,27 @@ void bump_rlimit_fd()
                        "resource limit.";
     }
 }
+
+// Returns a non-blocking CLOCK_MONOTONIC timerfd with period `duration`; errors go to throw_errno()
+int timerfd_from_ns(std::chrono::nanoseconds duration)
+{
+    struct itimerspec tspec = {};
+    // An all-zero it_value would disarm the timer. 1ns as an absolute time is already in the
+    // past, so the first expiration is immediate and it_interval takes over from there.
+    tspec.it_value.tv_nsec = 1;
+    tspec.it_interval.tv_sec = std::chrono::duration_cast<std::chrono::seconds>(duration).count();
+    tspec.it_interval.tv_nsec = (duration % std::chrono::seconds(1)).count();
+
+    int timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+    if (timerfd == -1)
+    {
+        throw_errno();
+    }
+
+    if (timerfd_settime(timerfd, TFD_TIMER_ABSTIME, &tspec, NULL) == -1)
+    {
+        throw_errno();
+    }
+    return timerfd;
+}
 } // namespace lo2s