Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fetch spdlog without depending on cudf #2679

Merged
merged 4 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions src/main/cpp/faultinj/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#=============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,9 +28,5 @@ add_library(
)

target_link_libraries(
cufaultinj PRIVATE spdlog::spdlog_header_only
)

target_link_libraries(
cufaultinj PRIVATE CUDA::cupti_static
cufaultinj PRIVATE CUDA::cupti_static cudf::cudf_logger
)
120 changes: 60 additions & 60 deletions src/main/cpp/faultinj/faultinj.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include <cudf/logger.hpp>
ttnghia marked this conversation as resolved.
Show resolved Hide resolved

#include <cuda.h>

#include <assert.h>
Expand All @@ -28,29 +30,28 @@
#define BOOST_SPIRIT_THREADSAFE
#include <boost/property_tree/json_parser.hpp>
#include <boost/property_tree/ptree.hpp>
#include <spdlog/spdlog.h>
#include <sys/inotify.h>
#include <sys/time.h>

// Format enums for logging
auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); }
// auto format_as(CUpti_CallbackDomain domain) { return fmt::underlying(domain); }

namespace {

#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char* errstr; \
cuptiGetResultString(_status, &errstr); \
spdlog::error("function {} failed with error {}", #call, errstr); \
} \
#define CUPTI_CALL(call) \
do { \
CUptiResult _status = call; \
if (_status != CUPTI_SUCCESS) { \
const char* errstr; \
cuptiGetResultString(_status, &errstr); \
CUDF_LOG_ERROR("function {} failed with error {}", #call, errstr); \
} \
} while (0)

#define PTHREAD_CALL(call) \
do { \
int _status = call; \
if (_status != 0) { spdlog::error("function {} failed with error code {}", #call, _status); } \
#define PTHREAD_CALL(call) \
do { \
int _status = call; \
if (_status != 0) { CUDF_LOG_ERROR("function {} failed with error code {}", #call, _status); } \
} while (0)

typedef enum { FI_TRAP, FI_ASSERT, FI_RETURN_VALUE } FaultInjectionType;
Expand Down Expand Up @@ -89,16 +90,16 @@ void* dynamicReconfig(void* args);

void globalControlInit(void)
{
spdlog::debug("globalControlInit of fault injection");
CUDF_LOG_DEBUG("globalControlInit of fault injection");
globalControl.initialized = 0;
globalControl.subscriber = 0;
globalControl.terminateThread = 0;
spdlog::trace("checking environment {}", configFilePathEnv);
CUDF_LOG_TRACE("checking environment {}", configFilePathEnv);
const char* configFilePath = std::getenv(configFilePathEnv.c_str());
spdlog::debug("{} is {}", configFilePathEnv, configFilePath);
CUDF_LOG_DEBUG("{} is {}", configFilePathEnv, configFilePath);
if (configFilePath) {
globalControl.configFilePath = std::string(configFilePath);
spdlog::debug("will init config from {}", globalControl.configFilePath);
CUDF_LOG_DEBUG("will init config from {}", globalControl.configFilePath);
}
readFaultInjectorConfig();
globalControl.initialized = 1;
Expand All @@ -115,10 +116,10 @@ void atExitHandler(void)
if (globalControl.dynamic) {
globalControl.terminateThread = 1;
PTHREAD_CALL(pthread_join(globalControl.dynamicThread, nullptr));
spdlog::info("reconfig thread shut down ... exiting");
CUDF_LOG_INFO("reconfig thread shut down ... exiting");
}

spdlog::debug("atExitHandler: cuptiFinalize");
CUDF_LOG_DEBUG("atExitHandler: cuptiFinalize");
CUPTI_CALL(cuptiFinalize());
}

Expand Down Expand Up @@ -200,9 +201,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
// case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz:
if (std::string(cbInfo->symbolName)
.compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) {
spdlog::debug("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
break;
}
// intentional fallthrough
Expand All @@ -225,9 +226,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
case CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000:
if (std::string(cbInfo->symbolName)
.compare(0, faultInjectorKernelPrefix.size(), faultInjectorKernelPrefix) == 0) {
spdlog::debug("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
CUDF_LOG_DEBUG("rejecting fake launch functionName={} symbol={}",
cbInfo->functionName,
cbInfo->symbolName);
break;
}
// intentional fallthrough
Expand Down Expand Up @@ -261,7 +262,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
const int interceptionCount =
(*matchedFaultConfig).get_optional<int>(interceptionCountKey).value_or(INT_MAX);

spdlog::trace(
CUDF_LOG_TRACE(
"considered config domain={} function={} injectionType={} probability={} "
"interceptionCount={}",
domain,
Expand All @@ -271,7 +272,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
interceptionCount);

if (interceptionCount <= 0) {
spdlog::trace(
CUDF_LOG_TRACE(
"skipping interception because hit count reached 0, "
"domain={} function={} injectionType={} probability={} "
"interceptionCount={}",
Expand All @@ -287,9 +288,9 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
if (injectionProbability <= 0) { return; }
const int rand10000 = std::rand() % 10000;
const int skipThreshold = injectionProbability * 10000 / 100;
spdlog::trace("rand1000={} skipThreshold={}", rand10000, skipThreshold);
CUDF_LOG_TRACE("rand1000={} skipThreshold={}", rand10000, skipThreshold);
if (rand10000 >= skipThreshold) { return; }
spdlog::debug(
CUDF_LOG_DEBUG(
"matched config based on rand10000={} skipThreshold={} "
"domain={} function={} injectionType={} probability={}",
rand10000,
Expand All @@ -299,7 +300,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
injectionType,
injectionProbability);
} else {
spdlog::debug(
CUDF_LOG_DEBUG(
"matched 100% config domain={} function={} injectionType={} "
"probability={}",
domain,
Expand All @@ -310,7 +311,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,

// update counter if not unlimited
if (interceptionCount != INT_MAX) {
spdlog::debug("updating interception count {}: before locking", interceptionCount);
CUDF_LOG_DEBUG("updating interception count {}: before locking", interceptionCount);
// TODO the lock is too coarse-grained.
PTHREAD_CALL(pthread_rwlock_wrlock(&globalControl.configLock));
const int interceptionCount = (*matchedFaultConfig).get<int>("interceptionCount");
Expand All @@ -335,7 +336,7 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
*cuResPtr = static_cast<CUresult>(substituteReturnCode);
} else if (domain == CUPTI_CB_DOMAIN_RUNTIME_API) {
cudaError_t* cudaErrPtr = static_cast<cudaError_t*>(cbInfo->functionReturnValue);
spdlog::error("updating runtime return value DOES NOT WORK, use trap or assert");
CUDF_LOG_ERROR("updating runtime return value DOES NOT WORK, use trap or assert");
*cudaErrPtr = static_cast<cudaError_t>(substituteReturnCode);
break;
}
Expand All @@ -350,17 +351,17 @@ void CUPTIAPI faultInjectionCallbackHandler(void*,
void readFaultInjectorConfig(void)
{
if (globalControl.configFilePath.empty()) {
spdlog::error("specify convig via environment {}", configFilePathEnv);
CUDF_LOG_ERROR("specify convig via environment {}", configFilePathEnv);
return;
}
std::ifstream jsonStream(globalControl.configFilePath);
if (!jsonStream.good()) {
spdlog::error("check file exists {}", globalControl.configFilePath);
CUDF_LOG_ERROR("check file exists {}", globalControl.configFilePath);
return;
}

// to retrieve and the numeric value of spdlog:level::level_enum
// https://github.com/gabime/spdlog/blob/d546201f127c306ec8a0082d57562a05a049af77/include/spdlog/common.h#L198-L204
// The numeric value of level_enum is retrieved from
// https://github.com/rapidsai/rapids-logger/blob/main/logger.hpp.in#L40
const std::string logLevelKey = "logLevel";

// A Boolean flag as to whether to watch for config file modifications
Expand Down Expand Up @@ -392,29 +393,28 @@ void readFaultInjectorConfig(void)

const unsigned seed =
globalControl.configRoot.get_optional<unsigned>(seedKey).value_or(std::time(0));
spdlog::info("Seeding std::srand with {}", seed);
CUDF_LOG_INFO("Seeding std::srand with {}", seed);
std::srand(seed);

const spdlog::level::level_enum logLevelEnum = static_cast<spdlog::level::level_enum>(logLevel);
spdlog::info("changed log level to {}", logLevel);
spdlog::set_level(logLevelEnum);
CUDF_LOG_INFO("changed log level to {}", logLevel);
cudf::default_logger().set_level(static_cast<cudf::level_enum>(logLevel));
traceConfig(globalControl.configRoot);

globalControl.driverFaultConfigs = globalControl.configRoot.get_child_optional(driverFaultsKey);
globalControl.runtimeFaultConfigs =
globalControl.configRoot.get_child_optional(runtimeFaultsKey);
} catch (boost::property_tree::json_parser::json_parser_error& error) {
spdlog::error("error parsing fault injector config, still editing? {}", error.what());
CUDF_LOG_ERROR("error parsing fault injector config, still editing? {}", error.what());
}
PTHREAD_CALL(pthread_rwlock_unlock(&globalControl.configLock));
jsonStream.close();
spdlog::debug("readFaultInjectorConfig from {} DONE", globalControl.configFilePath);
CUDF_LOG_DEBUG("readFaultInjectorConfig from {} DONE", globalControl.configFilePath);
}

void traceConfig(boost::property_tree::ptree const& pTree)
{
for (auto it = pTree.begin(); it != pTree.end(); ++it) {
spdlog::trace("congig key={} value={}", it->first, it->second.get_value<std::string>());
CUDF_LOG_TRACE("congig key={} value={}", it->first, it->second.get_value<std::string>());
traceConfig(it->second);
}
}
Expand All @@ -432,17 +432,17 @@ int eventCheck(int fd)

void* dynamicReconfig(void*)
{
spdlog::debug("config watcher thread: inotify_init()");
CUDF_LOG_DEBUG("config watcher thread: inotify_init()");
const int inotifyFd = inotify_init();
if (inotifyFd < 0) {
spdlog::error("inotify_init() failed");
CUDF_LOG_ERROR("inotify_init() failed");
return nullptr;
}
spdlog::debug("config watcher thread: inotify_add_watch {}", globalControl.configFilePath);
CUDF_LOG_DEBUG("config watcher thread: inotify_add_watch {}", globalControl.configFilePath);
const int watchFd = inotify_add_watch(inotifyFd, globalControl.configFilePath.c_str(), IN_MODIFY);
if (watchFd < 0) {
spdlog::error("config watcher thread: inotify_add_watch {} failed",
globalControl.configFilePath);
CUDF_LOG_ERROR("config watcher thread: inotify_add_watch {} failed",
globalControl.configFilePath);
return nullptr;
}

Expand All @@ -454,32 +454,32 @@ void* dynamicReconfig(void*)
char eventBuffer[BUF_LEN];

while (!globalControl.terminateThread) {
spdlog::trace("about to call eventCheck");
CUDF_LOG_TRACE("about to call eventCheck");
const int eventCheckRes = eventCheck(inotifyFd);
spdlog::trace("eventCheck returned {}", eventCheckRes);
CUDF_LOG_TRACE("eventCheck returned {}", eventCheckRes);
if (eventCheckRes > 0) {
const int length = read(inotifyFd, eventBuffer, BUF_LEN);
spdlog::debug("config watcher thread: read {} bytes", length);
CUDF_LOG_DEBUG("config watcher thread: read {} bytes", length);
if (length < EVENT_SIZE) { continue; }
for (int i = 0; i < length;) {
struct inotify_event* event = (struct inotify_event*)&eventBuffer[i];
spdlog::debug("modfiled file detected: {}", event->name);
CUDF_LOG_DEBUG("modfiled file detected: {}", event->name);
i += EVENT_SIZE + event->len;
}
readFaultInjectorConfig();
}
}

if (watchFd >= 0) {
spdlog::debug("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd);
CUDF_LOG_DEBUG("config watcher thread: inotify_rm_watch {} {}", inotifyFd, watchFd);
inotify_rm_watch(inotifyFd, watchFd);
}
if (inotifyFd >= 0) {
spdlog::debug("config watcher thread: close {}", inotifyFd);
CUDF_LOG_DEBUG("config watcher thread: close {}", inotifyFd);
close(inotifyFd);
}
spdlog::info("exiting dynamic reconfig thread: terminateThread={}",
globalControl.terminateThread);
CUDF_LOG_INFO("exiting dynamic reconfig thread: terminateThread={}",
globalControl.terminateThread);
return nullptr;
}

Expand All @@ -490,9 +490,9 @@ void* dynamicReconfig(void*)
*/
extern "C" int InitializeInjection(void)
{
spdlog::info("cuInit entry point for libcufaultinj InitializeInjection");
CUDF_LOG_INFO("cuInit entry point for libcufaultinj InitializeInjection");
// intial log level is trace until the config is read
spdlog::set_level(spdlog::level::trace);
cudf::default_logger().set_level(cudf::level_enum::trace);

if (globalControl.initialized) { return 1; }
// Init globalControl
Expand All @@ -501,7 +501,7 @@ extern "C" int InitializeInjection(void)
registerAtExitHandler();

if (globalControl.dynamic) {
spdlog::debug("creating a thread to watch the fault injector config interactively");
CUDF_LOG_DEBUG("creating a thread to watch the fault injector config interactively");
PTHREAD_CALL(pthread_create(&globalControl.dynamicThread, nullptr, dynamicReconfig, nullptr));
}

Expand Down
Loading
Loading