/************************************************************************* * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "debug.h" #include #include #include #include #include int mscclppDebugLevel = -1; static int pid = -1; static char hostname[1024]; thread_local int mscclppDebugNoWarn = 0; char mscclppLastError[1024] = ""; // Global string for the last error in human readable form uint64_t mscclppDebugMask = MSCCLPP_INIT; // Default debug sub-system mask is INIT FILE* mscclppDebugFile = stdout; mscclppLogHandler_t mscclppDebugLogHandler = NULL; pthread_mutex_t mscclppDebugLock = PTHREAD_MUTEX_INITIALIZER; std::chrono::steady_clock::time_point mscclppEpoch; static __thread int tid = -1; void mscclppDebugDefaultLogHandler(const char* msg) { fwrite(msg, 1, strlen(msg), mscclppDebugFile); } void mscclppDebugInit() { pthread_mutex_lock(&mscclppDebugLock); if (mscclppDebugLevel != -1) { pthread_mutex_unlock(&mscclppDebugLock); return; } const char* mscclpp_debug = getenv("MSCCLPP_DEBUG"); int tempNcclDebugLevel = -1; if (mscclpp_debug == NULL) { tempNcclDebugLevel = MSCCLPP_LOG_NONE; } else if (strcasecmp(mscclpp_debug, "VERSION") == 0) { tempNcclDebugLevel = MSCCLPP_LOG_VERSION; } else if (strcasecmp(mscclpp_debug, "WARN") == 0) { tempNcclDebugLevel = MSCCLPP_LOG_WARN; } else if (strcasecmp(mscclpp_debug, "INFO") == 0) { tempNcclDebugLevel = MSCCLPP_LOG_INFO; } else if (strcasecmp(mscclpp_debug, "ABORT") == 0) { tempNcclDebugLevel = MSCCLPP_LOG_ABORT; } else if (strcasecmp(mscclpp_debug, "TRACE") == 0) { tempNcclDebugLevel = MSCCLPP_LOG_TRACE; } /* Parse the MSCCLPP_DEBUG_SUBSYS env var * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ char* mscclppDebugSubsysEnv = getenv("MSCCLPP_DEBUG_SUBSYS"); if (mscclppDebugSubsysEnv != NULL) { int invert = 0; if (mscclppDebugSubsysEnv[0] == '^') { invert = 1; mscclppDebugSubsysEnv++; } mscclppDebugMask = invert ? ~0ULL : 0ULL; char* mscclppDebugSubsys = strdup(mscclppDebugSubsysEnv); char* subsys = strtok(mscclppDebugSubsys, ","); while (subsys != NULL) { uint64_t mask = 0; if (strcasecmp(subsys, "INIT") == 0) { mask = MSCCLPP_INIT; } else if (strcasecmp(subsys, "COLL") == 0) { mask = MSCCLPP_COLL; } else if (strcasecmp(subsys, "P2P") == 0) { mask = MSCCLPP_P2P; } else if (strcasecmp(subsys, "SHM") == 0) { mask = MSCCLPP_SHM; } else if (strcasecmp(subsys, "NET") == 0) { mask = MSCCLPP_NET; } else if (strcasecmp(subsys, "GRAPH") == 0) { mask = MSCCLPP_GRAPH; } else if (strcasecmp(subsys, "TUNING") == 0) { mask = MSCCLPP_TUNING; } else if (strcasecmp(subsys, "ENV") == 0) { mask = MSCCLPP_ENV; } else if (strcasecmp(subsys, "ALLOC") == 0) { mask = MSCCLPP_ALLOC; } else if (strcasecmp(subsys, "CALL") == 0) { mask = MSCCLPP_CALL; } else if (strcasecmp(subsys, "ALL") == 0) { mask = MSCCLPP_ALL; } if (mask) { if (invert) mscclppDebugMask &= ~mask; else mscclppDebugMask |= mask; } subsys = strtok(NULL, ","); } free(mscclppDebugSubsys); } // Cache pid and hostname getHostName(hostname, 1024, '.'); pid = getpid(); /* Parse and expand the MSCCLPP_DEBUG_FILE path and * then create the debug file. But don't bother unless the * MSCCLPP_DEBUG level is > VERSION */ const char* mscclppDebugFileEnv = getenv("MSCCLPP_DEBUG_FILE"); if (tempNcclDebugLevel > MSCCLPP_LOG_VERSION && mscclppDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX + 1] = ""; char* dfn = debugFn; while (mscclppDebugFileEnv[c] != '\0' && c < PATH_MAX) { if (mscclppDebugFileEnv[c++] != '%') { *dfn++ = mscclppDebugFileEnv[c - 1]; continue; } switch (mscclppDebugFileEnv[c++]) { case '%': // Double % *dfn++ = '%'; break; case 'h': // %h = hostname dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid dfn += snprintf(dfn, PATH_MAX, "%d", pid); break; default: // Echo everything we don't understand *dfn++ = '%'; *dfn++ = mscclppDebugFileEnv[c - 1]; break; } } *dfn = '\0'; if (debugFn[0] != '\0') { FILE* file = fopen(debugFn, "w"); if (file != nullptr) { setbuf(file, nullptr); // disable buffering mscclppDebugFile = file; } } } if (mscclppDebugLogHandler == NULL) mscclppDebugLogHandler = mscclppDefaultLogHandler; mscclppEpoch = std::chrono::steady_clock::now(); __atomic_store_n(&mscclppDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); pthread_mutex_unlock(&mscclppDebugLock); } /* Common logging function used by the INFO, WARN and TRACE macros * Also exported to the dynamically loadable Net transport modules so * they can share the debugging mechanisms and output files */ void mscclppDebugLog(mscclppDebugLogLevel level, unsigned long flags, const char* filefunc, int line, const char* fmt, ...) { if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit(); if (mscclppDebugNoWarn != 0 && level == MSCCLPP_LOG_WARN) { level = MSCCLPP_LOG_INFO; flags = mscclppDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == MSCCLPP_LOG_WARN) { pthread_mutex_lock(&mscclppDebugLock); va_list vargs; va_start(vargs, fmt); (void)vsnprintf(mscclppLastError, sizeof(mscclppLastError), fmt, vargs); va_end(vargs); pthread_mutex_unlock(&mscclppDebugLock); } if (mscclppDebugLevel < level || ((flags & mscclppDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); } int cudaDev; if (!(level == MSCCLPP_LOG_TRACE && flags == MSCCLPP_CALL)) { cudaGetDevice(&cudaDev); } char buffer[1024]; size_t len = 0; if (level == MSCCLPP_LOG_WARN) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %s:%d MSCCLPP WARN ", hostname, pid, tid, cudaDev, filefunc, line); } else if (level == MSCCLPP_LOG_INFO) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] MSCCLPP INFO ", hostname, pid, tid, cudaDev); } else if (level == MSCCLPP_LOG_TRACE && flags == MSCCLPP_CALL) { len = snprintf(buffer, sizeof(buffer), "%s:%d:%d MSCCLPP CALL ", hostname, pid, tid); } else if (level == MSCCLPP_LOG_TRACE) { auto delta = std::chrono::steady_clock::now() - mscclppEpoch; double timestamp = std::chrono::duration_cast>(delta).count() * 1000; len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d MSCCLPP TRACE ", hostname, pid, tid, cudaDev, timestamp, filefunc, line); } if (len > 0) { va_list vargs; va_start(vargs, fmt); int ret = vsnprintf(buffer + len, sizeof(buffer) - len, fmt, vargs); va_end(vargs); if (ret >= 0) { len += ret; buffer[len++] = '\n'; buffer[len] = '\0'; mscclppDebugLogHandler(buffer); } } } mscclppResult_t mscclppDebugSetLogHandler(mscclppLogHandler_t handler) { if (__atomic_load_n(&mscclppDebugLevel, __ATOMIC_ACQUIRE) == -1) mscclppDebugInit(); if (handler == NULL) return mscclppInvalidArgument; pthread_mutex_lock(&mscclppDebugLock); mscclppDebugLogHandler = handler; pthread_mutex_unlock(&mscclppDebugLock); return mscclppSuccess; } void mscclppSetThreadName(pthread_t thread, const char* fmt, ...) { // pthread_setname_np is nonstandard GNU extension // needs the following feature test macro #ifdef _GNU_SOURCE char threadName[MSCCLPP_THREAD_NAMELEN]; va_list vargs; va_start(vargs, fmt); vsnprintf(threadName, MSCCLPP_THREAD_NAMELEN, fmt, vargs); va_end(vargs); pthread_setname_np(thread, threadName); #endif }