CUDA call tracer

This commit is contained in:
Iwan Kawrakow
2025-05-21 08:40:25 +03:00
parent a2b5057a0c
commit a8e3e33503
2 changed files with 46 additions and 0 deletions

View File

@@ -98,6 +98,17 @@ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
}
}
// Logs the most recently recorded CUDA calls, newest first.
//
// The calls live in a ring buffer of kNumStored slots, so at most the last
// min(num_calls, kNumStored) calls are still available; older ones have been
// overwritten. Nothing is printed when no call has been recorded.
//
// NOTE(review): this reads `calls`/`num_calls` without taking Tracer::mutex
// (the method is const and the mutex is not declared mutable) — confirm that
// no other thread can still be inside add_call() when this runs.
void Tracer::print_calls() const {
    if (num_calls == 0) return;
    GGML_CUDA_LOG_ERROR("========================== CUDA trace: %zu previous calls\n", static_cast<size_t>(num_calls));
    // Index of the oldest call that is still stored. Slot (i % kNumStored) is
    // overwritten by call i + kNumStored, so everything older than
    // num_calls - kNumStored is gone and must not be printed.
    const int64_t first = std::max(int64_t(0), num_calls - kNumStored);
    for (int64_t i = num_calls - 1; i >= first; --i) {
        const auto & call = calls[i % kNumStored];
        // %zu expects a size_t, hence the explicit cast of the call index.
        GGML_CUDA_LOG_ERROR("%12zu: function %s, file %s, line %d\n", static_cast<size_t>(i), call.func.c_str(), call.file.c_str(), call.line);
    }
}
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
@@ -106,6 +117,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt);
Tracer::instance().print_calls();
// abort with GGML_ASSERT to get a stack trace
GGML_ABORT("CUDA error");
}

View File

@@ -25,6 +25,7 @@
#include <cfloat>
#include <string>
#include <vector>
#include <mutex>
#if defined(GGML_USE_HIPBLAS)
#include "vendors/hip.h"
@@ -66,11 +67,44 @@
// Reports a failed CUDA call and aborts; never returns to the caller.
//   stmt - text of the statement that failed
//   func - name of the function containing the failing statement
//   file - source file of the failing statement
//   line - source line of the failing statement
//   msg  - human-readable description of the error
[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
// Records the most recent successful CUDA calls so they can be dumped when an
// error occurs. A single shared instance is obtained through instance().
struct Tracer {
    // Number of slots in the ring buffer, i.e. how many of the latest calls are kept.
    constexpr static int kNumStored = 32;
    // Source location of one recorded call.
    struct Call {
        std::string func;
        std::string file;
        int line;
    };
    // Serializes writers in add_call().
    std::mutex mutex;
    // Ring buffer: call number i is stored in slot i % kNumStored.
    std::array<Call, kNumStored> calls;
    // Total number of calls recorded so far (not capped at kNumStored).
    int64_t num_calls = 0;
    // Stores the location of one call in the next ring-buffer slot,
    // overwriting the oldest entry once the buffer is full.
    inline void add_call(const char * func, const char * file, int line) {
        std::lock_guard<std::mutex> lock(mutex);
        // Assign into the existing slot instead of building a temporary Call:
        // the strings already living in the slot keep their capacity, so once
        // the buffer has wrapped this normally performs no heap allocation
        // while the lock is held.
        Call & slot = calls[num_calls%kNumStored];
        slot.func = func;
        slot.file = file;
        slot.line = line;
        ++num_calls;
    }
    // Returns the shared tracer, created on first use.
    static Tracer& instance() {
        static Tracer tracer;
        return tracer;
    }
    // Logs the stored calls; defined out of line.
    void print_calls() const;
    // Convenience wrapper: records a call on the shared instance.
    static inline void register_call(const char * func, const char * file, int line) {
        instance().add_call(func, file, line);
    }
    // The trace is also printed when the tracer is destroyed.
    ~Tracer() { print_calls(); }
};
// Evaluates `err` exactly once and compares the result with `success`.
// On mismatch the failure is handed to ggml_cuda_error together with the
// stringified expression, the call site and error_fn(result); otherwise the
// call site is recorded in the Tracer.
#define CUDA_CHECK_GEN(err, success, error_fn)                                   \
    do {                                                                         \
        auto err_ = (err);                                                       \
        if (err_ != (success)) {                                                 \
            /* ggml_cuda_error is [[noreturn]]: nothing below runs on failure */ \
            ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \
        }                                                                        \
        Tracer::register_call(__func__, __FILE__, __LINE__);                     \
    } while (0)