diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 9c8c91f4..5faba723 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -98,6 +98,17 @@ static void ggml_cuda_log(enum ggml_log_level level, const char * format, ...) {
     }
 }
 
+// Logs the traced calls still held in the ring buffer (at most kNumStored), newest first.
+void Tracer::print_calls() const {
+    if (num_calls == 0) return;
+    GGML_CUDA_LOG_ERROR("========================== CUDA trace: %lld previous calls\n", (long long) num_calls);
+    const int64_t first = std::max(int64_t(0), num_calls - kNumStored); // oldest call not yet overwritten
+    for (int64_t i = num_calls-1; i >= first; --i) {
+        const auto & call = calls[i%kNumStored];
+        GGML_CUDA_LOG_ERROR("%12lld: function %s, file %s, line %d\n", (long long) i, call.func.c_str(), call.file.c_str(), call.line);
+    }
+}
+
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
     int id = -1; // in case cudaGetDevice fails
@@ -106,6 +117,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
     GGML_CUDA_LOG_ERROR("CUDA error: %s\n", msg);
     GGML_CUDA_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
     GGML_CUDA_LOG_ERROR("  %s\n", stmt);
+    Tracer::instance().print_calls();
     // abort with GGML_ASSERT to get a stack trace
     GGML_ABORT("CUDA error");
 }
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a04a1929..db45a55f 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -25,6 +25,7 @@
 #include <cfloat>
 #include <string>
 #include <vector>
+#include <mutex>
 
 #if defined(GGML_USE_HIPBLAS)
 #include "vendors/hip.h"
@@ -66,11 +67,44 @@
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg);
 
+// Singleton ring buffer holding the call sites of the last kNumStored successful CUDA_CHECK_GEN calls.
+struct Tracer {
+    constexpr static int kNumStored = 32;
+    struct Call {
+        std::string func;
+        std::string file;
+        int         line;
+    };
+    std::mutex mutex;                   // serializes writers in add_call()
+    std::array<Call, kNumStored> calls; // call number i lives in calls[i % kNumStored]
+    int64_t num_calls = 0;              // total number of calls recorded so far
+
+    // Stores one call site, overwriting the oldest slot once all kNumStored slots are used.
+    inline void add_call(const char * func, const char * file, int line) {
+        std::lock_guard<std::mutex> lock(mutex);
+        Call & slot = calls[num_calls%kNumStored];
+        slot.func = func; // assign in place: reuses the slot's string storage instead of
+        slot.file = file; // building two temporary std::string objects on every call
+        slot.line = line;
+        ++num_calls;
+    }
+
+    // Returns the single shared instance (function-local static, created on first use).
+    static Tracer& instance() { static Tracer tracer; return tracer; }
+    // Entry point used by CUDA_CHECK_GEN: records the call site in the shared instance.
+    static inline void register_call(const char * func, const char * file, int line) { instance().add_call(func, file, line); }
+
+    void print_calls() const;    // defined out of line in ggml-cuda.cu
+    ~Tracer() { print_calls(); } // the trace is also printed when the instance is destroyed
+};
+
 #define CUDA_CHECK_GEN(err, success, error_fn)                                      \
      do {                                                                           \
         auto err_ = (err);                                                          \
         if (err_ != (success)) {                                                    \
             ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_));    \
+        } else { /* success: record this call site in the Tracer ring buffer */     \
+            Tracer::register_call(__func__, __FILE__, __LINE__);                  \
         }                                                                           \
     } while (0)