Add timing info to CUDA graph evaluation

2026-02-24 23:24:13 +00:00 · 2025-02-25 08:01:25 +02:00
parent d7ef3a53a7
commit c2a02dfd09
1 changed file with 11 additions and 0 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -50,6 +50,8 @@
 #include <string>
 #include <vector>

+#define IK_PRINT_TIMING 0
+
 static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");

 static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
@@ -2446,6 +2448,10 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
    }

+#if IK_PRINT_TIMING
+    int64_t tim1 = ggml_time_us();
+#endif
+
    switch (dst->op) {
        case GGML_OP_REPEAT:
            ggml_cuda_op_repeat(ctx, dst);
@@ -2618,6 +2624,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        CUDA_CHECK(err);
    }

+#if IK_PRINT_TIMING
+    int64_t tim2 = ggml_time_us();
+    printf("%s(%s): %d us\n", ggml_op_name(dst->op), dst->name, (int)(tim2 - tim1));
+#endif
+
    return true;
 }