mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-04-28 10:21:48 +00:00
Add timing info to CUDA graph evaluation
This commit is contained in:
@@ -50,6 +50,8 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#define IK_PRINT_TIMING 0
|
||||||
|
|
||||||
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||||
|
|
||||||
static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
static void ggml_cuda_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
||||||
@@ -2446,6 +2448,10 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if IK_PRINT_TIMING
|
||||||
|
int64_t tim1 = ggml_time_us();
|
||||||
|
#endif
|
||||||
|
|
||||||
switch (dst->op) {
|
switch (dst->op) {
|
||||||
case GGML_OP_REPEAT:
|
case GGML_OP_REPEAT:
|
||||||
ggml_cuda_op_repeat(ctx, dst);
|
ggml_cuda_op_repeat(ctx, dst);
|
||||||
@@ -2618,6 +2624,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||||||
CUDA_CHECK(err);
|
CUDA_CHECK(err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if IK_PRINT_TIMING
|
||||||
|
int64_t tim2 = ggml_time_us();
|
||||||
|
printf("%s(%s): %d us\n", ggml_op_name(dst->op), dst->name, (int)(tim2 - tim1));
|
||||||
|
#endif
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user