WIP: debug shape printfs and wk_b view stride change in llm_build_context

2026-03-04 19:10:03 +00:00 · 2025-02-12 10:04:33 +02:00
parent 54252d0256
commit 8438b16281
1 changed files with 15 additions and 2 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2547,7 +2547,7 @@ struct llama_layer {
    struct ggml_tensor * wkv_a_mqa;
    struct ggml_tensor * wkv_b;
    struct ggml_tensor * wk_b;
-    struct ggml_tensor * wv_b;    
+    struct ggml_tensor * wv_b;
    struct ggml_tensor * wq_cross;
    struct ggml_tensor * wk_cross;
    struct ggml_tensor * wv_cross;
@@ -13504,12 +13504,21 @@ struct llm_build_context {
                                0);
                    cb(kr_cache, "kr_cache", il);

-                    struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0);
+                    printf("kv_lora_rank = %d, n_head = %d, n_embd_head_qk_nope = %d, n_embd_head_v = %d\n", kv_lora_rank, (int)n_head, n_embd_head_qk_nope, (int)n_embd_head_v);
+                    printf("wk_b: %d x %d x %d x %d, wkv_b: %d x %d x %d x %d\n",
+                            (int)model.layers[il].wk_b->ne[0], (int)model.layers[il].wk_b->ne[1], (int)model.layers[il].wk_b->ne[2], (int)model.layers[il].wk_b->ne[3],
+                            (int)model.layers[il].wkv_b->ne[0], (int)model.layers[il].wkv_b->ne[1], (int)model.layers[il].wkv_b->ne[2], (int)model.layers[il].wkv_b->ne[3]);
+                    struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0);
                    cb(wk_b, "wk_b", il);

                    q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
                    cb(q_nope, "q_nope_perm", il);

+                    //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head,
+                    //        ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0);
+                    //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache);
+                    //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, 
+
                    struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope);
                    cb(q_nope2, "q_nope2", il);

@@ -13519,6 +13528,10 @@ struct llm_build_context {
                    }
                    struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2);
                    cb(kq_nope, "kq_nope", il);
+                    printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n",
+                            (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3],
+                            (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3],
+                            (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]);

                    if (!pp_opt) {
                        kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3);