WIP

2026-05-11 00:20:19 +00:00 · 2025-10-29 09:45:30 +02:00
parent d73914c70b
commit 446b4a4da3
1 changed files with 75 additions and 26 deletions
--- a/src/llama-load-tensors.cpp
+++ b/src/llama-load-tensors.cpp
@@ -28,6 +28,8 @@ struct create_tensors_helper : public create_tensors_helper_interface {

    virtual size_t get_ctx_size() const override { return ctx_size; }

+    bool merge_qkv(const LLM_TN & tn, int i, bool bias);
+
    bool create_tensors() override;

    bool create_llama_tensors(const LLM_TN & tn);
@@ -1044,33 +1046,8 @@ bool create_tensors_helper::create_qwen3_moe_tensors(const LLM_TN & tn) {

        layer.attn_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-        auto wq_name = tn(LLM_TENSOR_ATTN_Q, "weight", i);
-        auto wk_name = tn(LLM_TENSOR_ATTN_K, "weight", i);
-        auto wv_name = tn(LLM_TENSOR_ATTN_V, "weight", i);
-        auto wq = ml.require_tensor_meta(wq_name.c_str());
-        auto wk = ml.require_tensor_meta(wk_name.c_str());
-        auto wv = ml.require_tensor_meta(wv_name.c_str());
+        if (merge_qkv(tn, i, false)) use_mmap_buffer = false;

-        bool fused_qkv = false;
-        if (wq->type == wk->type && wq->type == wv->type) {
-            GGML_ASSERT(wq->ne[0] == n_embd && wq->ne[1] == n_head * n_rot);
-            GGML_ASSERT(wk->ne[0] == n_embd && wk->ne[1] == n_head_kv * n_rot);
-            GGML_ASSERT(wv->ne[0] == n_embd && wv->ne[1] == n_head_kv * n_rot);
-            layer.wqkv = ggml_new_tensor_2d(ctx_split, wq->type, n_embd, n_rot * (n_head + n_head_kv + n_head_kv));
-            ggml_set_name(layer.wqkv, tn(LLM_TENSOR_ATTN_QKV, "weight", i).c_str());
-            layer.wq = ml.create_tensor_as_view(ctx_split, layer.wqkv, wq_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
-            layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
-            layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1] );
-            fused_qkv = true;
-            use_mmap_buffer = false;
-            printf("Created fused qkv %s\n", layer.wqkv->name);
-        }
-
-        if (!fused_qkv) {
-            layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
-            layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
-            layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
-        }
        layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});

        layer.attn_k_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k});
@@ -2445,6 +2422,78 @@ bool create_tensors_helper::create_openai_moe_tensors(const LLM_TN & tn) {
    return use_mmap_buffer;
 }

+bool create_tensors_helper::merge_qkv(const LLM_TN & tn, int i, bool bias) { // Creates the Q/K/V weights (plus biases when `bias` is true) of layer i, as views into one fused wqkv tensor when possible; returns true if fused.
+    auto& hparams = model.hparams;
+    const int64_t n_head        = hparams.n_head();
+    const int64_t n_head_kv     = hparams.n_head_kv();
+    const int64_t n_embd        = hparams.n_embd;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
+    const int64_t n_embd_head_k = hparams.n_embd_head_k;
+    const int64_t n_embd_gqa    = n_embd_v_gqa; // alias: the fallback path below sizes both K and V with this value
+    const int64_t n_rot         = hparams.n_rot;
+
+    ggml_context * ctx_layer = ctx_for_layer(i); // used below for the bias tensors
+    ggml_context * ctx_split = ctx_for_layer_split(i); // used below for the weight tensors
+
+    auto & layer = model.layers[i];
+
+    auto wq_name = tn(LLM_TENSOR_ATTN_Q, "weight", i);
+    auto wk_name = tn(LLM_TENSOR_ATTN_K, "weight", i);
+    auto wv_name = tn(LLM_TENSOR_ATTN_V, "weight", i);
+    auto wq = ml.require_tensor_meta(wq_name.c_str());
+    auto wk = ml.require_tensor_meta(wk_name.c_str());
+    auto wv = ml.require_tensor_meta(wv_name.c_str());
+    GGML_ASSERT(wq && wk && wv);
+
+    bool fused_qkv = false;
+    if (wq->type == wk->type && wq->type == wv->type) { // fuse only when all three weights share one ggml type
+        GGML_ASSERT(wq->ne[0] == n_embd && wq->ne[1] == n_head * n_rot); // NOTE(review): shape checked with n_rot here, but the fallback path uses n_embd_head_k * n_head - confirm these always agree
+        GGML_ASSERT(wk->ne[0] == n_embd && wk->ne[1] == n_head_kv * n_rot);
+        GGML_ASSERT(wv->ne[0] == n_embd && wv->ne[1] == n_head_kv * n_rot);
+        layer.wqkv = ggml_new_tensor_2d(ctx_split, wq->type, n_embd, n_rot * (n_head + n_head_kv + n_head_kv)); // second dimension is the sum of the Q, K and V second dimensions asserted above
+        snprintf(layer.wqkv->name, GGML_MAX_NAME, "blk.%d.attn_qkv.weight", i);
+        // The name is written by hand above because the call below does not work: if we are doing this merge manually,
+        // it basically means that the arch does not have an LLM_TENSOR_ATTN_QKV entry, so we would get __missing__ as the tensor name.
+        //ggml_set_name(layer.wqkv, tn(LLM_TENSOR_ATTN_QKV, "weight", i).c_str());
+        layer.wq = ml.create_tensor_as_view(ctx_split, layer.wqkv, wq_name.c_str(), { wq->ne[0], wq->ne[1] }, 0); // Q view: offset 0 into wqkv
+        layer.wk = ml.create_tensor_as_view(ctx_split, layer.wqkv, wk_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]); // K view: offset is wq->ne[1] * wq->nb[1]
+        layer.wv = ml.create_tensor_as_view(ctx_split, layer.wqkv, wv_name.c_str(), { wv->ne[0], wv->ne[1] }, wq->ne[1]*wq->nb[1] + wk->ne[1]*wk->nb[1] ); // V view: offset is the Q term plus the K term
+        fused_qkv = true;
+        printf("Created fused qkv %s\n", layer.wqkv->name);
+        if (bias) { // same scheme for the biases: one bqkv tensor with bq/bk/bv as views into it
+            auto bq_name = tn(LLM_TENSOR_ATTN_Q, "bias", i);
+            auto bk_name = tn(LLM_TENSOR_ATTN_K, "bias", i);
+            auto bv_name = tn(LLM_TENSOR_ATTN_V, "bias", i);
+            auto bq = ml.require_tensor_meta(bq_name.c_str());
+            auto bk = ml.require_tensor_meta(bk_name.c_str());
+            auto bv = ml.require_tensor_meta(bv_name.c_str());
+            GGML_ASSERT(bq && bk && bv);
+            GGML_ASSERT(bq->type == GGML_TYPE_F32 && bk->type == GGML_TYPE_F32 && bv->type == GGML_TYPE_F32); // the fused bias path requires all three biases to be F32
+            GGML_ASSERT(ggml_nrows(bq) == 1 && bq->ne[0] == wq->ne[1]); // each bias is a single row whose length equals ne[1] of its weight
+            GGML_ASSERT(ggml_nrows(bk) == 1 && bk->ne[0] == wk->ne[1]);
+            GGML_ASSERT(ggml_nrows(bv) == 1 && bv->ne[0] == wv->ne[1]);
+            layer.bqkv = ggml_new_tensor_1d(ctx_layer, bq->type, n_rot * (n_head + n_head_kv + n_head_kv));
+            snprintf(layer.bqkv->name, GGML_MAX_NAME, "blk.%d.attn_qkv.bias", i);
+            layer.bq = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bq_name.c_str(), { bq->ne[0] }, 0);
+            layer.bk = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bk_name.c_str(), { bk->ne[0] }, bq->ne[0]*bq->nb[0]); // offset is bq->ne[0] * bq->nb[0]
+            layer.bv = ml.create_tensor_as_view(ctx_layer, layer.bqkv, bv_name.c_str(), { bv->ne[0] }, bq->ne[0]*bq->nb[0] + bk->ne[0]*bk->nb[0] ); // offset is the bq term plus the bk term
+        }
+    }
+
+    if (!fused_qkv) { // fallback when the weight types differ: create Q, K and V as separate tensors
+        layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd_head_k * n_head});
+        layer.wk = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
+        layer.wv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
+        if (bias) {
+            layer.bq = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_head * n_rot}, 0);
+            layer.bk = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_head_kv * n_rot}, 0);
+            layer.bv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_head_kv * n_rot}, 0);
+        }
+    }
+
+    return fused_qkv; // in the create_qwen3_moe_tensors hunk of this diff, the caller sets use_mmap_buffer = false when this is true
+}
+
 bool create_tensors_helper::create_tensors() {
    const auto tn = LLM_TN(model.arch);
    bool use_mmap_buffer = true;