From b73d66f76e1ea08f259ff2d89c590ece9f1e1c84 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Nov 2025 19:09:04 +0200 Subject: [PATCH] Formatting --- src/llama-build-context.cpp | 83 ++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 0e2019df..77792f95 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -5963,73 +5963,72 @@ ggml_cgraph * llm_build_context::build_deepseek2() { if (model.layers[il].wkq_a_mqa) { auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur); cb(mqa, "mqa", il); + size_t qnb1; if (!is_lite) { q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0); q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il); q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + qnb1 = q->nb[1]; cb(q, "q", il); - kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], q_lora_rank*ggml_element_size(mqa)); + kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], + q_lora_rank*ggml_element_size(mqa)); } else { q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0); - kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], n_embd_k_gqa*ggml_element_size(mqa)); - //printf("mqa: %ld x %ld, q: %ld x %ld, kvrc: %ld x %ld, src0: %ld x %ld\n", mqa->ne[0], mqa->ne[1], q->ne[0], q->ne[1], kv_rope_compressed->ne[0], kv_rope_compressed->ne[1], model.layers[il].wkq_a_mqa->ne[0], model.layers[il].wkq_a_mqa->ne[1]); + kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], + n_embd_k_gqa*ggml_element_size(mqa)); + qnb1 = mqa->nb[1]; } q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], 0); + ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0); q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], ggml_row_size(q->type, n_embd_head_qk_nope)); + ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope)); k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens, mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank)); kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0); } else { - if (!is_lite) { - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); - cb(q, "q", il); + if (!is_lite) { + q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); + cb(q, "q", il); - kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_rope_compressed, "kv_rope_compressed", il); + kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_rope_compressed, "kv_rope_compressed", il); - ggml_build_forward_expand(gf, q); - ggml_build_forward_expand(gf, kv_rope_compressed); + ggml_build_forward_expand(gf, q); + ggml_build_forward_expand(gf, kv_rope_compressed); - q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il); - cb(q, "q", il); + q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il); + cb(q, "q", il); - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} - q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); - cb(q, "q", il); - } else { - q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(q, "q", il); + q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); + cb(q, "q", il); + } else { + q = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(q, "q", il); - kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_rope_compressed, "kv_rope_compressed", il); + kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_rope_compressed, "kv_rope_compressed", il); - ggml_build_forward_expand(gf, q); - ggml_build_forward_expand(gf, kv_rope_compressed); - } + ggml_build_forward_expand(gf, q); + ggml_build_forward_expand(gf, kv_rope_compressed); + } - // split into {n_head * n_embd_head_qk_nope, n_tokens} - q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0); + q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0); - // and {n_head * n_embd_head_qk_rope, n_tokens} - q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), - ggml_row_size(q->type, n_embd_head_qk_nope)); + q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, hparams.n_embd_head_k), + ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, n_embd_head_qk_nope)); - // and {n_embd_head_qk_rope, n_tokens} - k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens, - kv_rope_compressed->nb[1], - kv_rope_compressed->nb[1], - ggml_row_size(kv_rope_compressed->type, kv_lora_rank)); + k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compressed->nb[1], + kv_rope_compressed->nb[1], + ggml_row_size(kv_rope_compressed->type, kv_lora_rank)); - kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, - kv_rope_compressed->nb[1], 0); + kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, + kv_rope_compressed->nb[1], 0); } cb(q_nope, "q_nope", il); cb(q_rope, "q_rope", il);