Formatting

This commit is contained in:
Iwan Kawrakow
2025-11-11 19:09:04 +02:00
parent 0576d42183
commit b73d66f76e

View File

@@ -5963,73 +5963,72 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
if (model.layers[il].wkq_a_mqa) {
auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
cb(mqa, "mqa", il);
size_t qnb1;
if (!is_lite) {
q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
qnb1 = q->nb[1];
cb(q, "q", il);
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], q_lora_rank*ggml_element_size(mqa));
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
q_lora_rank*ggml_element_size(mqa));
} else {
q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], n_embd_k_gqa*ggml_element_size(mqa));
//printf("mqa: %ld x %ld, q: %ld x %ld, kvrc: %ld x %ld, src0: %ld x %ld\n", mqa->ne[0], mqa->ne[1], q->ne[0], q->ne[1], kv_rope_compressed->ne[0], kv_rope_compressed->ne[1], model.layers[il].wkq_a_mqa->ne[0], model.layers[il].wkq_a_mqa->ne[1]);
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
n_embd_k_gqa*ggml_element_size(mqa));
qnb1 = mqa->nb[1];
}
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], 0);
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0);
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], ggml_row_size(q->type, n_embd_head_qk_nope));
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope));
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
}
else {
if (!is_lite) {
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);
if (!is_lite) {
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
cb(q, "q", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
cb(q, "q", il);
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
cb(q, "q", il);
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
} else {
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(q, "q", il);
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
cb(q, "q", il);
} else {
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
cb(q, "q", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
cb(kv_rope_compressed, "kv_rope_compressed", il);
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);
}
ggml_build_forward_expand(gf, q);
ggml_build_forward_expand(gf, kv_rope_compressed);
}
// split into {n_head * n_embd_head_qk_nope, n_tokens}
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
// and {n_head * n_embd_head_qk_rope, n_tokens}
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
ggml_row_size(q->type, hparams.n_embd_head_k),
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
ggml_row_size(q->type, n_embd_head_qk_nope));
// and {n_embd_head_qk_rope, n_tokens}
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_rope_compressed->nb[1],
kv_rope_compressed->nb[1],
ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
kv_rope_compressed->nb[1],
kv_rope_compressed->nb[1],
ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
kv_rope_compressed->nb[1], 0);
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
kv_rope_compressed->nb[1], 0);
}
cb(q_nope, "q_nope", il);
cb(q_rope, "q_rope", il);