mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-02-23 14:44:09 +00:00
Formatting
This commit is contained in:
@@ -5963,73 +5963,72 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
|
||||
if (model.layers[il].wkq_a_mqa) {
|
||||
auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
|
||||
cb(mqa, "mqa", il);
|
||||
size_t qnb1;
|
||||
if (!is_lite) {
|
||||
q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
|
||||
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
||||
qnb1 = q->nb[1];
|
||||
cb(q, "q", il);
|
||||
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], q_lora_rank*ggml_element_size(mqa));
|
||||
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
|
||||
q_lora_rank*ggml_element_size(mqa));
|
||||
} else {
|
||||
q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
|
||||
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], n_embd_k_gqa*ggml_element_size(mqa));
|
||||
//printf("mqa: %ld x %ld, q: %ld x %ld, kvrc: %ld x %ld, src0: %ld x %ld\n", mqa->ne[0], mqa->ne[1], q->ne[0], q->ne[1], kv_rope_compressed->ne[0], kv_rope_compressed->ne[1], model.layers[il].wkq_a_mqa->ne[0], model.layers[il].wkq_a_mqa->ne[1]);
|
||||
kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1],
|
||||
n_embd_k_gqa*ggml_element_size(mqa));
|
||||
qnb1 = mqa->nb[1];
|
||||
}
|
||||
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], 0);
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, 0);
|
||||
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k), qnb1, ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
|
||||
mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
|
||||
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
|
||||
}
|
||||
else {
|
||||
if (!is_lite) {
|
||||
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
||||
cb(q, "q", il);
|
||||
if (!is_lite) {
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
||||
cb(q, "q", il);
|
||||
|
||||
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
||||
cb(kv_rope_compressed, "kv_rope_compressed", il);
|
||||
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
||||
cb(kv_rope_compressed, "kv_rope_compressed", il);
|
||||
|
||||
ggml_build_forward_expand(gf, q);
|
||||
ggml_build_forward_expand(gf, kv_rope_compressed);
|
||||
ggml_build_forward_expand(gf, q);
|
||||
ggml_build_forward_expand(gf, kv_rope_compressed);
|
||||
|
||||
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(q, "q", il);
|
||||
q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
|
||||
cb(q, "q", il);
|
||||
|
||||
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
||||
cb(q, "q", il);
|
||||
} else {
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(q, "q", il);
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
||||
cb(q, "q", il);
|
||||
} else {
|
||||
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(q, "q", il);
|
||||
|
||||
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
||||
cb(kv_rope_compressed, "kv_rope_compressed", il);
|
||||
kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
||||
cb(kv_rope_compressed, "kv_rope_compressed", il);
|
||||
|
||||
ggml_build_forward_expand(gf, q);
|
||||
ggml_build_forward_expand(gf, kv_rope_compressed);
|
||||
}
|
||||
ggml_build_forward_expand(gf, q);
|
||||
ggml_build_forward_expand(gf, kv_rope_compressed);
|
||||
}
|
||||
|
||||
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
||||
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
|
||||
q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);
|
||||
|
||||
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
||||
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k),
|
||||
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
||||
ggml_row_size(q->type, n_embd_head_qk_nope));
|
||||
|
||||
// and {n_embd_head_qk_rope, n_tokens}
|
||||
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
|
||||
kv_rope_compressed->nb[1],
|
||||
kv_rope_compressed->nb[1],
|
||||
ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
|
||||
k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
|
||||
kv_rope_compressed->nb[1],
|
||||
kv_rope_compressed->nb[1],
|
||||
ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
|
||||
|
||||
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
|
||||
kv_rope_compressed->nb[1], 0);
|
||||
kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
|
||||
kv_rope_compressed->nb[1], 0);
|
||||
}
|
||||
cb(q_nope, "q_nope", il);
|
||||
cb(q_rope, "q_rope", il);
|
||||
|
||||
Reference in New Issue
Block a user