Merge Q and K for DeepSeek
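
The change, in short: the graph previously projected the attention input `cur` twice per DeepSeek layer, once through the Q projection (`attn_q` for DeepSeek-Lite, `attn_q_a` otherwise) and once through the shared `attn_kv_a_mqa` projection. With the new `merge_qkv` loader option, and when both weights have the same quantization type, the two matrices are concatenated row-wise into a single per-layer tensor (`LLM_TENSOR_ATTN_KQ_A_MQA`, layer field `wkq_a_mqa`). `build_deepseek2()` then issues one `ggml_mul_mat` instead of two and splits the fused result back into the Q and compressed-KV parts with views.

A minimal standalone sketch of that pattern, with illustrative dimensions and names (not the committed code; `ggml_graph_compute_with_ctx` is the legacy single-context compute helper):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        const int n_embd = 64, q_rows = 48, kv_rows = 24, n_tokens = 4;

        // fused weight: rows [0, q_rows) hold W_q, rows [q_rows, q_rows + kv_rows) hold W_kv
        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, q_rows + kv_rows);
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        // one matmul instead of two; y is {q_rows + kv_rows, n_tokens}
        struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);

        // split the fused activation back into its two logical parts
        struct ggml_tensor * q  = ggml_view_2d(ctx, y, q_rows,  n_tokens, y->nb[1], 0);
        struct ggml_tensor * kv = ggml_view_2d(ctx, y, kv_rows, n_tokens, y->nb[1],
                                               q_rows*ggml_element_size(y));

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, q);
        ggml_build_forward_expand(gf, kv);
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        ggml_free(ctx);
        return 0;
    }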
@@ -262,6 +262,7 @@ enum llm_tensor {
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
     LLM_TENSOR_ATTN_KV_A_MQA,
+    LLM_TENSOR_ATTN_KQ_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_K_B,
     LLM_TENSOR_ATTN_V_B,
@@ -5927,6 +5927,7 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
     const uint32_t n_embd_head_qk_rope = hparams.n_rot;
     const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
     const uint32_t kv_lora_rank = hparams.n_lora_kv;
+    const uint32_t q_lora_rank = hparams.n_lora_q;

     struct ggml_tensor * cur;
     struct ggml_tensor * inpL;
@@ -5953,12 +5954,46 @@ ggml_cgraph * llm_build_context::build_deepseek2() {

         // self_attention
         {
-            struct ggml_tensor * q = NULL;
+            ggml_tensor * q = nullptr;
+            ggml_tensor * kv_rope_compressed = nullptr;
+            ggml_tensor * q_rope;
+            ggml_tensor * q_nope;
+            ggml_tensor * k_rope;
+            ggml_tensor * kv_compressed;
+            if (model.layers[il].wkq_a_mqa) {
+                auto mqa = ggml_mul_mat(ctx0, model.layers[il].wkq_a_mqa, cur);
+                cb(mqa, "mqa", il);
+                if (!is_lite) {
+                    q = ggml_view_2d(ctx0, mqa, q_lora_rank, n_tokens, mqa->nb[1], 0);
+                    q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
+                    q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                    cb(q, "q", il);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], q_lora_rank*ggml_element_size(mqa));
+                } else {
+                    q = ggml_view_2d(ctx0, mqa, n_embd_k_gqa, n_tokens, mqa->nb[1], 0);
+                    kv_rope_compressed = ggml_view_2d(ctx0, mqa, kv_lora_rank + n_embd_head_qk_rope, n_tokens, mqa->nb[1], n_embd_k_gqa*ggml_element_size(mqa));
+                    //printf("mqa: %ld x %ld, q: %ld x %ld, kvrc: %ld x %ld, src0: %ld x %ld\n", mqa->ne[0], mqa->ne[1], q->ne[0], q->ne[1], kv_rope_compressed->ne[0], kv_rope_compressed->ne[1], model.layers[il].wkq_a_mqa->ne[0], model.layers[il].wkq_a_mqa->ne[1]);
+                }
+                q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], 0);
+                q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k), mqa->nb[1], ggml_row_size(q->type, n_embd_head_qk_nope));
+                k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                        mqa->nb[1], mqa->nb[1], ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
+                kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens, mqa->nb[1], 0);
+            }
+            else {
             if (!is_lite) {
                 // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
                 q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                 cb(q, "q", il);

+                kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_rope_compressed, "kv_rope_compressed", il);
+
+                ggml_build_forward_expand(gf, q);
+                ggml_build_forward_expand(gf, kv_rope_compressed);
+
                 q = llm_build_norm(ctx0, q, hparams, model.layers[il].attn_q_a_norm, NULL, LLM_NORM_RMS, cb, il);
                 cb(q, "q", il);

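One detail in the unmerged path above: `ggml_build_forward_expand()` is now called on `q` and `kv_rope_compressed` right after the two matmuls. ggml appends nodes to the graph in the order they are expanded (each call adds the tensor plus any parents not yet in the graph), so this pins the two projections of `cur` to be evaluated together, ahead of the norm and view chain built afterwards. A toy standalone example of that ordering effect (illustrative sizes, not the committed code):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*mem_size =*/ 8*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 2); // {n_embd, n_tokens}
        struct ggml_tensor * wq  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 16);
        struct ggml_tensor * wkv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32, 8);

        struct ggml_tensor * q  = ggml_mul_mat(ctx, wq,  cur);
        struct ggml_tensor * kv = ggml_mul_mat(ctx, wkv, cur);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, q);   // q's matmul enters the graph first
        ggml_build_forward_expand(gf, kv);  // kv's matmul is appended right after it
        // anything built and expanded later lands after these two nodes,
        // so the two projections of cur always run back-to-back
        ggml_graph_compute_with_ctx(ctx, gf, 1);

        ggml_free(ctx);
        return 0;
    }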
@@ -5968,54 +6003,49 @@ ggml_cgraph * llm_build_context::build_deepseek2() {
             } else {
                 q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(q, "q", il);
+
+                kv_rope_compressed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_rope_compressed, "kv_rope_compressed", il);
+
+                ggml_build_forward_expand(gf, q);
+                ggml_build_forward_expand(gf, kv_rope_compressed);
             }

             // split into {n_head * n_embd_head_qk_nope, n_tokens}
-            struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+            q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                     ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                    0);
-            cb(q_nope, "q_nope", il);
+                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head), 0);

             // and {n_head * n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+            q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                     ggml_row_size(q->type, hparams.n_embd_head_k),
                     ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                     ggml_row_size(q->type, n_embd_head_qk_nope));
-            cb(q_rope, "q_rope", il);
-
-            q_rope = ggml_rope_ext(
-                    ctx0, q_rope, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
-            cb(q_rope, "q_rope", il);
-
-            // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-            cb(kv_rope_compresseed, "kv_rope_compresseed", il);

             // and {n_embd_head_qk_rope, n_tokens}
-            struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    kv_rope_compresseed->nb[1],
-                    ggml_row_size(kv_rope_compresseed->type, kv_lora_rank));
-            cb(k_rope, "k_rope", il);
+            k_rope = ggml_view_3d(ctx0, kv_rope_compressed, n_embd_head_qk_rope, 1, n_tokens,
+                    kv_rope_compressed->nb[1],
+                    kv_rope_compressed->nb[1],
+                    ggml_row_size(kv_rope_compressed->type, kv_lora_rank));
-
-            // shared RoPE key
-            k_rope = ggml_rope_ext(
-                    ctx0, k_rope, inp_pos, nullptr,
-                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                    ext_factor, attn_factor_scaled, beta_fast, beta_slow
-            );
+            kv_compressed = ggml_view_2d(ctx0, kv_rope_compressed, kv_lora_rank, n_tokens,
+                    kv_rope_compressed->nb[1], 0);
+            }
+            cb(q_nope, "q_nope", il);
+            cb(q_rope, "q_rope", il);
+            cb(k_rope, "k_rope", il);

-            // split into {kv_lora_rank, n_tokens}
-            struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens,
-                    kv_rope_compresseed->nb[1],
-                    0);
-            cb(kv_compressed, "kv_compressed", il);
+            q_rope = ggml_rope_ext(ctx0, q_rope, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(q_rope, "q_rope", il);

+            k_rope = ggml_rope_ext(ctx0, k_rope, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor_scaled, beta_fast, beta_slow);
+            cb(k_rope, "k_rope", il);

             kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il);
             cb(kv_compressed, "kv_compressed", il);

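In the merged path, each row of the fused `mqa` output is laid out as [Q | compressed KV | shared RoPE K]: the Q block first (`q_lora_rank` entries for the full model, `n_embd_k_gqa` for Lite), then `kv_lora_rank` entries, then `n_embd_head_qk_rope` entries. That is why the KV view starts at byte offset `q_lora_rank*ggml_element_size(mqa)`. For scale, taking DeepSeek-V2's published attention dimensions (q_lora_rank 1536, kv_lora_rank 512, RoPE head dimension 64), the fused matmul writes one 2112-row output per token instead of separate 1536- and 576-row outputs. The 3-D views that then split Q per head follow the usual ggml stride pattern; a standalone sketch with toy sizes (not the committed code):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*mem_size =*/ 8*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        // toy sizes: each of n_head blocks in a row of q is [nope | rope]
        const int nope = 128, rope = 64, head_dim = nope + rope, n_head = 4, n_tokens = 2;
        struct ggml_tensor * q = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, head_dim*n_head, n_tokens);

        // {nope, n_head, n_tokens}: nb1 steps one head, nb2 steps one token
        struct ggml_tensor * q_nope = ggml_view_3d(ctx, q, nope, n_head, n_tokens,
                ggml_row_size(q->type, head_dim), q->nb[1], 0);

        // same strides, but starting `nope` elements into every head block
        struct ggml_tensor * q_rope = ggml_view_3d(ctx, q, rope, n_head, n_tokens,
                ggml_row_size(q->type, head_dim), q->nb[1], ggml_row_size(q->type, nope));

        (void)q_nope; (void)q_rope;
        ggml_free(ctx);
        return 0;
    }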
@@ -1645,14 +1645,43 @@ bool create_tensors_helper::create_deepseek2_tensors(const LLM_TN & tn) {

         layer.attn_kv_a_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank});

+        bool merged = false;
+        if (ml.merge_qkv) {
+            auto q_name = is_lite ? tn(LLM_TENSOR_ATTN_Q, "weight", i) : tn(LLM_TENSOR_ATTN_Q_A, "weight", i);
+            auto k_name = tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i);
+            auto wq = ml.require_tensor_meta(q_name.c_str());
+            auto wk = ml.require_tensor_meta(k_name.c_str());
+            GGML_ASSERT(wq && wk);
+            if (wq->type == wk->type) {
+                GGML_ASSERT(wq->ne[0] == wk->ne[0]);
+                layer.wkq_a_mqa = ggml_new_tensor_2d(ctx_split, wq->type, wq->ne[0], wq->ne[1] + wk->ne[1]);
+                snprintf(layer.wkq_a_mqa->name, GGML_MAX_NAME, "blk.%d.attn_qk_a_mqa.weight", i);
+                if (is_lite) {
+                    layer.wq = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
+                } else {
+                    layer.wq_a = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, q_name.c_str(), { wq->ne[0], wq->ne[1] }, 0);
+                }
+                layer.wkv_a_mqa = ml.create_tensor_as_view(ctx_split, layer.wkq_a_mqa, k_name.c_str(), { wk->ne[0], wk->ne[1] }, wq->ne[1]*wq->nb[1]);
+                merged = true;
+                use_mmap_buffer = false;
+                printf("============== Merged %s (%ld x %ld) and %s (%ld x %ld)\n", q_name.c_str(),
+                        wq->ne[0], wq->ne[1], k_name.c_str(), wk->ne[0], wk->ne[1]);
+            }
+        }
+
         if (!is_lite) {
-            layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+            if (!merged) {
+                layer.wq_a = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank});
+            }
             layer.wq_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k});
-        } else {
+        } else if (!merged) {
             layer.wq = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa});
         }

-        layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+        if (!merged) {
+            layer.wkv_a_mqa = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i),{n_embd, kv_lora_rank + (n_embd_head_qk_rope)});
+        }

         layer.wkv_b = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
                 {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, llama_model_loader::TENSOR_NOT_REQUIRED);
         if (!layer.wkv_b) {
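On the loader side, the merge allocates one `ggml_new_tensor_2d` spanning both weight matrices and registers two named views into it via `create_tensor_as_view`, so each source tensor from the GGUF file is loaded directly into its slice; the K block lives `wq->ne[1]` rows in, i.e. at byte offset `wq->ne[1]*wq->nb[1]`. Because the two tensors are generally not adjacent in the file, the merged buffer cannot simply reuse the file's mapped bytes, which is presumably why the commit sets `use_mmap_buffer = false` once any layer is merged. A standalone sketch of the same view arithmetic (toy sizes and hypothetical names, not the loader code):

    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { /*mem_size =*/ 8*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        const int n_embd = 32, q_rows = 24, kv_rows = 8;

        // one allocation spanning both weights ...
        struct ggml_tensor * merged = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, q_rows + kv_rows);

        // ... and two views the loader can fill independently; the K block
        // starts q_rows rows in, i.e. at byte offset q_rows * merged->nb[1]
        struct ggml_tensor * wq = ggml_view_2d(ctx, merged, n_embd, q_rows,  merged->nb[1], 0);
        struct ggml_tensor * wk = ggml_view_2d(ctx, merged, n_embd, kv_rows, merged->nb[1],
                                               (size_t)q_rows*merged->nb[1]);

        // loading then means copying each file tensor's bytes into its view:
        //   memcpy(wq->data, q_bytes, ggml_nbytes(wq));
        //   memcpy(wk->data, k_bytes, ggml_nbytes(wk));
        // a view of an mmap'd file would only work if the two tensors already
        // sat side by side on disk, so merged layers fall back to copying.
        (void)wq; (void)wk;
        ggml_free(ctx);
        return 0;
    }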
@@ -805,6 +805,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_Q_A,        "blk.%d.attn_q_a" },
     { LLM_TENSOR_ATTN_Q_B,        "blk.%d.attn_q_b" },
     { LLM_TENSOR_ATTN_KV_A_MQA,   "blk.%d.attn_kv_a_mqa" },
+    { LLM_TENSOR_ATTN_KQ_A_MQA,   "blk.%d.attn_kq_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,       "blk.%d.attn_kv_b" },
     { LLM_TENSOR_ATTN_K_B,        "blk.%d.attn_k_b" },
     { LLM_TENSOR_ATTN_V_B,        "blk.%d.attn_v_b" },
@@ -160,6 +160,7 @@ struct llama_layer {
     struct ggml_tensor * wq_a = nullptr;
     struct ggml_tensor * wq_b = nullptr;
     struct ggml_tensor * wkv_a_mqa = nullptr;
+    struct ggml_tensor * wkq_a_mqa = nullptr;
     struct ggml_tensor * wkv_b = nullptr;
     struct ggml_tensor * wk_b = nullptr;
     struct ggml_tensor * wv_b = nullptr;