Commit eb9882407f
Author: Kawrakow
Date: 2025-11-30 16:12:20 +00:00
Parent: 4fe175b555
3 changed files with 15 additions and 3 deletions

View File

@@ -9220,11 +9220,15 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
 else if (cur->type != GGML_TYPE_F32) {
     cur = ggml_cast(ctx0, cur, GGML_TYPE_F32);
 }
+auto the_q_norm = model.layers[il].attn_q_norm ? model.layers[il].attn_q_norm->extra ?
+    ((ggml_split_tensor_t *)model.layers[il].attn_q_norm->extra)->splits[id] : model.layers[il].attn_q_norm : nullptr;
+auto the_k_norm = model.layers[il].attn_k_norm ? model.layers[il].attn_k_norm->extra ?
+    ((ggml_split_tensor_t *)model.layers[il].attn_k_norm->extra)->splits[id] : model.layers[il].attn_k_norm : nullptr;
 auto [Qcur, Kcur, Vcur] = llm_build_mul_mat_qkv(gf, cur, nullptr, nullptr, nullptr, nullptr,
         split_wq, bq ? bq->splits[id] : nullptr,
         split_wk, bk ? bk->splits[id] : nullptr,
         split_wv, bv ? bv->splits[id] : nullptr,
-        model.layers[il].attn_q_norm, model.layers[il].attn_k_norm, f_attn_scale, il_cb);
+        the_q_norm, the_k_norm, f_attn_scale, il_cb);
 auto rope_factors = rope_factors_in;
 if (!rope_factors && model.layers[il].rope_freqs && model.layers[il].rope_freqs->extra) {
     auto extra = (ggml_split_tensor_t *)model.layers[il].rope_freqs->extra;

View File

@@ -1858,9 +1858,9 @@ bool create_tensors_helper::create_glm4_moe_tensors(const LLM_TN & tn) {
 layer.wo = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
 // K/Q norm tensors (optional for GLM-4.5 355B variant)
-layer.attn_q_norm = create_tensor(ctx_layer,
+layer.attn_q_norm = create_tensor(ctx_split,
         tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
-layer.attn_k_norm = create_tensor(ctx_layer,
+layer.attn_k_norm = create_tensor(ctx_split,
         tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, llama_model_loader::TENSOR_NOT_REQUIRED | flags);
// Why are we adding an additional tensor type?
@@ -2945,6 +2945,9 @@ bool create_tensors_helper::create_tensors() {
 if (layer.bq) {
     prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used);
 }
+if (layer.attn_q_norm) {
+    prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, split, mem_used);
+}
 for (auto & s : split) s /= gqa_ratio;
 prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split, mem_used);
 prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used);
@@ -2954,6 +2957,9 @@ bool create_tensors_helper::create_tensors() {
 if (layer.bv) {
     prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used);
 }
+if (layer.attn_k_norm) {
+    prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split, mem_used);
+}
 }
 if (layer.ffn_norm) {

View File

@@ -198,6 +198,8 @@ struct llama_layer {
 llama_split_tensor split_bqkv;
 llama_split_tensor split_bqk;
 llama_split_tensor split_bkv;
+llama_split_tensor split_q_norm;
+llama_split_tensor split_k_norm;
 // relative position bias
 struct ggml_tensor * attn_rel_b = nullptr;