Kawrakow
2026-01-05 08:42:02 +02:00
parent 9cf8c0cdde
commit 066bf766d2
2 changed files with 27 additions and 103 deletions


@@ -1394,13 +1394,6 @@ static ggml_tensor * llm_build_kqv(
auto kq_size = k->ne[1]*q->ne[1]*q->ne[2]*sizeof(float)/(1024*1024);
if (cparams.attn_max_batch == 0 || cparams.attn_max_batch >= kq_size || k->ne[2] != q->ne[2] || v->ne[2] != q->ne[2] || sinks) {
//if (n_swa > 0 && k->ne[1] > n_swa + q->ne[1]) {
// auto nton = n_swa + q->ne[1];
// auto first = k->ne[1] - nton;
// k = ggml_view_3d(ctx, k, k->ne[0], nton, k->ne[2], k->nb[1], k->nb[2], k->nb[1]*first);
// v = ggml_view_3d(ctx, v, v->ne[0], nton, v->ne[2], v->nb[1], v->nb[2], v->nb[1]*first);
// kq_mask = ggml_view_3d(ctx, kq_mask, nton, kq_mask->ne[1], kq_mask->ne[2], kq_mask->nb[1], kq_mask->nb[2], kq_mask->nb[0]*first);
//}
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
@@ -9430,10 +9423,9 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
float freq_base_l = n_swa > 0 ? hparams.rope_freq_base_train_swa : cparams.rope_freq_base;
float freq_scale_l = n_swa > 0 ? hparams.rope_freq_scale_train_swa : hparams.rope_freq_scale_train;
if (!model.layers[il].wqkv && !model.layers[il].wqk && //cparams.flash_attn &&
if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
//printf("%s: %s\n", __func__, ggml_op_name(input->op));
ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
@@ -9565,68 +9557,39 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
ggml_row_size(split_kl->type, n_embd_head_k), 0);
cb(k, "k", il_cb);
auto v = ggml_view_3d(ctx0, split_vl, n_embd_head_v, n_kv, n_head_kv,
ggml_row_size(split_vl->type, split_wv->ne[1]),
ggml_row_size(split_vl->type, n_embd_head_v), 0);
cb(v, "v", il_cb);
#ifdef GGML_USE_VULKAN
constexpr bool use_f32_precision = true;
#else
constexpr bool use_f32_precision = false;
#endif
if (cparams.flash_attn) {
auto v = ggml_view_3d(ctx0, split_vl, n_embd_head_v, n_kv, n_head_kv,
ggml_row_size(split_vl->type, split_wv->ne[1]),
ggml_row_size(split_vl->type, n_embd_head_v), 0);
cb(v, "v", il_cb);
cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask, KQ_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, "flash_attn", il_cb);
if (model.layers[il].attn_sinks && model.layers[il].attn_sinks->extra) {
auto split = (ggml_split_tensor_t *)model.layers[il].attn_sinks->extra;
GGML_ASSERT(split->n_device == wq->n_device);
GGML_ASSERT(split->splits[id]);
ggml_flash_attn_ext_add_sinks(cur, split->splits[id]);
//printf("%s(%d): added sink %d\n", __func__, il, id);
} else {
ggml_flash_attn_ext_add_sinks(cur, sinks);
}
if (n_swa > 0) {
((int32_t *)cur->op_params)[4] = n_swa;
}
// Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA
if (use_f32_precision || model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX ||
(model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8) || model.arch == LLM_ARCH_COHERE2 || model.arch == LLM_ARCH_GLM4 ||
model.arch == LLM_ARCH_GLM4_MOE) {
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
}
cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
cb(cur, "flash_attn_reshaped", il_cb);
cur = ggml_flash_attn_ext(ctx0, q, k, v, KQ_mask, KQ_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, "flash_attn", il_cb);
if (model.layers[il].attn_sinks && model.layers[il].attn_sinks->extra) {
auto split = (ggml_split_tensor_t *)model.layers[il].attn_sinks->extra;
GGML_ASSERT(split->n_device == wq->n_device);
GGML_ASSERT(split->splits[id]);
ggml_flash_attn_ext_add_sinks(cur, split->splits[id]);
} else {
int nhead_v = split_wv->ne[1]/n_embd_head_v;
auto v = ggml_view_3d(ctx0, split_vl,
n_kv, n_embd_head_v, nhead_v,
ggml_element_size(split_vl)*n_ctx,
ggml_element_size(split_vl)*n_ctx*nhead_v, 0);
cb(v, "v", il);
auto kq = ggml_mul_mat(ctx0, q, k);
cb(kq, "kq", il_cb);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, KQ_scale, hparams.f_max_alibi_bias);
if (model.layers[il].attn_sinks && model.layers[il].attn_sinks->extra) {
auto split = (ggml_split_tensor_t *)model.layers[il].attn_sinks->extra;
GGML_ASSERT(split->n_device == wq->n_device);
GGML_ASSERT(split->splits[id]);
ggml_soft_max_add_sinks(kq, split->splits[id]);
//printf("%s(%d): added sink %d\n", __func__, il, id);
} else {
ggml_soft_max_add_sinks(kq, sinks);
}
cb(kq, "kq_soft_max_ext", il_cb);
auto kqv = ggml_mul_mat(ctx0, v, kq);
cb(kqv, "kqv", il_cb);
auto kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
cb(kqv_merged, "kqv_merged", il_cb);
cur = ggml_cont_2d(ctx0, kqv_merged, split_wo->ne[0], n_tokens);
cb(cur, "kqv_merged_cont", il_cb);
ggml_flash_attn_ext_add_sinks(cur, sinks);
}
if (n_swa > 0) {
((int32_t *)cur->op_params)[4] = n_swa;
}
// Some models produced NaNs/gibberish when FA is computed with f16 precision on CUDA
if (use_f32_precision || model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX ||
(model.arch == LLM_ARCH_DEEPSEEK2 && q->ne[1] <= 8) || model.arch == LLM_ARCH_COHERE2 || model.arch == LLM_ARCH_GLM4 ||
model.arch == LLM_ARCH_GLM4_MOE) {
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
}
cur = ggml_reshape_2d(ctx0, cur, split_wo->ne[0], n_tokens);
cb(cur, "flash_attn_reshaped", il_cb);
cur = llm_build_lora_mm(lctx, ctx0, split_wo, cur);
if (lctx.model.arch == LLM_ARCH_GLM4 || lctx.model.arch == LLM_ARCH_GLM4_MOE) {
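
For reference (not part of the commit): both branches above select the per-device attention-sink tensor in the same way, so the selection can be read as the following small helper. Names mirror the diff; the helper itself and its signature are illustrative only.

// Hedged sketch of the sink selection done in both the flash-attention and the manual path.
static ggml_tensor * select_sinks(ggml_tensor * attn_sinks, ggml_tensor * sinks, int id) {
    if (attn_sinks && attn_sinks->extra) {
        // The sinks tensor was split across devices: attach this device's slice.
        auto split = (ggml_split_tensor_t *) attn_sinks->extra;
        GGML_ASSERT(split->splits[id]);
        return split->splits[id];
    }
    // Not split: every device attaches the full tensor.
    return sinks;
}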


@@ -3049,9 +3049,6 @@ bool create_tensors_helper::create_tensors() {
prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used);
}
if (layer.wo && layer.wq && layer.wk && layer.wv) {
// TODO: fix this logic. It only works when K and V head size is the same
//printf("Layer %d: q = %ld x %ld, k = %ld x %ld, v = %ld x %ld, qo = %ld x %ld\n", il, layer.wq->ne[0], layer.wq->ne[1],
// layer.wk->ne[0], layer.wk->ne[1], layer.wv->ne[0], layer.wv->ne[1], layer.wo->ne[0], layer.wo->ne[1]);
auto granularity_kq = hparams.n_embd_head_k * gqa_ratio;
auto granularity_vo = hparams.n_embd_head_v * gqa_ratio;
if (ggml_is_quantized(layer.wo->type)) {
@@ -3074,12 +3071,9 @@ bool create_tensors_helper::create_tensors() {
}
if (layer.attn_sinks) {
auto split_sinks = split_kq;
//printf("Attention sinks for layer %d:", il);
for (auto & s : split_sinks) {
s /= hparams.n_embd_head_k;
//printf(" %d", s);
}
//printf("\n");
prepare_split_tensors(0, ctx_split, layer.attn_sinks, layer.split_sinks, split_sinks, mem_used);
}
for (auto & s : split_kq) s /= gqa_ratio;
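
For context (not part of the commit): a numeric sketch of the split arithmetic above, with made-up model dimensions. Splits are first sized in Q/O output features at a granularity of one whole GQA group, then converted to per-head sink counts (divide by n_embd_head_k) and to K/V features (divide by gqa_ratio).

#include <cstdio>

int main() {
    const int n_embd_head_k = 128;                      // hypothetical head size
    const int n_head = 32, n_head_kv = 8;               // hypothetical GQA layout
    const int gqa_ratio = n_head/n_head_kv;             // 4 query heads per KV head

    const int granularity_kq = n_embd_head_k*gqa_ratio; // 512 features = one whole GQA group
    const int split_q = 2048;                           // wq/wo features assigned to one device (16 Q heads)

    const int split_sinks = split_q/n_embd_head_k;      // 16: one sink logit per query head
    const int split_kv    = split_q/gqa_ratio;          // 512: matching wk/wv features (4 KV heads)

    std::printf("granularity=%d q=%d sinks=%d kv=%d\n", granularity_kq, split_q, split_sinks, split_kv);
    return 0;
}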
@@ -3095,39 +3089,6 @@ bool create_tensors_helper::create_tensors() {
if (layer.attn_k_norm) {
prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split_kq, mem_used);
}
/*
int attn_granularity = hparams.n_embd_head_v * gqa_ratio;
if (ggml_is_quantized(layer.wo->type)) {
auto tt = ggml_internal_get_type_traits(layer.wo->type);
if (tt.blck_size > attn_granularity) attn_granularity = tt.blck_size;
}
GGML_ASSERT(attn_granularity % hparams.n_embd_head_v == 0);
auto split = create_split(layer.wo->ne[0], attn_granularity, cur_splits, mem_used);
//printf("Split:"); for (auto s : split) printf(" %d", s); printf("\n");
prepare_split_tensors(0, ctx_split, layer.wo, layer.split_wo, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.wq, layer.split_wq, split, mem_used);
if (layer.bo) {
prepare_split_tensors(-1, ctx_split, layer.bo, layer.split_bo, split, mem_used);
}
if (layer.bq) {
prepare_split_tensors(0, ctx_split, layer.bq, layer.split_bq, split, mem_used);
}
if (layer.attn_q_norm) {
prepare_split_tensors(-1, ctx_split, layer.attn_q_norm, layer.split_q_norm, split, mem_used);
}
for (auto & s : split) s /= gqa_ratio;
prepare_split_tensors(1, ctx_split, layer.wk, layer.split_wk, split, mem_used);
prepare_split_tensors(1, ctx_split, layer.wv, layer.split_wv, split, mem_used);
if (layer.bk) {
prepare_split_tensors(0, ctx_split, layer.bk, layer.split_bk, split, mem_used);
}
if (layer.bv) {
prepare_split_tensors(0, ctx_split, layer.bv, layer.split_bv, split, mem_used);
}
if (layer.attn_k_norm) {
prepare_split_tensors(-1, ctx_split, layer.attn_k_norm, layer.split_k_norm, split, mem_used);
}
*/
}
if (layer.ffn_norm) {