From 985f80180ee63059eaaadf8eed101c38196ac3fb Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Sat, 28 Feb 2026 10:10:41 +0000 Subject: [PATCH] Minor delta-net tweak --- src/llama-build-context.cpp | 7 +++++-- src/llama-delta-net.cpp | 20 +++++++++++++------- src/llama-delta-net.h | 2 +- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 4df4e0bd..54f6d00a 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -1552,6 +1552,7 @@ static ggml_tensor * llm_build_kqv( cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias, hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); + cb(cur, "fa", il); ggml_flash_attn_ext_add_sinks(cur, sinks); if (n_swa > 0) { ((int32_t *)cur->op_params)[4] = n_swa; @@ -1815,7 +1816,9 @@ std::tuple llm_build_con auto row_size = ggml_row_size(Qaux->type, n_embd_head_k); // TODO: check why CUDA performance suffers so much if we don't make these two tensors contiguous auto Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head_k, Qaux->ne[0]/(2*n_embd_head_k), n_tokens, 2*row_size, Qaux->nb[1], 0)); + cb(Qcur, "Qcur_cont", il); auto gate = ggml_cont_2d(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head_k, Qaux->ne[0]/(2*n_embd_head_k), n_tokens, 2*row_size, Qaux->nb[1], row_size), Qaux->ne[0]/2, n_tokens); + cb(gate, "gate_cont", il); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, Kcur->ne[0]/n_embd_head_k, n_tokens); if (q_norm) { Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il); @@ -10384,8 +10387,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens ext_factor, attn_factor, beta_fast, beta_slow); } } - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); + cb(Qcur, "Qcur_roped", il); + cb(Kcur, "Kcur_roped", il); if (inp_attn_scale) { Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale); diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index b6566d05..4060f9b6 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -151,18 +151,18 @@ std::pair delta_net::build_fused_delta_net(ggml_co return {output_tokens, new_state}; } -std::pair delta_net::build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb) const { +std::pair delta_net::build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const { auto & model = lctx.model; const int64_t n_tok = input->ne[1]; if (model.layers[il].wqkv) { ggml_tensor * qkv_mixed = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, input); cb(qkv_mixed, "qkv_mixed", il); - qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1); - cb(qkv_mixed, "linear_attn_qkv_mixed", il); - ggml_tensor * z = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, input); cb(z, "z", il); - + ggml_build_forward_expand(gf, qkv_mixed); + ggml_build_forward_expand(gf, z); + qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1); + cb(qkv_mixed, "linear_attn_qkv_mixed", il); return { qkv_mixed, z }; } @@ -246,7 +246,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ const int64_t n_seqs = 1; const int64_t n_seq_tokens = n_tok; - auto [qkv_mixed, z] = build_qkvz(ctx0, cur, il, cb); + auto [qkv_mixed, z] = build_qkvz(ctx0, cur, il, cb, gf); ggml_tensor *alpha, *beta; if (model.layers[il].ssm_beta_alpha) { @@ -290,6 +290,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ ggml_build_forward_expand(gf, alpha); ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); + cb(alpha_biased, "alpha_biased", il); ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); cb(alpha_softplus, "a_softplus", il); ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); @@ -362,6 +363,8 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ ggml_tensor * q_repeated = ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1); ggml_tensor * k_repeated = ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1); + cb(q_repeated, "q_repeated", il); + cb(k_repeated, "k_repeated", il); q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1); k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1); @@ -392,8 +395,11 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ ggml_tensor * new_conv_flat = ggml_reshape_2d(ctx0, new_conv_states_cont, conv_state_dim, 1); ggml_tensor * new_ssm_flat = ggml_reshape_2d(ctx0, new_state, ssm_state_dim, 1); ggml_tensor * new_state_flat = ggml_concat(ctx0, new_conv_flat, new_ssm_flat, 0); + cb(new_state_flat, "new_state_flat", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_state_flat, state_dst)); + auto state_cpy = ggml_cpy(ctx0, new_state_flat, state_dst); + cb(state_cpy, "state_cpy", il); + ggml_build_forward_expand(gf, state_cpy); ggml_tensor * attn_out_2d = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_tok); ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_tok); diff --git a/src/llama-delta-net.h b/src/llama-delta-net.h index 1bd72e2c..c9acac1d 100644 --- a/src/llama-delta-net.h +++ b/src/llama-delta-net.h @@ -13,7 +13,7 @@ struct delta_net { ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state, int il, const llm_build_cb & cb); - std::pair build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb) const; + std::pair build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const; ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,