Remove unused tensors from delta-net (#1350)

2026-05-11 08:30:19 +00:00 · 2026-03-02 16:02:40 +01:00
parent d4ac5f1566
commit 3735e88925
4 changed files with 9 additions and 58 deletions
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -4411,7 +4411,6 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
 }

 ggml_cgraph * llm_build_context::build_qwen3next() {
-    static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);

@@ -4429,18 +4428,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
    cb(lctx.inp_s_seq_qnext, "inp_s_seq_qnext", -1);
    ggml_set_input(lctx.inp_s_seq_qnext);

-    ggml_tensor * causal_mask = nullptr;
-    ggml_tensor * identity    = nullptr;
-    ggml_tensor * diag_mask   = nullptr;
-    causal_mask = ggml_tri(ctx0,
-            ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-            GGML_TRI_TYPE_LOWER);
-    identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-    diag_mask = ggml_add(ctx0, causal_mask, identity);
-    ggml_build_forward_expand(gf, causal_mask);
-    ggml_build_forward_expand(gf, identity);
-    ggml_build_forward_expand(gf, diag_mask);
-
    float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

    ggml_tensor * cur = nullptr;
@@ -4478,7 +4465,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
            cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);
-            cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+            cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -4530,7 +4517,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
 }

 ggml_cgraph * llm_build_context::build_qwen35moe() {
-    static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);

@@ -4550,18 +4536,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {

    float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

-    ggml_tensor * causal_mask = nullptr;
-    ggml_tensor * identity    = nullptr;
-    ggml_tensor * diag_mask   = nullptr;
-    causal_mask = ggml_tri(ctx0,
-            ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-            GGML_TRI_TYPE_LOWER);
-    identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-    diag_mask = ggml_add(ctx0, causal_mask, identity);
-    ggml_build_forward_expand(gf, causal_mask);
-    ggml_build_forward_expand(gf, identity);
-    ggml_build_forward_expand(gf, diag_mask);
-
    ggml_tensor * cur = nullptr;

    for (int il = 0; il < n_layer; ++il) {
@@ -4582,7 +4556,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
            cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);
-            cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+            cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -4623,7 +4597,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
 }

 ggml_cgraph * llm_build_context::build_qwen35() {
-    static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;

    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);

@@ -4643,18 +4616,6 @@ ggml_cgraph * llm_build_context::build_qwen35() {

    float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;

-    ggml_tensor * causal_mask = nullptr;
-    ggml_tensor * identity    = nullptr;
-    ggml_tensor * diag_mask   = nullptr;
-    causal_mask = ggml_tri(ctx0,
-            ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-            GGML_TRI_TYPE_LOWER);
-    identity  = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-    diag_mask = ggml_add(ctx0, causal_mask, identity);
-    ggml_build_forward_expand(gf, causal_mask);
-    ggml_build_forward_expand(gf, identity);
-    ggml_build_forward_expand(gf, diag_mask);
-
    ggml_tensor * cur = nullptr;

    for (int il = 0; il < n_layer; ++il) {
@@ -4675,7 +4636,7 @@ ggml_cgraph * llm_build_context::build_qwen35() {
            auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
            cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
            cb(cur, "attn_norm", il);
-            cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+            cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
            if (il == n_layer - 1 && inp_out_ids) {
                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
--- a/src/llama-delta-net.cpp
+++ b/src/llama-delta-net.cpp
@@ -222,8 +222,7 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_qkvz(ggml_context * ctx
 }

 ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
-            ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-            ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
+            ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
            uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const {

    auto & model = lctx.model;
@@ -387,10 +386,6 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
    cb(k_conv, "k_conv_predelta", il);
    cb(v_conv, "v_conv_predelta", il);

-    GGML_ASSERT(causal_mask != nullptr);
-    GGML_ASSERT(identity    != nullptr);
-    GGML_ASSERT(diag_mask   != nullptr);
-
    auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb);

    cb(output, "attn_output", il);
@@ -429,8 +424,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
 }

 ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
-        ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-        ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const {
+        ggml_tensor * cur, int il, const llm_build_cb & cb) const {
    GGML_ASSERT(lctx.inp_s_seq_qnext != nullptr);

    auto & model = lctx.model;
@@ -448,7 +442,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap

    if (all_same_seq) {
        bool reset_state = batch.pos != nullptr && batch.pos[0] == 0;
-        return build_layer_attn_linear_core(ctx0, gf, cur, causal_mask, identity, diag_mask, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
+        return build_layer_attn_linear_core(ctx0, gf, cur, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
    }

    GGML_ASSERT(has_unique_seq_ids && "qwen3next mixed-sequence batches require unique sequence IDs per token");
@@ -460,7 +454,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap

        const bool reset_state_i = batch.pos != nullptr && batch.pos[i] == 0;
        const uint32_t state_seq_id_i = (uint32_t) token_seq_ids[i];
-        ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, causal_mask, identity, diag_mask, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);
+        ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);

        out = out == nullptr ? out_i : ggml_concat(ctx0, out, out_i, 1);
    }
--- a/src/llama-delta-net.h
+++ b/src/llama-delta-net.h
@@ -16,13 +16,11 @@ struct delta_net {
    std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const;

    ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
-            ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-            ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
+            ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
            uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const;

    ggml_tensor * build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
-            ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-            ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const;
+            ggml_tensor * cur, int il, const llm_build_cb & cb) const;

 private:

--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4695,8 +4695,6 @@ struct llama_context * llama_init_from_model(
                 struct llama_model * model,
        struct llama_context_params   params) {

-    printf("===================================== %s: %s\n", __func__, ggml_type_name(params.type_reduce));
-
    if (!model) {
        LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
        return nullptr;