Remove unused tensors from delta-net (#1350)

This commit is contained in:
Kawrakow
2026-03-02 16:02:40 +01:00
committed by GitHub
parent d4ac5f1566
commit 3735e88925
4 changed files with 9 additions and 58 deletions

View File

@@ -4411,7 +4411,6 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
}
ggml_cgraph * llm_build_context::build_qwen3next() {
-static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
@@ -4429,18 +4428,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
cb(lctx.inp_s_seq_qnext, "inp_s_seq_qnext", -1);
ggml_set_input(lctx.inp_s_seq_qnext);
-ggml_tensor * causal_mask = nullptr;
-ggml_tensor * identity = nullptr;
-ggml_tensor * diag_mask = nullptr;
-causal_mask = ggml_tri(ctx0,
-ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-GGML_TRI_TYPE_LOWER);
-identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-diag_mask = ggml_add(ctx0, causal_mask, identity);
-ggml_build_forward_expand(gf, causal_mask);
-ggml_build_forward_expand(gf, identity);
-ggml_build_forward_expand(gf, diag_mask);
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
ggml_tensor * cur = nullptr;
@@ -4478,7 +4465,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
-cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -4530,7 +4517,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
}
ggml_cgraph * llm_build_context::build_qwen35moe() {
-static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
@@ -4550,18 +4536,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-ggml_tensor * causal_mask = nullptr;
-ggml_tensor * identity = nullptr;
-ggml_tensor * diag_mask = nullptr;
-causal_mask = ggml_tri(ctx0,
-ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-GGML_TRI_TYPE_LOWER);
-identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-diag_mask = ggml_add(ctx0, causal_mask, identity);
-ggml_build_forward_expand(gf, causal_mask);
-ggml_build_forward_expand(gf, identity);
-ggml_build_forward_expand(gf, diag_mask);
ggml_tensor * cur = nullptr;
for (int il = 0; il < n_layer; ++il) {
@@ -4582,7 +4556,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
-cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
@@ -4623,7 +4597,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
}
ggml_cgraph * llm_build_context::build_qwen35() {
-static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
@@ -4643,18 +4616,6 @@ ggml_cgraph * llm_build_context::build_qwen35() {
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
-ggml_tensor * causal_mask = nullptr;
-ggml_tensor * identity = nullptr;
-ggml_tensor * diag_mask = nullptr;
-causal_mask = ggml_tri(ctx0,
-ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
-GGML_TRI_TYPE_LOWER);
-identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
-diag_mask = ggml_add(ctx0, causal_mask, identity);
-ggml_build_forward_expand(gf, causal_mask);
-ggml_build_forward_expand(gf, identity);
-ggml_build_forward_expand(gf, diag_mask);
ggml_tensor * cur = nullptr;
for (int il = 0; il < n_layer; ++il) {
@@ -4675,7 +4636,7 @@ ggml_cgraph * llm_build_context::build_qwen35() {
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
-cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
+cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);

View File

@@ -222,8 +222,7 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_qkvz(ggml_context * ctx
}
ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
-ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
+ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const {
auto & model = lctx.model;
@@ -387,10 +386,6 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
cb(k_conv, "k_conv_predelta", il);
cb(v_conv, "v_conv_predelta", il);
-GGML_ASSERT(causal_mask != nullptr);
-GGML_ASSERT(identity != nullptr);
-GGML_ASSERT(diag_mask != nullptr);
auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb);
cb(output, "attn_output", il);
@@ -429,8 +424,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
}
ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
-ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const {
+ggml_tensor * cur, int il, const llm_build_cb & cb) const {
GGML_ASSERT(lctx.inp_s_seq_qnext != nullptr);
auto & model = lctx.model;
@@ -448,7 +442,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap
if (all_same_seq) {
bool reset_state = batch.pos != nullptr && batch.pos[0] == 0;
-return build_layer_attn_linear_core(ctx0, gf, cur, causal_mask, identity, diag_mask, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
+return build_layer_attn_linear_core(ctx0, gf, cur, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
}
GGML_ASSERT(has_unique_seq_ids && "qwen3next mixed-sequence batches require unique sequence IDs per token");
@@ -460,7 +454,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap
const bool reset_state_i = batch.pos != nullptr && batch.pos[i] == 0;
const uint32_t state_seq_id_i = (uint32_t) token_seq_ids[i];
-ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, causal_mask, identity, diag_mask, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);
+ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);
out = out == nullptr ? out_i : ggml_concat(ctx0, out, out_i, 1);
}

View File

@@ -16,13 +16,11 @@ struct delta_net {
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const;
ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
-ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
+ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const;
ggml_tensor * build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
-ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
-ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const;
+ggml_tensor * cur, int il, const llm_build_cb & cb) const;
private:

View File

@@ -4695,8 +4695,6 @@ struct llama_context * llama_init_from_model(
struct llama_model * model,
struct llama_context_params params) {
-printf("===================================== %s: %s\n", __func__, ggml_type_name(params.type_reduce));
if (!model) {
LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
return nullptr;