Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (last synced 2026-03-06 12:00:29 +00:00)
Remove unused tensors from delta-net (#1350)
This commit is contained in:
@@ -4411,7 +4411,6 @@ ggml_cgraph * llm_build_context::build_qwen3moe() {
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_qwen3next() {
|
||||
static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
|
||||
|
||||
@@ -4429,18 +4428,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
|
||||
cb(lctx.inp_s_seq_qnext, "inp_s_seq_qnext", -1);
|
||||
ggml_set_input(lctx.inp_s_seq_qnext);
|
||||
|
||||
ggml_tensor * causal_mask = nullptr;
|
||||
ggml_tensor * identity = nullptr;
|
||||
ggml_tensor * diag_mask = nullptr;
|
||||
causal_mask = ggml_tri(ctx0,
|
||||
ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
|
||||
diag_mask = ggml_add(ctx0, causal_mask, identity);
|
||||
ggml_build_forward_expand(gf, causal_mask);
|
||||
ggml_build_forward_expand(gf, identity);
|
||||
ggml_build_forward_expand(gf, diag_mask);
|
||||
|
||||
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
ggml_tensor * cur = nullptr;
|
||||
@@ -4478,7 +4465,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
|
||||
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
@@ -4530,7 +4517,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_qwen35moe() {
|
||||
static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
|
||||
|
||||
@@ -4550,18 +4536,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
|
||||
|
||||
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
ggml_tensor * causal_mask = nullptr;
|
||||
ggml_tensor * identity = nullptr;
|
||||
ggml_tensor * diag_mask = nullptr;
|
||||
causal_mask = ggml_tri(ctx0,
|
||||
ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
|
||||
diag_mask = ggml_add(ctx0, causal_mask, identity);
|
||||
ggml_build_forward_expand(gf, causal_mask);
|
||||
ggml_build_forward_expand(gf, identity);
|
||||
ggml_build_forward_expand(gf, diag_mask);
|
||||
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4582,7 +4556,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
|
||||
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
@@ -4623,7 +4597,6 @@ ggml_cgraph * llm_build_context::build_qwen35moe() {
|
||||
}
|
||||
|
||||
ggml_cgraph * llm_build_context::build_qwen35() {
|
||||
static constexpr int QWEN3NEXT_CHUNK_SIZE = 64;
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model, n_tokens), false);
|
||||
|
||||
@@ -4643,18 +4616,6 @@ ggml_cgraph * llm_build_context::build_qwen35() {
|
||||
|
||||
float KQ_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
||||
|
||||
ggml_tensor * causal_mask = nullptr;
|
||||
ggml_tensor * identity = nullptr;
|
||||
ggml_tensor * diag_mask = nullptr;
|
||||
causal_mask = ggml_tri(ctx0,
|
||||
ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
|
||||
GGML_TRI_TYPE_LOWER);
|
||||
identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE), 1.0f));
|
||||
diag_mask = ggml_add(ctx0, causal_mask, identity);
|
||||
ggml_build_forward_expand(gf, causal_mask);
|
||||
ggml_build_forward_expand(gf, identity);
|
||||
ggml_build_forward_expand(gf, diag_mask);
|
||||
|
||||
ggml_tensor * cur = nullptr;
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
@@ -4675,7 +4636,7 @@ ggml_cgraph * llm_build_context::build_qwen35() {
|
||||
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
|
||||
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
|
||||
cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb);
|
||||
if (il == n_layer - 1 && inp_out_ids) {
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
|
||||
@@ -222,8 +222,7 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_qkvz(ggml_context * ctx
|
||||
}
|
||||
|
||||
ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
|
||||
ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
|
||||
ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
|
||||
uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const {
|
||||
|
||||
auto & model = lctx.model;
|
||||
@@ -387,10 +386,6 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
|
||||
cb(k_conv, "k_conv_predelta", il);
|
||||
cb(v_conv, "v_conv_predelta", il);
|
||||
|
||||
GGML_ASSERT(causal_mask != nullptr);
|
||||
GGML_ASSERT(identity != nullptr);
|
||||
GGML_ASSERT(diag_mask != nullptr);
|
||||
|
||||
auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb);
|
||||
|
||||
cb(output, "attn_output", il);
|
||||
@@ -429,8 +424,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
|
||||
}
|
||||
|
||||
ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
|
||||
ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const {
|
||||
ggml_tensor * cur, int il, const llm_build_cb & cb) const {
|
||||
GGML_ASSERT(lctx.inp_s_seq_qnext != nullptr);
|
||||
|
||||
auto & model = lctx.model;
|
||||
@@ -448,7 +442,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap
|
||||
|
||||
if (all_same_seq) {
|
||||
bool reset_state = batch.pos != nullptr && batch.pos[0] == 0;
|
||||
return build_layer_attn_linear_core(ctx0, gf, cur, causal_mask, identity, diag_mask, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
|
||||
return build_layer_attn_linear_core(ctx0, gf, cur, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb);
|
||||
}
|
||||
|
||||
GGML_ASSERT(has_unique_seq_ids && "qwen3next mixed-sequence batches require unique sequence IDs per token");
|
||||
@@ -460,7 +454,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap
|
||||
|
||||
const bool reset_state_i = batch.pos != nullptr && batch.pos[i] == 0;
|
||||
const uint32_t state_seq_id_i = (uint32_t) token_seq_ids[i];
|
||||
ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, causal_mask, identity, diag_mask, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);
|
||||
ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb);
|
||||
|
||||
out = out == nullptr ? out_i : ggml_concat(ctx0, out, out_i, 1);
|
||||
}
|
||||
|
||||
@@ -16,13 +16,11 @@ struct delta_net {
|
||||
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const;
|
||||
|
||||
ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
|
||||
ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask, ggml_tensor * inp_s_seq_qnext,
|
||||
ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext,
|
||||
uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const;
|
||||
|
||||
ggml_tensor * build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf,
|
||||
ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
|
||||
ggml_tensor * diag_mask, int il, const llm_build_cb & cb) const;
|
||||
ggml_tensor * cur, int il, const llm_build_cb & cb) const;
|
||||
|
||||
private:
|
||||
|
||||
|
||||
@@ -4695,8 +4695,6 @@ struct llama_context * llama_init_from_model(
|
||||
struct llama_model * model,
|
||||
struct llama_context_params params) {
|
||||
|
||||
printf("===================================== %s: %s\n", __func__, ggml_type_name(params.type_reduce));
|
||||
|
||||
if (!model) {
|
||||
LLAMA_LOG_ERROR("%s: model cannot be NULL\n", __func__);
|
||||
return nullptr;
|
||||
|
||||
Reference in New Issue
Block a user