Mirror of https://github.com/ikawrakow/ik_llama.cpp.git (synced 2026-05-11 00:20:19 +00:00)

Commit: qwen3next: add decode-only fused delta mode
This commit is contained in:
@@ -113,3 +113,32 @@ Outcome:

- No stable speed win in our setup after repeated runs.
- Autoregressive rewrite specifically hurt TG throughput in non-fused mode and was reverted.
- Final code keeps only the fused-default safety fix (non-fused by default).

## Decode-Only Fused Mode Trial (`LLAMA_QWEN3NEXT_FUSED_DELTA=2`)

Date: 2026-02-08

Code change:

- Added mode `2` for `LLAMA_QWEN3NEXT_FUSED_DELTA`:
  - prompt / multi-token path: non-fused
  - single-token decode path: fused

Perplexity validation (`-c 2048`, GPU config as above):

| Model | `=0` non-fused | `=2` decode-only fused |
|---|---:|---:|
| `/models/qwen3-next-coder.gguf` | `3.9378` | `3.9378` |
| `/models/qwen-3-coder-next-mxfp4.gguf` | `3.9860` | `3.9860` |

`llama-bench` at `-p 8192 -n 128 -b 2048 -ub 512 -r 3 -rtr 1`:

| Mode | PP 8192 (tok/s) | TG 128 (tok/s) |
|---|---:|---:|
| `LLAMA_QWEN3NEXT_FUSED_DELTA=0` | `170.090` | `25.465` |
| `LLAMA_QWEN3NEXT_FUSED_DELTA=2` | `166.212` | `29.599` |

Notes:

- Decode-only fused mode preserves prompt-quality metrics in this test.
- TG improved significantly in this run; PP variance was higher, so PP delta should be treated as noisy.
||||
@@ -4178,12 +4178,21 @@ ggml_cgraph * llm_build_context::build_qwen3next() {

     const bool reset_state = batch.pos != nullptr && batch.pos[0] == 0;

-    // Keep legacy DeltaNet path as the default for correctness; enable fused path explicitly
-    // with LLAMA_QWEN3NEXT_FUSED_DELTA=1 for controlled testing.
-    const bool use_fused_delta_net = []() {
+    enum class qwen3next_fused_delta_mode {
+        off,
+        on,
+        tok1_only,
+    };
+
+    // Keep legacy DeltaNet path as default for correctness.
+    // LLAMA_QWEN3NEXT_FUSED_DELTA values:
+    //   unset / 0 : off
+    //   1         : fused for all token counts
+    //   2         : fused only for single-token decode steps
+    const qwen3next_fused_delta_mode fused_delta_mode = []() {
         const char * env = std::getenv("LLAMA_QWEN3NEXT_FUSED_DELTA");
         if (env == nullptr || env[0] == '\0') {
-            return false;
+            return qwen3next_fused_delta_mode::off;
         }

         switch (env[0]) {
@@ -4192,11 +4201,14 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
         case 'Y':
         case 't':
         case 'T':
-            return true;
+            return qwen3next_fused_delta_mode::on;
+        case '2':
+            return qwen3next_fused_delta_mode::tok1_only;
         default:
-            return false;
+            return qwen3next_fused_delta_mode::off;
         }
     }();
+    const bool use_fused_delta_net_full = fused_delta_mode == qwen3next_fused_delta_mode::on;

     auto get_slice_2d = [&](ggml_tensor * t, int64_t c) -> ggml_tensor * {
         return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
@@ -4840,6 +4852,10 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
         cb(v_conv, "v_conv_predelta", il);

         std::pair<ggml_tensor *, ggml_tensor *> attn_out;
+        const bool use_fused_delta_net =
+            use_fused_delta_net_full ||
+            (fused_delta_mode == qwen3next_fused_delta_mode::tok1_only && n_tok == 1);
+
         if (use_fused_delta_net) {
             attn_out = build_delta_net_fused(q_conv, k_conv, v_conv, gate, beta, state, il);
         } else {
@@ -4922,7 +4938,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
     ggml_tensor * causal_mask = nullptr;
     ggml_tensor * identity = nullptr;
     ggml_tensor * diag_mask = nullptr;
-    if (!use_fused_delta_net) {
+    if (!use_fused_delta_net_full) {
         causal_mask = ggml_tri(ctx0,
             ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, QWEN3NEXT_CHUNK_SIZE, QWEN3NEXT_CHUNK_SIZE), 1.0f),
             GGML_TRI_TYPE_LOWER);
Reference in New Issue
Block a user