From 1e6d36b1b4a4c99fac8e00b3a91deb3bd5ab601f Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 27 Feb 2026 07:03:25 +0100 Subject: [PATCH] Graph parallel for dense Qwen-3.5 models (#1331) * Graph parallel for dense Qwen-3.5 models * Cleanup --- src/llama-build-context.cpp | 14 +++++++++++++- src/llama-load-tensors.cpp | 23 ++++++++++++----------- src/llama.cpp | 5 +++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index cae676ea..4df4e0bd 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4649,7 +4649,19 @@ ggml_cgraph * llm_build_context::build_qwen35() { if (hparams.is_recurrent(il)) { ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il); + int idx = model.default_layer_device[il]; + if (inpL->op == GGML_OP_REDUCE) { + if (kv_self.s_l[il]) { + // This shouldn't be necessary, but just in case. + int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); + if (idx_s_l >= 0) idx = idx_s_l; + } + if (inpL->src[idx]) { + inpL->view_src = inpL->src[idx]; + } + } + auto norm = model.layers[il].attn_norm->extra ? 
((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; + cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb); if (il == n_layer - 1 && inp_out_ids) { diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 5c8b820f..d6168bdb 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1490,6 +1490,7 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_split = ctx_for_layer_split(i); + ggml_context * ctx_layer = ctx_for_layer(i); auto & layer = model.layers[i]; @@ -1510,15 +1511,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, 
"weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); } layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); @@ -3610,7 +3611,7 @@ bool create_tensors_helper::create_tensors() { if (layer.wo && layer.wq && layer.wk && layer.wv) { auto granularity_kq = hparams.n_embd_head_k * gqa_ratio; int wq_ne1 = layer.wq->ne[1]; - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) { granularity_kq *= 2; wq_ne1 /= 2; } auto granularity_vo = hparams.n_embd_head_v * gqa_ratio; @@ -3666,7 +3667,7 @@ bool create_tensors_helper::create_tensors() { LLAMA_LOG_DEBUG("\n"); prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used); } - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) { + if (model.arch == 
LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) { for (auto & s : split_kq) s /= 2*gqa_ratio; } else { for (auto & s : split_kq) s /= gqa_ratio; diff --git a/src/llama.cpp b/src/llama.cpp index 4d9c08ee..facceb0d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -661,7 +661,7 @@ llama_context::~llama_context() { // kv cache helpers // -static inline bool llama_qwen3next_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) { +static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) { return hparams.is_recurrent(il); } @@ -836,7 +836,7 @@ static bool llama_kv_cache_init( int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { - const bool qnext_recurrent = llama_qwen3next_is_recurrent_layer(hparams, i); + const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i); const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i); const uint32_t n_head_kv = hparams.n_head_kv(i); const uint32_t n_embd_head_k= hparams.n_embd_head_k; @@ -1937,6 +1937,7 @@ static bool is_model_split_supported(const llama_model & model) { LLM_ARCH_SEED_OSS, LLM_ARCH_STEP35, LLM_ARCH_QWEN3NEXT, + LLM_ARCH_QWEN35, }; auto it = k_supported.find(model.arch); return it != k_supported.end();