From 1e6d36b1b4a4c99fac8e00b3a91deb3bd5ab601f Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Fri, 27 Feb 2026 07:03:25 +0100 Subject: [PATCH] Graph parallel for dense Qwen-3.5 models (#1331) * Graph parallel for dense Qwen-3.5 models * Cleanup --- src/llama-build-context.cpp | 14 +++++++++++++- src/llama-load-tensors.cpp | 23 ++++++++++++----------- src/llama.cpp | 5 +++-- 3 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index cae676ea..4df4e0bd 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4649,7 +4649,19 @@ ggml_cgraph * llm_build_context::build_qwen35() { if (hparams.is_recurrent(il)) { ggml_tensor * inpSA = inpL; - cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il); + int idx = model.default_layer_device[il]; + if (inpL->op == GGML_OP_REDUCE) { + if (kv_self.s_l[il]) { + // This shouldn't be necessary, but just in case. + int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); + if (idx_s_l >= 0) idx = idx_s_l; + } + if (inpL->src[idx]) { + inpL->view_src = inpL->src[idx]; + } + } + auto norm = model.layers[il].attn_norm->extra ? 
((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; + cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb); if (il == n_layer - 1 && inp_out_ids) { diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 5c8b820f..d6168bdb 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1490,6 +1490,7 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_split = ctx_for_layer_split(i); + ggml_context * ctx_layer = ctx_for_layer(i); auto & layer = model.layers[i]; @@ -1510,15 +1511,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, 
"weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); } layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); @@ -3610,7 +3611,7 @@ bool create_tensors_helper::create_tensors() { if (layer.wo && layer.wq && layer.wk && layer.wv) { auto granularity_kq = hparams.n_embd_head_k * gqa_ratio; int wq_ne1 = layer.wq->ne[1]; - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) { + if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) { granularity_kq *= 2; wq_ne1 /= 2; } auto granularity_vo = hparams.n_embd_head_v * gqa_ratio; @@ -3666,7 +3667,7 @@ bool create_tensors_helper::create_tensors() { LLAMA_LOG_DEBUG("\n"); prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used); } - if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) { + if (model.arch == 
LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) { for (auto & s : split_kq) s /= 2*gqa_ratio; } else { for (auto & s : split_kq) s /= gqa_ratio; diff --git a/src/llama.cpp b/src/llama.cpp index 4d9c08ee..facceb0d 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -661,7 +661,7 @@ llama_context::~llama_context() { // kv cache helpers // -static inline bool llama_qwen3next_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) { +static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) { return hparams.is_recurrent(il); } @@ -836,7 +836,7 @@ static bool llama_kv_cache_init( int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { - const bool qnext_recurrent = llama_qwen3next_is_recurrent_layer(hparams, i); + const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i); const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i); const uint32_t n_head_kv = hparams.n_head_kv(i); const uint32_t n_embd_head_k= hparams.n_embd_head_k; @@ -1937,6 +1937,7 @@ static bool is_model_split_supported(const llama_model & model) { LLM_ARCH_SEED_OSS, LLM_ARCH_STEP35, LLM_ARCH_QWEN3NEXT, + LLM_ARCH_QWEN35, }; auto it = k_supported.find(model.arch); return it != k_supported.end();