From 627d46912ce061e390b0d3e73aeff477e4f18cbf Mon Sep 17 00:00:00 2001
From: yurko
Date: Sun, 8 Feb 2026 01:04:38 -0800
Subject: [PATCH] qwen3next: disable flash-attn for cpu-only contexts

---
 src/llama.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama.cpp b/src/llama.cpp
index a9f6fed9..766dcd93 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -4508,6 +4508,13 @@ struct llama_context * llama_new_context_with_model(
         params.flash_attn = false;
     }
 
+    // Qwen3Next currently has a CPU-only flash-attn assertion path in the iqk FA kernels.
+    // Keep CPU runs safe by disabling FA when no layers are offloaded.
+    if (params.flash_attn && model->arch == LLM_ARCH_QWEN3NEXT && model->n_gpu_layers == 0) {
+        LLAMA_LOG_WARN("%s: flash_attn is unstable for CPU-only Qwen3Next - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     //if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) {
     //    LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__);
     //    params.flash_attn = false;
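
Caller-side note (illustrative, not part of the patch): a minimal sketch of how the
guard behaves, assuming the llama.h API of the era this tree targets
(llama_new_context_with_model, boolean flash_attn in llama_context_params) and a
placeholder GGUF path. Requesting flash-attn on a CPU-only Qwen3Next model should
now log the warning and continue with FA disabled rather than hitting the iqk
assertion:

    // Sketch only: "qwen3next.gguf" is a placeholder model path.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        mparams.n_gpu_layers = 0; // CPU-only: no layers offloaded

        llama_model * model = llama_load_model_from_file("qwen3next.gguf", mparams);
        if (model == nullptr) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true; // requested, but the new guard forces it off

        // Expected log with this patch applied:
        // llama_new_context_with_model: flash_attn is unstable for CPU-only Qwen3Next - forcing off
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }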