From d2d65c0d64f4e0c2e18f632e1d1b6760ca052b11 Mon Sep 17 00:00:00 2001
From: Kawrakow <iwankawrakow@gmail.com>
Date: Wed, 18 Feb 2026 15:55:11 +0100
Subject: [PATCH] Better CPU performance for Qwen3-Next (#1283)

* Better CPU silu: process contiguous tensors in 1024-element blocks distributed across threads (+4% PP)

* Improve ggml_compute_forward_dup_bytes: add a fast path for F32 transposed sources, parallelized over the outer two dimensions
---
 ggml/src/ggml.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7d028123..f68e8d03 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -11961,6 +11961,29 @@ static void ggml_compute_forward_dup_bytes(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
+    if (src0->op == GGML_OP_TRANSPOSE && src0->ne[2]*src0->ne[3] >= nth/2 && src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        int elem_size = ggml_element_size(src0);
+        if ((size_t)src0->ne[0]*src0->ne[1]*elem_size == src0->nb[2] &&
+            (size_t)src0->ne[0]*src0->ne[1]*src0->ne[2]*elem_size == src0->nb[3]) {
+            int counter = 0;
+            for (int i3 = 0; i3 < src0->ne[3]; ++i3) {
+                for (int i2 = 0; i2 < src0->ne[2]; ++i2) {
+                    if (counter++ % nth == ith) {
+                        const char * x = (const char *)src0->data + i2*src0->nb[2] + i3*src0->nb[3];
+                        float * y = (float *)((char *)dst->data + i2*dst->nb[2] + i3*dst->nb[3]);
+                        for (int i1 = 0; i1 < dst->ne[1]; ++i1) {
+                            for (int i0 = 0; i0 < dst->ne[0]; ++i0) {
+                                memcpy(y, x + i0*src0->nb[0] + i1*src0->nb[1], sizeof(float));
+                                ++y;
+                            }
+                        }
+                    }
+                }
+            }
+            return;
+        }
+    }
+
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -15435,6 +15458,20 @@ static void ggml_compute_forward_silu_f32(
     const int ith = params->ith;
     const int nth = params->nth;
 
+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+        const int k_block_size = 1024;
+        int nelem = ggml_nelements(src0);
+        int nblock = (nelem + k_block_size - 1)/k_block_size;
+        for (int ib = ith; ib < nblock; ib += nth) {
+            int first = ib*k_block_size;
+            const float * x = (const float *)src0->data + first;
+                  float * y = (      float *) dst->data + first;
+            int n = first + k_block_size <= nelem ? k_block_size : nelem - first;
+            ggml_vec_silu_f32(n, y, x);
+        }
+        return;
+    }
+
     const int nc = src0->ne[0];
     const int nr = ggml_nrows(src0);