Better CPU performance for Qwen3-Next (#1283)

* Better CPU silu - +4% PP
* Improve ggml_compute_forward_dup_bytes
2026-05-11 08:30:19 +00:00 · 2026-02-18 15:55:11 +01:00
parent 84831fc3ee
commit d2d65c0d64
1 changed file with 37 additions and 0 deletions
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -11961,6 +11961,29 @@ static void ggml_compute_forward_dup_bytes(
    const int ith = params->ith; // thread index
    const int nth = params->nth; // number of threads

+    if (src0->op == GGML_OP_TRANSPOSE && src0->ne[2]*src0->ne[3] >= nth/2 && src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        int elem_size = ggml_element_size(src0);
+        if ((size_t)src0->ne[0]*src0->ne[1]*elem_size == src0->nb[2] &&
+            (size_t)src0->ne[0]*src0->ne[1]*src0->ne[2]*elem_size == src0->nb[3]) {
+            int counter = 0;
+            for (int i3 = 0; i3 < src0->ne[3]; ++i3) {
+                for (int i2 = 0; i2 < src0->ne[2]; ++i2) {
+                    if (counter++ % nth == ith) {
+                        const char * x = (const char *)src0->data + i2*src0->nb[2] + i3*src0->nb[3];
+                        float * y = (float *)((char *)dst->data + i2*dst->nb[2] + i3*dst->nb[3]);
+                        for (int i1 = 0; i1 < dst->ne[1]; ++i1) {
+                            for (int i0 = 0; i0 < dst->ne[0]; ++i0) {
+                                memcpy(y, x + i0*src0->nb[0] + i1*src0->nb[1], sizeof(float));
+                                ++y;
+                            }
+                        }
+                    }
+                }
+            }
+            return;
+        }
+    }
+
    // parallelize by rows
    const int nr = ne01;
    // number of rows per thread
@@ -15435,6 +15458,20 @@ static void ggml_compute_forward_silu_f32(
    const int ith = params->ith;
    const int nth = params->nth;

+    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+        const int k_block_size = 1024;
+        int nelem = ggml_nelements(src0);
+        int nblock = (nelem + k_block_size - 1)/k_block_size;
+        for (int ib = ith; ib < nblock; ib += nth) {
+            int first = ib*k_block_size;
+            const float * x = (const float *)src0->data + first;
+                  float * y = (      float *) dst->data + first;
+            int n = first + k_block_size <= nelem ? k_block_size : nelem - first;
+            ggml_vec_silu_f32(n, y, x);
+        }
+        return;
+    }
+
    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);