From 73b8fea90b66051651d065116848696fbd853d1c Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 17 Jan 2026 17:25:57 +0000
Subject: [PATCH] This finally works

---
 ggml/src/ggml-cuda/binbcast.cu  | 105 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/binbcast.cuh |   3 +
 ggml/src/ggml-cuda/norm.cu      |  10 ++-
 ggml/src/ggml-cuda/reduce.cu    |   9 ++-
 src/llama-build-context.cpp    |   2 +-
 5 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 10df89a5..3440ac01 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -330,7 +330,92 @@ static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x
     z[i] = (dst_t)((float)x[i] + (float)y[i]);
 }
 
+template <int block_size, typename data_t>
+static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t * y, data_t * z) {
+    int64_t i = block_size*blockIdx.x + threadIdx.x;
+    if (i >= nelem) {
+        return;
+    }
+    z[i] = x[i] + y[i];
+}
+
+template <int block_size>
+static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
+    int i = blockIdx.x*block_size + threadIdx.x;
+    if (i >= nelem) return;
+    int ib = i / QK8_0;
+    int iq = i % QK8_0;
+    float sum = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
+    float asum = fabsf(sum);
+    float max = warp_reduce_max(asum);
+    float d = max / 127;
+    float id = d > 0 ? 1/d : 0;
+    z[ib].qs[iq] = roundf(sum * id);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        z[ib].d = (half)d;
+    }
+}
+
+template <int block_size>
+static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const float * y, block_q8_0 * z) {
+    int i = blockIdx.x*block_size + threadIdx.x;
+    if (i >= nelem) return;
+    int ib = i / QK8_0;
+    int iq = i % QK8_0;
+    float sum = (float)x[ib].d * x[ib].qs[iq] + y[i];
+    float asum = fabsf(sum);
+    float max = warp_reduce_max(asum);
+    float d = max / 127;
+    float id = d > 0 ? 1/d : 0;
+    z[ib].qs[iq] = roundf(sum * id);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        z[ib].d = (half)d;
+    }
+}
+
+//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
+//    int ib = blockIdx.x;
+//    int iq = threadIdx.x;
+//    float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
+//    float as = fabsf(s);
+//    as = warp_reduce_max(as);
+//    float d = as / 127;
+//    float id = d > 0 ? 1/d : 0;
+//    z[ib].qs[iq] = roundf(s * id);
+//    if (threadIdx.x == 0) {
+//        z[ib].d = (half)d;
+//    }
+//}
+
+void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
+        const void * x, const void * y, void * z) {
+    constexpr int kBlockSize = 256;
+    int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
+    if (type == GGML_TYPE_F32) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const float *)x, (const float *)y, (float *)z);
+    } else if (type == GGML_TYPE_F16) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const half *)x, (const half *)y, (half *)z);
+    } else if (type == GGML_TYPE_BF16) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const nv_bfloat16 *)x, (const nv_bfloat16 *)y, (nv_bfloat16 *)z);
+    } else if (type == GGML_TYPE_Q8_0) {
+        k_add_same_q8_0<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const block_q8_0 *)x, (const block_q8_0 *)y, (block_q8_0 *)z);
+    } else {
+        GGML_ABORT("Unsupported add operation");
+    }
+}
+
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    if (dst->src[0]->type == dst->src[1]->type && dst->src[0]->type == dst->type &&
+        ggml_is_contiguous(dst->src[0]) && ggml_is_contiguous(dst->src[1]) && ggml_is_contiguous(dst) &&
+        ggml_are_same_shape(dst->src[0], dst->src[1])) {
+        //printf("%s(%s, %s): using fast same\n", __func__, dst->name, ggml_type_name(dst->type));
+        ggml_op_add_same_type(ctx, dst->type, ggml_nelements(dst), dst->src[0]->data, dst->src[1]->data, dst->data);
+        return;
+    }
     if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] && dst->type == GGML_TYPE_F32 &&
         dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
         ggml_are_same_shape(dst, dst->src[0]) && ggml_is_contiguous(dst)) {
@@ -361,6 +446,26 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
                 (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
         }
+    } else if (dst->type == GGML_TYPE_BF16) {
+        if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_BF16) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const nv_bfloat16 *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+        else if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_F32) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+        else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        } else {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const float *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+    } else if (dst->type == GGML_TYPE_Q8_0) {
+        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32);
+        k_add_same_q8_0<block_size><<<num_blocks, block_size, 0, ctx.stream()>>>(nelem,
+            (const block_q8_0 *)dst->src[0]->data, (const float *)dst->src[1]->data, (block_q8_0 *)dst->data);
     } else {
         if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
             k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
diff --git a/ggml/src/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh
index 4f63d637..6fb13413 100644
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -4,3 +4,6 @@ void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
+        const void * x, const void * y, void * z);
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index 86f311cd..b198b68e 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -244,7 +244,11 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa
             const float xi = (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
             tmp += xi * xi;
         }
-
+    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
+        for (int col = tid; col < ncols; col += block_size) {
+            const float xi = __bfloat162float(x[row*ncols + col]);
+            tmp += xi * xi;
+        }
     } else {
         for (int col = tid; col < ncols; col += block_size) {
             const float xi = (float)x[row*ncols + col];
@@ -274,6 +278,10 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa
         for (int col = tid; col < ncols; col += block_size) {
             dst[row*ncols + col] = scale * y[col] * (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
         }
+    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
+        for (int col = tid; col < ncols; col += block_size) {
+            dst[row*ncols + col] = scale * y[col] * __bfloat162float(x[row*ncols + col]);
+        }
     } else {
         for (int col = tid; col < ncols; col += block_size) {
             dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col];
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index 408ee2fd..18dc8969 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -6,19 +6,20 @@ //
 #include "reduce.cuh"
+#include "binbcast.cuh"
 #include "ggml-common.h"
 #include <algorithm>
 
 template <int block_size, typename T>
-static __global__ void k_add(int nelem, const T * src, T * dst) {
+static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
     dst[i] += src[i];
 }
 
 template <int block_size>
-static __global__ void k_add(int nelem, const block_q8_0 * src, block_q8_0 * dst) {
+static __global__ void k_add(int nelem, const block_q8_0 * __restrict__ src, block_q8_0 * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
     int ib = i / QK8_0;
@@ -101,7 +102,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (false && info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
         GGML_ASSERT(info.have_nccl);
         GGML_ASSERT(info.device_count == nreduce);
         auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
@@ -276,6 +277,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
             ggml_cuda_set_device(info.all_ctx[i]->device);
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+            //ggml_op_add_same_type(ctx, dst->type, this_nelem, info.all_ctx[i]->copy_buffer,
+            //        (const char *)dst->src[i]->data + ichunk*size_per_device, (char *)dst->src[i]->data + ichunk*size_per_device);
             int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
             if (dst->type == GGML_TYPE_F16) {
                 k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index d51f8980..e7d20b21 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -9190,7 +9190,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
-            ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
+            //ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
             auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
             auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
             auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra;
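
Reviewer note, not part of the patch: the new Q8_0 add paths work by dequantizing both addends, adding in float, and then re-quantizing every 32-element block against the block's new absolute maximum (d = max|sum| / 127); in the kernels that maximum comes from warp_reduce_max, which relies on the 32 lanes of a warp mapping onto exactly one Q8_0 block. The sketch below is a minimal CPU reference of that per-block math, for review only. It assumes QK8_0 == 32 as in ggml-common.h; the struct and function names carrying a _ref suffix are made up for illustration, and the scale is kept as a plain float instead of ggml's half.

// CPU reference for the per-block math in k_add_same_q8_0 (review aid, not ggml code).
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0_REF = 32;           // block size, matches QK8_0 in ggml-common.h

struct block_q8_0_ref {
    float  d;                           // block scale, value[i] ~= d * qs[i]
    int8_t qs[QK8_0_REF];               // quantized values
};

// z = x + y for one block: dequantize, add in float, re-quantize with a fresh
// scale d = max|sum| / 127 (the kernel obtains the same max via warp_reduce_max
// across the 32 lanes that own the block).
static void add_q8_0_block_ref(const block_q8_0_ref & x, const block_q8_0_ref & y, block_q8_0_ref & z) {
    float sum[QK8_0_REF];
    float amax = 0.0f;
    for (int i = 0; i < QK8_0_REF; ++i) {
        sum[i] = x.d * x.qs[i] + y.d * y.qs[i];
        amax   = std::fmax(amax, std::fabs(sum[i]));
    }
    const float d  = amax / 127.0f;
    const float id = d > 0.0f ? 1.0f/d : 0.0f;
    z.d = d;
    for (int i = 0; i < QK8_0_REF; ++i) {
        z.qs[i] = (int8_t)std::roundf(sum[i] * id);
    }
}

int main() {
    block_q8_0_ref x{0.01f, {}}, y{0.02f, {}}, z{};
    for (int i = 0; i < QK8_0_REF; ++i) {
        x.qs[i] = (int8_t)(i - 16);
        y.qs[i] = (int8_t)(16 - i);
    }
    add_q8_0_block_ref(x, y, z);
    std::printf("new scale d = %g, q[0] = %d\n", (double)z.d, (int)z.qs[0]);
    return 0;
}

One behavioral point worth keeping in mind when reviewing: unlike the F16/BF16 fast paths, the Q8_0 add is not exact with respect to its inputs, since every block is re-quantized against the new per-block maximum after the addition.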