From 73b8fea90b66051651d065116848696fbd853d1c Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Sat, 17 Jan 2026 17:25:57 +0000
Subject: [PATCH] This finally works

---
 ggml/src/ggml-cuda/binbcast.cu  | 105 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/binbcast.cuh |   3 +
 ggml/src/ggml-cuda/norm.cu      |  10 ++-
 ggml/src/ggml-cuda/reduce.cu    |   9 ++-
 src/llama-build-context.cpp    |   2 +-
 5 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 10df89a5..3440ac01 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -330,7 +330,92 @@ static __global__ void k_fast_add_2(int64_t ne0, int64_t nelem, const src1_t * x
     z[i] = (dst_t)((float)x[i] + (float)y[i]);
 }
 
+template <int block_size, typename data_t>
+static __global__ void k_add_same(int64_t nelem, const data_t * x, const data_t * y, data_t * z) {
+    int64_t i = block_size*blockIdx.x + threadIdx.x;
+    if (i >= nelem) {
+        return;
+    }
+    z[i] = x[i] + y[i];
+}
+
+template <int block_size>
+static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
+    int i = blockIdx.x*block_size + threadIdx.x;
+    if (i >= nelem) return;
+    int ib = i / QK8_0;
+    int iq = i % QK8_0;
+    float sum = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
+    float asum = fabsf(sum);
+    float max = warp_reduce_max(asum);
+    float d = max / 127;
+    float id = d > 0 ? 1/d : 0;
+    z[ib].qs[iq] = roundf(sum * id);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        z[ib].d = (half)d;
+    }
+}
+
+template <int block_size>
+static __global__ void k_add_same_q8_0(int nelem, const block_q8_0 * x, const float * y, block_q8_0 * z) {
+    int i = blockIdx.x*block_size + threadIdx.x;
+    if (i >= nelem) return;
+    int ib = i / QK8_0;
+    int iq = i % QK8_0;
+    float sum = (float)x[ib].d * x[ib].qs[iq] + y[i];
+    float asum = fabsf(sum);
+    float max = warp_reduce_max(asum);
+    float d = max / 127;
+    float id = d > 0 ? 1/d : 0;
+    z[ib].qs[iq] = roundf(sum * id);
+    if (threadIdx.x % WARP_SIZE == 0) {
+        z[ib].d = (half)d;
+    }
+}
+
+//static __global__ void k_add_same_q8_0(const block_q8_0 * x, const block_q8_0 * y, block_q8_0 * z) {
+//    int ib = blockIdx.x;
+//    int iq = threadIdx.x;
+//    float s = (float)x[ib].d * x[ib].qs[iq] + (float)y[ib].d * y[ib].qs[iq];
+//    float as = fabsf(s);
+//    as = warp_reduce_max(as);
+//    float d = as / 127;
+//    float id = d > 0 ? 1/d : 0;
+//    z[ib].qs[iq] = roundf(s * id);
+//    if (threadIdx.x == 0) {
+//        z[ib].d = (half)d;
+//    }
+//}
+
+void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
+        const void * x, const void * y, void * z) {
+    constexpr int kBlockSize = 256;
+    int nblocks = (nelem + kBlockSize - 1)/kBlockSize;
+    if (type == GGML_TYPE_F32) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const float *)x, (const float *)y, (float *)z);
+    } else if (type == GGML_TYPE_F16) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const half *)x, (const half *)y, (half *)z);
+    } else if (type == GGML_TYPE_BF16) {
+        k_add_same<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const nv_bfloat16 *)x, (const nv_bfloat16 *)y, (nv_bfloat16 *)z);
+    } else if (type == GGML_TYPE_Q8_0) {
+        k_add_same_q8_0<kBlockSize><<<nblocks, kBlockSize, 0, ctx.stream()>>>(nelem,
+                (const block_q8_0 *)x, (const block_q8_0 *)y, (block_q8_0 *)z);
+    } else {
+        GGML_ABORT("Unsupported add operation");
+    }
+}
+
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    if (dst->src[0]->type == dst->src[1]->type && dst->src[0]->type == dst->type &&
+        ggml_is_contiguous(dst->src[0]) && ggml_is_contiguous(dst->src[1]) && ggml_is_contiguous(dst) &&
+        ggml_are_same_shape(dst->src[0], dst->src[1])) {
+        //printf("%s(%s, %s): using fast same\n", __func__, dst->name, ggml_type_name(dst->type));
+        ggml_op_add_same_type(ctx, dst->type, ggml_nelements(dst), dst->src[0]->data, dst->src[1]->data, dst->data);
+        return;
+    }
     if (ggml_nrows(dst->src[1]) == 1 && dst->src[0]->ne[0] == dst->src[1]->ne[0] && dst->type == GGML_TYPE_F32 &&
         dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32 &&
         ggml_are_same_shape(dst, dst->src[0]) && ggml_is_contiguous(dst)) {
@@ -361,6 +446,26 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
             k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
                 (const float *)dst->src[0]->data, (const half *)dst->src[1]->data, (half *)dst->data);
         }
+    } else if (dst->type == GGML_TYPE_BF16) {
+        if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_BF16) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const nv_bfloat16 *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+        else if (dst->src[0]->type == GGML_TYPE_BF16 && dst->src[1]->type == GGML_TYPE_F32) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const nv_bfloat16 *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+        else if (dst->src[0]->type == GGML_TYPE_F32 && dst->src[1]->type == GGML_TYPE_F32) {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const float *)dst->src[0]->data, (const float *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        } else {
+            k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
+                (const float *)dst->src[0]->data, (const nv_bfloat16 *)dst->src[1]->data, (nv_bfloat16 *)dst->data);
+        }
+    } else if (dst->type == GGML_TYPE_Q8_0) {
+        GGML_ASSERT(dst->src[0]->type == GGML_TYPE_Q8_0 && dst->src[1]->type == GGML_TYPE_F32);
+        k_add_same_q8_0<block_size><<<num_blocks, block_size, 0, ctx.stream()>>>(nelem,
+            (const block_q8_0 *)dst->src[0]->data, (const float *)dst->src[1]->data, (block_q8_0 *)dst->data);
     } else {
         if (dst->src[0]->type == GGML_TYPE_F16 && dst->src[1]->type == GGML_TYPE_F16) {
             k_fast_add_2<<<num_blocks, block_size, 0, ctx.stream()>>>(dst->ne[0], nelem,
diff --git a/ggml/src/ggml-cuda/binbcast.cuh b/ggml/src/ggml-cuda/binbcast.cuh
index 4f63d637..6fb13413 100644
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -4,3 +4,6 @@ void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_op_add_same_type(ggml_backend_cuda_context & ctx, enum ggml_type type, size_t nelem,
+        const void * x, const void * y, void * z);
diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu
index 86f311cd..b198b68e 100644
--- a/ggml/src/ggml-cuda/norm.cu
+++ b/ggml/src/ggml-cuda/norm.cu
@@ -244,7 +244,11 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa
             const float xi = (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
             tmp += xi * xi;
         }
-
+    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
+        for (int col = tid; col < ncols; col += block_size) {
+            const float xi = __bfloat162float(x[row*ncols + col]);
+            tmp += xi * xi;
+        }
     } else {
         for (int col = tid; col < ncols; col += block_size) {
             const float xi = (float)x[row*ncols + col];
@@ -274,6 +278,10 @@ static __global__ void fused_rms_norm_f32(const src_t * x, const float * y, floa
         for (int col = tid; col < ncols; col += block_size) {
             dst[row*ncols + col] = scale * y[col] * (float)xr[col / QK8_0].d * xr[col / QK8_0].qs[col % QK8_0];
         }
+    } else if constexpr (std::is_same_v<src_t, nv_bfloat16>) {
+        for (int col = tid; col < ncols; col += block_size) {
+            dst[row*ncols + col] = scale * y[col] * __bfloat162float(x[row*ncols + col]);
+        }
     } else {
         for (int col = tid; col < ncols; col += block_size) {
             dst[row*ncols + col] = scale * y[col] * (float)x[row*ncols + col];
diff --git a/ggml/src/ggml-cuda/reduce.cu b/ggml/src/ggml-cuda/reduce.cu
index 408ee2fd..18dc8969 100644
--- a/ggml/src/ggml-cuda/reduce.cu
+++ b/ggml/src/ggml-cuda/reduce.cu
@@ -6,19 +6,20 @@ //
 #include "reduce.cuh"
+#include "binbcast.cuh"
 #include "ggml-common.h"
 #include <algorithm>
 
 template <int block_size, typename T>
-static __global__ void k_add(int nelem, const T * src, T * dst) {
+static __global__ void k_add(int nelem, const T * __restrict__ src, T * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
     dst[i] += src[i];
 }
 
 template <int block_size>
-static __global__ void k_add(int nelem, const block_q8_0 * src, block_q8_0 * dst) {
+static __global__ void k_add(int nelem, const block_q8_0 * __restrict__ src, block_q8_0 * __restrict__ dst) {
     int i = blockIdx.x*block_size + threadIdx.x;
     if (i >= nelem) return;
     int ib = i / QK8_0;
@@ -101,7 +102,7 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
     // It does not work at all if not all GPUs participate in the reduce op, and we
     // get suboptimal prompt processing performance when we have more than 2 GPUs.
     // Hence, if enabled, we use NCCL only for the cases where it works and performs well.
-    if (false && info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
+    if (info.have_nccl && dst->type != GGML_TYPE_Q8_0 && nhave == nreduce && (nhave == 2 || dst->ne[1] < 32)) {
         GGML_ASSERT(info.have_nccl);
         GGML_ASSERT(info.device_count == nreduce);
         auto data_type = dst->type == GGML_TYPE_F32 ? ncclFloat : dst->type == GGML_TYPE_BF16 ? ncclBfloat16 : ncclHalf;
@@ -276,6 +277,8 @@ void ggml_cuda_op_reduce([[maybe_unused]] ggml_backend_cuda_context & ctx, ggml_
             auto this_nelem = std::min(nelem_per_device, nelem - ichunk*nelem_per_device);
             ggml_cuda_set_device(info.all_ctx[i]->device);
             CUDA_CHECK(cudaStreamWaitEvent(info.all_ctx[i]->stream(), info.all_ctx[peer]->copy_event, 0));
+            //ggml_op_add_same_type(ctx, dst->type, this_nelem, info.all_ctx[i]->copy_buffer,
+            //        (const char *)dst->src[i]->data + ichunk*size_per_device, (char *)dst->src[i]->data + ichunk*size_per_device);
             int num_blocks = (this_nelem + CUDA_REDUCE_BLOCK_SIZE - 1)/CUDA_REDUCE_BLOCK_SIZE;
             if (dst->type == GGML_TYPE_F16) {
                 k_add<CUDA_REDUCE_BLOCK_SIZE><<<num_blocks, CUDA_REDUCE_BLOCK_SIZE, 0, info.all_ctx[i]->stream()>>>(this_nelem,
diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp
index d51f8980..e7d20b21 100644
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@@ -9190,7 +9190,7 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
     if (!model.layers[il].wqkv && !model.layers[il].wqk && cparams.flash_attn &&
         model.layers[il].wq->extra && model.layers[il].wk->extra && model.layers[il].wv->extra && model.layers[il].wo->extra) {
         if (kv_self.k_l[il]->extra && kv_self.v_l[il]->extra) {
-            ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
+            //ggml_split_tensor_t * attn_norm = the_attn_norm ? (ggml_split_tensor_t *)the_attn_norm->extra : nullptr;
             auto wq = (ggml_split_tensor_t *)model.layers[il].wq->extra;
             auto wk = (ggml_split_tensor_t *)model.layers[il].wk->extra;
             auto wv = (ggml_split_tensor_t *)model.layers[il].wv->extra;
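
Reviewer note, not part of the patch: the new Q8_0 add paths work by dequantizing both addends, adding in float, and then re-quantizing every 32-element block against the block's new absolute maximum (d = max|sum| / 127); in the kernels that maximum comes from warp_reduce_max, which relies on the 32 lanes of a warp mapping onto exactly one Q8_0 block. The sketch below is a minimal CPU reference of that per-block math, for review only. It assumes QK8_0 == 32 as in ggml-common.h; the struct and function names carrying a _ref suffix are made up for illustration, and the scale is kept as a plain float instead of ggml's half.

// CPU reference for the per-block math in k_add_same_q8_0 (review aid, not ggml code).
#include <cmath>
#include <cstdint>
#include <cstdio>

constexpr int QK8_0_REF = 32;           // block size, matches QK8_0 in ggml-common.h

struct block_q8_0_ref {
    float  d;                           // block scale, value[i] ~= d * qs[i]
    int8_t qs[QK8_0_REF];               // quantized values
};

// z = x + y for one block: dequantize, add in float, re-quantize with a fresh
// scale d = max|sum| / 127 (the kernel obtains the same max via warp_reduce_max
// across the 32 lanes that own the block).
static void add_q8_0_block_ref(const block_q8_0_ref & x, const block_q8_0_ref & y, block_q8_0_ref & z) {
    float sum[QK8_0_REF];
    float amax = 0.0f;
    for (int i = 0; i < QK8_0_REF; ++i) {
        sum[i] = x.d * x.qs[i] + y.d * y.qs[i];
        amax   = std::fmax(amax, std::fabs(sum[i]));
    }
    const float d  = amax / 127.0f;
    const float id = d > 0.0f ? 1.0f/d : 0.0f;
    z.d = d;
    for (int i = 0; i < QK8_0_REF; ++i) {
        z.qs[i] = (int8_t)std::roundf(sum[i] * id);
    }
}

int main() {
    block_q8_0_ref x{0.01f, {}}, y{0.02f, {}}, z{};
    for (int i = 0; i < QK8_0_REF; ++i) {
        x.qs[i] = (int8_t)(i - 16);
        y.qs[i] = (int8_t)(16 - i);
    }
    add_q8_0_block_ref(x, y, z);
    std::printf("new scale d = %g, q[0] = %d\n", (double)z.d, (int)z.qs[0]);
    return 0;
}

One behavioral point worth keeping in mind when reviewing: unlike the F16/BF16 fast paths, the Q8_0 add is not exact with respect to its inputs, since every block is re-quantized against the new per-block maximum after the addition.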