From 5b1efbe4988208223eb3ca9b165a30584e5dddaa Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 23 Oct 2025 11:01:59 +0300
Subject: [PATCH] fused mul+multi_add: CUDA

---
 ggml/src/ggml-cuda.cu           |  5 ++
 ggml/src/ggml-cuda/multiadd.cu  | 87 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/multiadd.cuh | 14 ++++++
 ggml/src/ggml-cuda/unary.cu     | 36 --------------
 4 files changed, 106 insertions(+), 36 deletions(-)
 create mode 100644 ggml/src/ggml-cuda/multiadd.cu
 create mode 100644 ggml/src/ggml-cuda/multiadd.cuh

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 29f9d26c..9f7fd33f 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -46,6 +46,7 @@
 #include "ggml-cuda/conv2d-dw.cuh"
 #include "ggml-cuda/set-rows.cuh"
 #include "ggml-cuda/argmax.cuh"
+#include "ggml-cuda/multiadd.cuh"
 
 #include <algorithm>
 #include <array>
@@ -3178,6 +3179,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_MULTI_ADD:
             ggml_cuda_op_multi_add(ctx, dst);
             break;
+        case GGML_OP_MUL_MULTI_ADD:
+            ggml_cuda_op_mul_multi_add(ctx, dst);
+            break;
         case GGML_OP_ACC:
             ggml_cuda_op_acc(ctx, dst);
             break;
@@ -4408,6 +4412,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_ADD:
         case GGML_OP_ADD_ID:
         case GGML_OP_MULTI_ADD:
+        case GGML_OP_MUL_MULTI_ADD:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_FUSED_RMS_NORM:
diff --git a/ggml/src/ggml-cuda/multiadd.cu b/ggml/src/ggml-cuda/multiadd.cu
new file mode 100644
index 00000000..fba7271a
--- /dev/null
+++ b/ggml/src/ggml-cuda/multiadd.cu
@@ -0,0 +1,87 @@
+#include "multiadd.cuh"
+
+static __global__ void multi_add_f32(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, const char * src0, char * dst) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+    int64_t k = ne0*ne1;
+    if (i >= k) {
+        return;
+    }
+    int i1 = i / ne0;
+    int i0 = i % ne0;
+    float * result = (float *)(dst + i1*nb1);
+    const float * s = (const float *)(src0 + i1*nb01) + i0;
+    if (nused == 1) {
+        result[i0] = s[0];
+    } else {
+        float sum = s[0] + s[ne0];
+        for (int j = 2; j < nused; ++j) sum += s[j*ne0];
+        result[i0] = sum;
+    }
+}
+
+static void multi_add_f32_cuda(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, const char * src0, char * dst, cudaStream_t stream) {
+    int64_t k = ne0 * ne1;
+    const int num_blocks = (k + CUDA_MULTI_ADD_BLOCK_SIZE - 1) / CUDA_MULTI_ADD_BLOCK_SIZE;
+    multi_add_f32<<<num_blocks, CUDA_MULTI_ADD_BLOCK_SIZE, 0, stream>>>(nused, ne0, ne1, nb1, nb01, src0, dst);
+}
+
+void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[2] == 1 && dst->ne[3] == 1);
+    GGML_ASSERT(dst->nb[0] == sizeof(float));
+    int nused = dst->op_params[0];
+    GGML_ASSERT(nused >= 1);
+    const char * src0 = (const char *)dst->src[0]->data;
+    cudaStream_t stream = ctx.stream();
+    multi_add_f32_cuda(nused, dst->ne[0], dst->ne[1], dst->nb[1], dst->src[0]->nb[1], src0, (char *)dst->data, stream);
+}
+
+
+static __global__ void mul_multi_add_f32(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, int64_t nb02, int64_t nb11, int64_t nb12, const char * src0, const char * src1, char * dst) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+    int64_t k = ne0*ne1;
+    if (i >= k) {
+        return;
+    }
+    int i1 = i / ne0;
+    int i0 = i % ne0;
+    float * result = (float *)(dst + i1*nb1);
+
+    auto c0 = src0 + i1*nb02;
+    auto c1 = src1 + i1*nb12;
+
+    float sum = 0;
+    for (int j = 0; j < nused; ++j) {
+        auto x0 = (const float *)c0;
+        auto x1 = (const float *)c1;
+        sum += x0[i0] * x1[0];
+        c0 += nb01;
+        c1 += nb11;
+    }
+    result[i0] = sum;
+}
+
+static void mul_multi_add_f32_cuda(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, int64_t nb02, int64_t nb11, int64_t nb12,
+        const char * src0, const char * src1, char * dst, cudaStream_t stream) {
+    int64_t k = ne0 * ne1;
+    const int num_blocks = (k + CUDA_MULTI_ADD_BLOCK_SIZE - 1) / CUDA_MULTI_ADD_BLOCK_SIZE;
+    mul_multi_add_f32<<<num_blocks, CUDA_MULTI_ADD_BLOCK_SIZE, 0, stream>>>(nused, ne0, ne1, nb1, nb01, nb02, nb11, nb12, src0, src1, dst);
+}
+
+void ggml_cuda_op_mul_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    auto src0 = dst->src[0];
+    auto src1 = dst->src[1];
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[0] == dst->ne[0]);
+    GGML_ASSERT(src0->ne[2] == dst->ne[1]);
+    GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+    GGML_ASSERT(src0->ne[2] == src1->ne[2]);
+    GGML_ASSERT(src0->ne[3] == src1->ne[3]);
+    GGML_ASSERT(src0->ne[3] == 1);
+    GGML_ASSERT(src1->ne[0] == 1);
+
+    mul_multi_add_f32_cuda(src0->ne[1], dst->ne[0], dst->ne[1], dst->nb[1], src0->nb[1], src0->nb[2], src1->nb[1], src1->nb[2],
+        (const char *)src0->data, (const char *)src1->data, (char *)dst->data, ctx.stream());
+}
diff --git a/ggml/src/ggml-cuda/multiadd.cuh b/ggml/src/ggml-cuda/multiadd.cuh
new file mode 100644
index 00000000..f923597b
--- /dev/null
+++ b/ggml/src/ggml-cuda/multiadd.cuh
@@ -0,0 +1,14 @@
+//
+// Copyright (C) 2023-2024 The ggml authors
+// Copyright (C) 2024 Iwan Kawrakow
+// MIT license
+// SPDX-License-Identifier: MIT
+//
+
+#include "common.cuh"
+
+#define CUDA_MULTI_ADD_BLOCK_SIZE 256
+
+void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_mul_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu
index 090f5e86..49f22b98 100644
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -59,25 +59,6 @@ static __global__ void fused_mul_silu_f32(const float * x, const float * y, floa
     dst[i] = x[i] * y[i] / (1.0f + expf(-x[i]));
 }
 
-static __global__ void multi_add_f32(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, const char * src0, char * dst) {
-    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
-    int64_t k = ne0*ne1;
-    if (i >= k) {
-        return;
-    }
-    int i1 = i / ne0;
-    int i0 = i % ne0;
-    float * result = (float *)(dst + i1*nb1);
-    const float * s = (const float *)(src0 + i1*nb01) + i0;
-    if (nused == 1) {
-        result[i0] = s[0];
-    } else {
-        float sum = s[0] + s[ne0];
-        for (int j = 2; j < nused; ++j) sum += s[j*ne0];
-        result[i0] = sum;
-    }
-}
-
 static __global__ void fused_mul_relu_f32(const float * x, const float * y, float * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
@@ -261,23 +242,6 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
     sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void multi_add_f32_cuda(int nused, int64_t ne0, int64_t ne1, int64_t nb1, int64_t nb01, const char * src0, char * dst, cudaStream_t stream) {
-    int64_t k = ne0 * ne1;
-    const int num_blocks = (k + CUDA_MULTI_ADD_BLOCK_SIZE - 1) / CUDA_MULTI_ADD_BLOCK_SIZE;
-    multi_add_f32<<<num_blocks, CUDA_MULTI_ADD_BLOCK_SIZE, 0, stream>>>(nused, ne0, ne1, nb1, nb01, src0, dst);
-}
-
-void ggml_cuda_op_multi_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[2] == 1 && dst->ne[3] == 1);
-    GGML_ASSERT(dst->nb[0] == sizeof(float));
-    int nused = dst->op_params[0];
-    GGML_ASSERT(nused >= 1);
-    const char * src0 = (const char *)dst->src[0]->data;
-    cudaStream_t stream = ctx.stream();
-    multi_add_f32_cuda(nused, dst->ne[0], dst->ne[1], dst->nb[1], dst->src[0]->nb[1], src0, (char *)dst->data, stream);
-}
-
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     const float * src0_d = (const float *)src0->data;