diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index f594cd26..9474eba2 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -24,6 +24,7 @@
 #include "ggml-cuda/quantize.cuh"
 #include "ggml-cuda/rope.cuh"
 #include "ggml-cuda/scale.cuh"
+#include "ggml-cuda/softcap.cuh"
 #include "ggml-cuda/softmax.cuh"
 #include "ggml-cuda/sumrows.cuh"
 #include "ggml-cuda/tsembd.cuh"
@@ -2261,6 +2262,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SCALE:
             ggml_cuda_op_scale(ctx, dst);
             break;
+        case GGML_OP_SOFTCAP:
+            ggml_cuda_op_softcap(ctx, dst);
+            break;
         case GGML_OP_SQR:
             ggml_cuda_op_sqr(ctx, dst);
             break;
diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu
new file mode 100644
index 00000000..3b3e975f
--- /dev/null
+++ b/ggml/src/ggml-cuda/softcap.cu
@@ -0,0 +1,34 @@
+#include "softcap.cuh"
+
+// dst[i] = softcap * tanhf(scale * x[i]), one thread per element, guarded for the grid tail.
+static __global__ void softcap_f32(const float * x, float * dst, const float scale, const float softcap, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = softcap * tanhf(scale*x[i]);
+}
+
+static void softcap_f32_cuda(const float * x, float * dst, const float scale, const float softcap, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
+    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scale, softcap, k);
+}
+
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const float * src0_d = (const float *)src0->data;
+    float * dst_d = (float *)dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    // op_params packs the two scalars: scales[0] = pre-tanh scale, scales[1] = softcap value.
+    // They are forwarded by value — the array lives on the host stack and must not reach the kernel as a pointer.
+    float scales[2];
+    memcpy(scales, dst->op_params, sizeof(scales));
+
+    softcap_f32_cuda(src0_d, dst_d, scales[0], scales[1], ggml_nelements(src0), stream);
+}
diff --git a/ggml/src/ggml-cuda/softcap.cuh b/ggml/src/ggml-cuda/softcap.cuh
new file mode 100644
index 00000000..2b875bfb
--- /dev/null
+++ b/ggml/src/ggml-cuda/softcap.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_SOFTCAP_BLOCK_SIZE 256
+
+void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst);