diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 9474eba2..73ab0b73 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2869,6 +2869,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIV:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SCALE:
+        case GGML_OP_SOFTCAP:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_CLAMP:
diff --git a/ggml/src/ggml-cuda/softcap.cu b/ggml/src/ggml-cuda/softcap.cu
index 3b3e975f..499025d1 100644
--- a/ggml/src/ggml-cuda/softcap.cu
+++ b/ggml/src/ggml-cuda/softcap.cu
@@ -1,18 +1,19 @@
 #include "softcap.cuh"
 
-static __global__ void softcap_f32(const float * x, float * dst, const float * scales, const int k) {
+static __global__ void softcap_f32(const float * x, float * dst, float s_before, float s_after, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = scales[1] * tanh(scales[0]*x[i]);
+    float xi = s_before*x[i];
+    dst[i] = s_after * tanhf(xi);
 }
 
-static void softcap_f32_cuda(const float * x, float * dst, const float * scales, const int k, cudaStream_t stream) {
+static void softcap_f32_cuda(const float * x, float * dst, float s_before, float s_after, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SOFTCAP_BLOCK_SIZE - 1) / CUDA_SOFTCAP_BLOCK_SIZE;
-    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, scales, k);
+    softcap_f32<<<num_blocks, CUDA_SOFTCAP_BLOCK_SIZE, 0, stream>>>(x, dst, s_before, s_after, k);
 }
 
 void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -27,5 +28,5 @@ void ggml_cuda_op_softcap(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float scales[2];
     memcpy(scales, dst->op_params, sizeof(scales));
 
-    softcap_f32_cuda(src0_d, dst_d, scales, ggml_nelements(src0), stream);
+    softcap_f32_cuda(src0_d, dst_d, scales[0], scales[1], ggml_nelements(src0), stream);
 }