From df066ced5ea114f2b379a803eb91ec71ceb77b68 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Sat, 30 Aug 2025 11:29:27 +0300
Subject: [PATCH] Seems to be working on CUDA

For a dense model we get a 2-3% speedup for PP (prompt processing) and ~0.6% for TG (token generation).
---
 ggml/src/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 0d584b29..d8b1a2aa 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2996,7 +2996,7 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
     ggml_cuda_pool_alloc<float> dst_up(ctx.pool(), ggml_nelements(dst));
     ggml_cuda_pool_alloc<char> src1_quantized(ctx.pool(), quantized_size);
     if (src1->ne[1] <= 8) {
-        quantize_row_q8_1_cuda((const float *)src1->data, (void *)src1_quantized.get(), src1->ne[0], src1->ne[1], 1, nb10_padded,
+        quantize_row_q8_1_cuda((const float *)src1->data, (void *)src1_quantized.get(), src1->ne[0], src1->ne[1], 1, ne10_padded,
                 src0_1->type, stream);
         CUDA_CHECK(cudaGetLastError());
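
Why the one-identifier fix matters: quantize_row_q8_1_cuda is being given the padded row length in elements (ne10_padded), whereas nb10_padded, following ggml's ne/nb naming convention, is a padded byte stride, i.e. larger by a factor of sizeof(float). Below is a minimal sketch of the distinction, not part of the patch; the GGML_PAD macro and the MATRIX_ROW_PADDING value of 512 are taken from ggml's headers, and the concrete numbers are illustrative assumptions:

    #include <cstddef>
    #include <cstdint>

    // GGML_PAD as defined in ggml.h; MATRIX_ROW_PADDING as in ggml-cuda's common code.
    #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
    static const int64_t MATRIX_ROW_PADDING = 512;

    int main() {
        const int64_t ne10        = 4096;                                // row length of src1 in float elements
        const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING);  // padded element count: what the quantizer expects
        const size_t  nb10_padded = ne10_padded * sizeof(float);         // padded byte stride: 4x larger on float data
        // Passing nb10_padded where an element count is expected would make the
        // q8_1 quantization kernel space its rows 4x too far apart.
        return (ne10_padded == 4096 && nb10_padded == 16384) ? 0 : 1;
    }

With ne10 already a multiple of the padding, the two quantities differ only by the sizeof(float) factor, which is presumably why the bug could go unnoticed on some shapes while corrupting the quantized activations on others.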