Use fused gemv+add only for token generation (TG), i.e. when src1->ne[1] == 1 (#933)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2026-03-13 15:30:03 +00:00 · 2025-11-10 08:34:24 +02:00
parent 56ee303254
commit bf474e9bff
1 changed files with 1 additions and 1 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2067,7 +2067,7 @@ static int ggml_cuda_mul_mat_q(ggml_backend_cuda_context & ctx, const ggml_tenso

    auto stream = ctx.stream();

-    auto fusion = ctx.fusion;
+    auto fusion = ctx.fusion && src1->ne[1] == 1;

    auto ne10_padded = GGML_PAD(src1->ne[0], MATRIX_ROW_PADDING);
    auto nb10_padded = ne10_padded*sizeof(block_q8_1)/QK8_1;