From 2cf2fc2a2f20035fe96fa9be7f624160b793151e Mon Sep 17 00:00:00 2001
From: Kawrakow
Date: Fri, 8 Aug 2025 13:51:14 +0300
Subject: [PATCH] Fix quantized K cache without FA (#680)

* Prevent assert with quantized K cache and no FA

* Fix MMQ when running with quantized K cache without FA

---------

Co-authored-by: Iwan Kawrakow
---
 ggml/src/ggml-cuda.cu     | 2 +-
 ggml/src/ggml-cuda/mmq.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu
index 7fee71d8..67d9828c 100644
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
     }
 
     const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {
+    if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {
         //printf("invoking fast path for %s x %s\n", src0->name, src1->name);
         int id = ctx.device;
         char * src0_dd_i = dev[id].src0_dd;
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index a0e7da12..1e3accf0 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
     const int64_t src1_padded_row_size, cudaStream_t stream) {
 
     const int64_t ne00 = src0->ne[0];
-    const int64_t nb01 = src0->nb[1];
+    const int64_t nb01 = ggml_row_size(src0->type, ne00);
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];