Fix MMQ when running with a quantized K cache without flash attention (FA)

2026-02-23 22:54:10 +00:00 · 2025-08-08 13:42:03 +03:00
parent 0dce7f9128
commit f747cbca08
1 changed file with 1 addition and 1 deletion
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
    const int64_t src1_padded_row_size, cudaStream_t stream) {

    const int64_t ne00 = src0->ne[0];
-    const int64_t nb01 = src0->nb[1];
+    const int64_t nb01 = ggml_row_size(src0->type, ne00);

    const int64_t ne10 = src1->ne[0];
    const int64_t ne11 = src1->ne[1];