Fix quantized K cache without FA (#680)

* Prevent an assertion failure when using a quantized K cache without flash attention (FA)

* Fix MMQ (quantized matrix multiplication) when running with a quantized K cache without FA

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
Kawrakow
2025-08-08 13:51:14 +03:00
committed by Iwan Kawrakow
parent 7388d9be8d
commit 2cf2fc2a2f
2 changed files with 2 additions and 2 deletions

View File

@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
}
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1) {
if (!(split && used_devices > 1) && quantization_done && ne11 == 1 && ne12 > 1 && ne13 == 1 && ne02 == ne12 && ne02 == dst->ne[2]) {
//printf("invoking fast path for %s x %s\n", src0->name, src1->name);
int id = ctx.device;
char * src0_dd_i = dev[id].src0_dd;

View File

@@ -14,7 +14,7 @@ void ggml_cuda_op_mul_mat_q(
const int64_t src1_padded_row_size, cudaStream_t stream) {
const int64_t ne00 = src0->ne[0];
const int64_t nb01 = src0->nb[1];
const int64_t nb01 = ggml_row_size(src0->type, ne00);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];