iqk_r4 quants: use MMQ only for batches < 1024 tokens

Author: Iwan Kawrakow
Date:   2025-06-25 14:47:59 +03:00
parent  9b273bf437
commit  b3417c9366


@@ -196,11 +196,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_IQ2_KT:
         case GGML_TYPE_IQ3_KT:
         case GGML_TYPE_IQ4_KT:
+            mmq_supported = true;
+            break;
         case GGML_TYPE_IQ2_K_R4:
         case GGML_TYPE_IQ3_K_R4:
         case GGML_TYPE_IQ4_K_R4:
         case GGML_TYPE_IQ5_K_R4:
-            mmq_supported = true;
+            mmq_supported = ne11 < 1024;
             break;
         default:
             mmq_supported = false;
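
Here ne11 is the second dimension of the activation matrix src1, i.e. the number of tokens in the batch, so after this change the R4 quants use the MMQ kernels only for batches below 1024 tokens, while the KT (trellis) quants keep using MMQ at any batch size. A minimal standalone sketch of the resulting dispatch rule follows (hypothetical harness: the enum is reduced to three stand-in values, and the compute-capability checks the real ggml_cuda_should_use_mmq performs via its cc parameter are omitted):

    // Sketch of the post-commit dispatch rule. Names prefixed SKETCH_ are
    // stand-ins, not real ggml_type values.
    #include <cstdint>
    #include <cstdio>

    enum ggml_type_sketch {
        SKETCH_IQ4_KT,    // trellis quant: MMQ at any batch size
        SKETCH_IQ4_K_R4,  // row-interleaved quant: MMQ capped below 1024 tokens
        SKETCH_OTHER,
    };

    static bool should_use_mmq(ggml_type_sketch type, int64_t ne11) {
        bool mmq_supported;
        switch (type) {
            case SKETCH_IQ4_KT:
                mmq_supported = true;
                break;
            case SKETCH_IQ4_K_R4:
                mmq_supported = ne11 < 1024;  // large batches take the non-MMQ path
                break;
            default:
                mmq_supported = false;
                break;
        }
        return mmq_supported;
    }

    int main() {
        printf("R4,  512 tokens: %d\n", should_use_mmq(SKETCH_IQ4_K_R4,  512)); // 1 -> MMQ
        printf("R4, 2048 tokens: %d\n", should_use_mmq(SKETCH_IQ4_K_R4, 2048)); // 0 -> fallback
        return 0;
    }

Above the 1024-token cutoff the R4 types report MMQ as unsupported, which lets the matmul dispatch pick a different route (e.g. dequantize + cuBLAS), presumably faster than the R4 MMQ kernels at large batch sizes.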