Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
iqk_r4 quants: use MMQ only for batches < 1024 tokens
@@ -196,11 +196,13 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         case GGML_TYPE_IQ2_KT:
         case GGML_TYPE_IQ3_KT:
         case GGML_TYPE_IQ4_KT:
             mmq_supported = true;
             break;
         case GGML_TYPE_IQ2_K_R4:
         case GGML_TYPE_IQ3_K_R4:
         case GGML_TYPE_IQ4_K_R4:
         case GGML_TYPE_IQ5_K_R4:
-            mmq_supported = true;
+            mmq_supported = ne11 < 1024;
             break;
         default:
             mmq_supported = false;
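The diff boils down to one rule change: for the row-interleaved IQ*_K_R4 quants, ggml_cuda_should_use_mmq now reports MMQ as usable only while ne11, the number of tokens in the incoming batch, stays below 1024; larger batches presumably take the non-MMQ (dequantize + GEMM) path instead. Below is a minimal standalone sketch of that gating logic, assuming a stand-in QuantType enum and a hypothetical should_use_mmq wrapper; only the predicate itself (MMQ for _R4 quants iff ne11 < 1024) comes from this commit.

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the ggml_type values named in the real switch;
    // these enumerators are illustrative, not the library's ids.
    enum class QuantType { IQ2_KT, IQ3_KT, IQ4_KT,
                           IQ2_K_R4, IQ3_K_R4, IQ4_K_R4, IQ5_K_R4,
                           Other };

    // Sketch of the gating rule from the commit: the _R4 interleaved
    // quants get MMQ only while the batch (ne11 = tokens in src1) is
    // below 1024; everything else keeps its previous behavior.
    static bool should_use_mmq(QuantType type, int64_t ne11) {
        switch (type) {
            case QuantType::IQ2_KT:
            case QuantType::IQ3_KT:
            case QuantType::IQ4_KT:
                return true;            // KT quants: MMQ for any batch size
            case QuantType::IQ2_K_R4:
            case QuantType::IQ3_K_R4:
            case QuantType::IQ4_K_R4:
            case QuantType::IQ5_K_R4:
                return ne11 < 1024;     // the change in this commit
            default:
                return false;
        }
    }

    int main() {
        // Small batch stays on MMQ; large batch falls off it.
        printf("IQ4_K_R4,  512 tokens -> MMQ? %d\n",
               should_use_mmq(QuantType::IQ4_K_R4,  512));
        printf("IQ4_K_R4, 2048 tokens -> MMQ? %d\n",
               should_use_mmq(QuantType::IQ4_K_R4, 2048));
        return 0;
    }

The cutoff reflects a throughput trade-off: quantized matrix multiply kernels tend to win at small batch sizes, while for large batches a dense GEMM over dequantized weights can be faster, so gating on ne11 picks the cheaper path per call.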