Mirror of https://github.com/ikawrakow/ik_llama.cpp.git
llama: enable K-shift for quantized KV cache for cuda (#760)
cuda: add q8_0->f32 cpy operation (#9571)

It will fail on unsupported backends or quant types.

Co-authored-by: Ivan <nekotekina@gmail.com>
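Since the message warns that the copy "will fail on unsupported backends or quant types", a caller can probe support before relying on it. Below is a minimal sketch using the real ggml entry points ggml_cpy() and ggml_backend_supports_op(); the helper name and tensor sizes are illustrative, not part of this commit.

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper: build a GGML_OP_CPY node from Q8_0 to F32 and ask
// the backend whether it supports it (this is what the diff below gates).
static bool backend_can_cpy_q8_0_to_f32(ggml_backend_t backend) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 4,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, no tensor data needed
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_Q8_0, 256);
    struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,  256);
    struct ggml_tensor * op  = ggml_cpy(ctx, src, dst); // GGML_OP_CPY node

    const bool ok = ggml_backend_supports_op(backend, op);
    ggml_free(ctx);
    return ok;
}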
@@ -4119,6 +4119,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
         return true;
     }
+    if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_F32) {
+        return true;
+    }
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
         return true;
     }
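With the F32 -> Q8_0 copy already supported (the context lines above), adding the reverse direction lets the K-shift round-trip a quantized K cache: presumably dequantize to f32, apply the RoPE shift, and requantize back. Below is a minimal sketch of what a Q8_0 -> F32 dequantizing copy kernel can look like, assuming ggml's standard Q8_0 block layout (one fp16 scale d plus 32 int8 quants per block, dequantized as y = d * q). The kernel name and launch shape are hypothetical, not the actual ik_llama.cpp implementation, which also has to handle non-contiguous tensors.

#include <cuda_fp16.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    half   d;           // per-block fp16 scale
    int8_t qs[QK8_0];   // 32 quantized values
} block_q8_0;

static __global__ void cpy_q8_0_to_f32(const block_q8_0 * src, float * dst,
                                       const int64_t n) {
    const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }
    const block_q8_0 * b = src + i / QK8_0; // block owning element i
    dst[i] = __half2float(b->d) * (float)b->qs[i % QK8_0];
}

// illustrative launch over n contiguous elements (n a multiple of QK8_0):
//   cpy_q8_0_to_f32<<<(n + 255) / 256, 256>>>(src, dst, n);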