mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-01-26 09:09:50 +00:00
Add condition: take the glm45_flash_attention path only when Q->ne[1] == 1 and K->ne[1]*K->ne[2] >= 65536
This commit is contained in:
@@ -217,7 +217,8 @@ void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tens
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
|
||||
- if (gqa_ratio == 12) {
+ if (gqa_ratio == 12 && Q->ne[1] == 1 && K->ne[1]*K->ne[2] >= 65536) {
+ // This is a hack to improve GLM-4.5/4.6/4.7/AIR TG performance
|
||||
glm45_flash_attention(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user