mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-01-26 17:20:01 +00:00
Add condition
This commit is contained in:
@@ -217,7 +217,8 @@ void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tens
|
|||||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||||
|
|
||||||
if (gqa_ratio == 12) {
|
if (gqa_ratio == 12 && Q->ne[1] == 1 && K->ne[1]*K->ne[2] >= 65536) {
|
||||||
|
// This is a hack to improve GLM-4.5/4.6/4.7/AIR TG performance
|
||||||
glm45_flash_attention(ctx, dst);
|
glm45_flash_attention(ctx, dst);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user