Add condition: take the glm45_flash_attention path only when Q->ne[1] == 1 and K->ne[1]*K->ne[2] >= 65536

2026-03-12 23:10:01 +00:00 · 2026-01-25 06:52:04 +00:00
parent d08481d0f4
commit aff7aa0cf6
1 changed files with 2 additions and 1 deletions
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cu
@@ -217,7 +217,8 @@ void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tens
    GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
    const int gqa_ratio = Q->ne[2] / K->ne[2];

-    if (gqa_ratio == 12) {
+    if (gqa_ratio == 12 && Q->ne[1] == 1 && K->ne[1]*K->ne[2] >= 65536) {
+        // Hack: dedicated path to improve token-generation (TG) performance for GLM-4.5/4.6/4.7/AIR; only taken when Q->ne[1] == 1 and K->ne[1]*K->ne[2] >= 65536
        glm45_flash_attention(ctx, dst);
        return;
    }