From eb1686a84074e0d0ef9c72831fa641ee8ad31a5d Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Tue, 3 Mar 2026 22:33:58 +0100 Subject: [PATCH] GatedDeltaNet: Set chunked seqlen threshold to num_v_heads (prevents warning from FLA) --- exllamav3/modules/gated_delta_net.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exllamav3/modules/gated_delta_net.py b/exllamav3/modules/gated_delta_net.py index 0d1982e..bd89173 100644 --- a/exllamav3/modules/gated_delta_net.py +++ b/exllamav3/modules/gated_delta_net.py @@ -618,7 +618,7 @@ class GatedDeltaNet(Module): # Use chunked rule when advantageous and available # TODO: At least warn if chunked rule (i.e. flash-linear-attention) is not available # since performance will tank on prompt ingestion - if seqlen >= 32 and chunk_gated_delta_rule is not None: + if seqlen >= self.num_v_heads and chunk_gated_delta_rule is not None: mixed_qkv = mixed_qkv.transpose(1, 2) q, k, v = torch.split(mixed_qkv, [self.k_dim, self.k_dim, self.v_dim], dim = -1)