From 60afe8d983e29fe322b458954fa99fce2c491ab7 Mon Sep 17 00:00:00 2001 From: turboderp <11859846+turboderp@users.noreply.github.com> Date: Sat, 7 Mar 2026 20:48:24 +0100 Subject: [PATCH] Update README.md --- README.md | 9 ++++----- exllamav3/modules/gated_delta_net.py | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 00e4f66..8a2ee5d 100644 --- a/README.md +++ b/README.md @@ -17,11 +17,10 @@ The official and recommended backend server for ExLlamaV3 is [TabbyAPI](https:// ### ⚠️ Important -- **Qwen3-Next** support is currently experimental and still requires some optimization, so don't expect - optimal performance just yet. [Flash Linear Attention](https://github.com/fla-org/flash-linear-attention) is required - and this in turn requires Triton. [causal-conv1d](https://github.com/Dao-AILab/causal-conv1d) is supported and - recommended but not required. -- **Qwen3-Next** currently does not support tensor/expert parallelism. +- **Qwen3-Next** and **Qwen3.5** can take advantage of [Flash Linear Attention](https://github.com/fla-org/flash-linear-attention), though this requires + Triton, and performance can be shaky due to the sporadic JIT compilation it imposes. [causal-conv1d](https://github.com/Dao-AILab/causal-conv1d) is + supported and recommended but not required. +- **Qwen3-Next** and **Qwen3.5** currently do not support tensor/expert parallelism. ## Architecture support diff --git a/exllamav3/modules/gated_delta_net.py b/exllamav3/modules/gated_delta_net.py index bd89173..6384db1 100644 --- a/exllamav3/modules/gated_delta_net.py +++ b/exllamav3/modules/gated_delta_net.py @@ -616,8 +616,7 @@ class GatedDeltaNet(Module): ) # Use chunked rule when advantageous and available - # TODO: At least warn if chunked rule (i.e. flash-linear-attention) is not available - # since performance will tank on prompt ingestion + # TODO: Replace chunked fn with non-Triton implementation if seqlen >= self.num_v_heads and chunk_gated_delta_rule is not None: mixed_qkv = mixed_qkv.transpose(1, 2)