From 9db029ded5a40c15e0e748aee5055c5b46549b41 Mon Sep 17 00:00:00 2001
From: turboderp <11859846+turboderp@users.noreply.github.com>
Date: Wed, 11 Mar 2026 21:43:45 +0100
Subject: [PATCH] Separate transpose options for fused expert weights (account
 for differences between Qwen3Moe and Qwen3_5Moe)

---
 exllamav3/architecture/qwen3_5.py     | 2 +-
 exllamav3/modules/block_sparse_mlp.py | 4 ++++
 exllamav3/modules/linear.py           | 4 +++-
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/exllamav3/architecture/qwen3_5.py b/exllamav3/architecture/qwen3_5.py
index daff4a2..b64505f 100644
--- a/exllamav3/architecture/qwen3_5.py
+++ b/exllamav3/architecture/qwen3_5.py
@@ -381,7 +381,7 @@ class Qwen3_5BaseModel(Model):
             key_down_split = "experts.down_proj",
             key_routing_gate = "gate",
             key_shared_gate = "shared_expert_gate",
-            transposed_load = False,
+            transpose_fused_weights = False,
             qmap = "block.mlp",
             interm_dtype = torch.half,
             out_dtype = torch.float,
diff --git a/exllamav3/modules/block_sparse_mlp.py b/exllamav3/modules/block_sparse_mlp.py
index 4dbb31e..f463028 100644
--- a/exllamav3/modules/block_sparse_mlp.py
+++ b/exllamav3/modules/block_sparse_mlp.py
@@ -188,6 +188,7 @@ class BlockSparseMLP(Module):
         routing_last: int | None = None,
         routing_device: int | None = None,
         transposed_load: bool = True,
+        transpose_fused_weights: bool = True,
     ):
         super().__init__(config, key, None)
 
@@ -285,6 +286,7 @@ class BlockSparseMLP(Module):
                 qmap = qmap + ".input",
                 out_dtype = self.interm_dtype,
                 transposed_load = transposed_load,
+                transpose_fused_weights = transpose_fused_weights,
             )
             up = Linear(
                 config = config,
@@ -297,6 +299,7 @@ class BlockSparseMLP(Module):
                 qmap = qmap + ".input",
                 out_dtype = self.interm_dtype,
                 transposed_load = transposed_load,
+                transpose_fused_weights = transpose_fused_weights,
             )
             down = Linear(
                 config = config,
@@ -309,6 +312,7 @@ class BlockSparseMLP(Module):
                 out_dtype = self.out_dtype,
                 allow_input_padding = True,
                 transposed_load = transposed_load,
+                transpose_fused_weights = transpose_fused_weights,
             )
 
             self.ups.append(up)
diff --git a/exllamav3/modules/linear.py b/exllamav3/modules/linear.py
index 7eb3d38..c791376 100644
--- a/exllamav3/modules/linear.py
+++ b/exllamav3/modules/linear.py
@@ -38,6 +38,7 @@ class Linear(Module):
         allow_input_padding: bool = False,
         post_scale: float = 1.0,
         transposed_load: bool = True,
+        transpose_fused_weights: bool = True,
     ):
         super().__init__(config, key, qmap)
 
@@ -62,6 +63,7 @@ class Linear(Module):
         self.out_dtype = out_dtype
         self.post_scale = post_scale
         self.transposed_load = transposed_load
+        self.transpose_fused_weights = transpose_fused_weights
 
         assert self.in_features_unpadded == self.in_features or allow_input_padding, \
             f"Input padding is not allowed for {self.key}, in_dim: {self.in_features_unpadded}, pad_to: {pad_to}"
@@ -168,7 +170,7 @@ class Linear(Module):
         weight = self.config.stc.get_tensor(
             self.fkey,
             self.device,
-            transpose = self.transpose_fused_weights,
+            transpose = self.transpose_fused_weights,
            no_defer = True,
             fidx = self.fidx
         )